author    | Timur Kristóf <[email protected]>         | 2019-09-27 10:29:51 +0200
committer | Bas Nieuwenhuizen <[email protected]> | 2019-10-08 00:44:08 +0000
commit    | 3a08110d43ce268747d034cae03787080967bf71 (patch)
tree      | ee02037354a4a3846f9cb75df486b036345a5acc /src/amd/llvm
parent    | 738bbee603fd3fd8ea29edab7b681e48bc981467 (diff)
amd: Move all amd/common code that depends on LLVM to amd/llvm.
This commit is a step towards the goal of being able to build RADV
without LLVM. In the future we would like to offer the option to
use RADV solely with ACO. There is still a need for the common AMD
code located in amd/common, but the LLVM-specific parts need to be
separated out.
Signed-off-by: Timur Kristóf <[email protected]>
Reviewed-by: Bas Nieuwenhuizen <[email protected]>
Acked-by: Marek Olšák <[email protected]>
Acked-by: Samuel Pitoiset <[email protected]>
Diffstat (limited to 'src/amd/llvm')
-rw-r--r-- | src/amd/llvm/ac_llvm_build.c    | 4478
-rw-r--r-- | src/amd/llvm/ac_llvm_build.h    |  744
-rw-r--r-- | src/amd/llvm/ac_llvm_cull.c     |  275
-rw-r--r-- | src/amd/llvm/ac_llvm_cull.h     |   59
-rw-r--r-- | src/amd/llvm/ac_llvm_helper.cpp |  282
-rw-r--r-- | src/amd/llvm/ac_llvm_util.c     |  397
-rw-r--r-- | src/amd/llvm/ac_llvm_util.h     |  163
-rw-r--r-- | src/amd/llvm/ac_nir_to_llvm.c   | 4944
-rw-r--r-- | src/amd/llvm/ac_nir_to_llvm.h   |   64
-rw-r--r-- | src/amd/llvm/ac_shader_abi.h    |  219
-rw-r--r-- | src/amd/llvm/meson.build        |   50
11 files changed, 11675 insertions, 0 deletions
diff --git a/src/amd/llvm/ac_llvm_build.c b/src/amd/llvm/ac_llvm_build.c new file mode 100644 index 00000000000..cda2daab6f5 --- /dev/null +++ b/src/amd/llvm/ac_llvm_build.c @@ -0,0 +1,4478 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + */ +/* based on pieces from si_pipe.c and radeon_llvm_emit.c */ +#include "ac_llvm_build.h" + +#include <llvm-c/Core.h> +#include <llvm/Config/llvm-config.h> + +#include "c11/threads.h" + +#include <assert.h> +#include <stdio.h> + +#include "ac_llvm_util.h" +#include "ac_shader_util.h" +#include "ac_exp_param.h" +#include "util/bitscan.h" +#include "util/macros.h" +#include "util/u_atomic.h" +#include "util/u_math.h" +#include "sid.h" + +#include "shader_enums.h" + +#define AC_LLVM_INITIAL_CF_DEPTH 4 + +/* Data for if/else/endif and bgnloop/endloop control flow structures. + */ +struct ac_llvm_flow { + /* Loop exit or next part of if/else/endif. */ + LLVMBasicBlockRef next_block; + LLVMBasicBlockRef loop_entry_block; +}; + +/* Initialize module-independent parts of the context. + * + * The caller is responsible for initializing ctx::module and ctx::builder. + */ +void +ac_llvm_context_init(struct ac_llvm_context *ctx, + struct ac_llvm_compiler *compiler, + enum chip_class chip_class, enum radeon_family family, + enum ac_float_mode float_mode, unsigned wave_size, + unsigned ballot_mask_bits) +{ + LLVMValueRef args[1]; + + ctx->context = LLVMContextCreate(); + + ctx->chip_class = chip_class; + ctx->family = family; + ctx->wave_size = wave_size; + ctx->ballot_mask_bits = ballot_mask_bits; + ctx->module = ac_create_module(wave_size == 32 ? 
compiler->tm_wave32 + : compiler->tm, + ctx->context); + ctx->builder = ac_create_builder(ctx->context, float_mode); + + ctx->voidt = LLVMVoidTypeInContext(ctx->context); + ctx->i1 = LLVMInt1TypeInContext(ctx->context); + ctx->i8 = LLVMInt8TypeInContext(ctx->context); + ctx->i16 = LLVMIntTypeInContext(ctx->context, 16); + ctx->i32 = LLVMIntTypeInContext(ctx->context, 32); + ctx->i64 = LLVMIntTypeInContext(ctx->context, 64); + ctx->intptr = ctx->i32; + ctx->f16 = LLVMHalfTypeInContext(ctx->context); + ctx->f32 = LLVMFloatTypeInContext(ctx->context); + ctx->f64 = LLVMDoubleTypeInContext(ctx->context); + ctx->v2i16 = LLVMVectorType(ctx->i16, 2); + ctx->v2i32 = LLVMVectorType(ctx->i32, 2); + ctx->v3i32 = LLVMVectorType(ctx->i32, 3); + ctx->v4i32 = LLVMVectorType(ctx->i32, 4); + ctx->v2f32 = LLVMVectorType(ctx->f32, 2); + ctx->v3f32 = LLVMVectorType(ctx->f32, 3); + ctx->v4f32 = LLVMVectorType(ctx->f32, 4); + ctx->v8i32 = LLVMVectorType(ctx->i32, 8); + ctx->iN_wavemask = LLVMIntTypeInContext(ctx->context, ctx->wave_size); + ctx->iN_ballotmask = LLVMIntTypeInContext(ctx->context, ballot_mask_bits); + + ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false); + ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false); + ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false); + ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false); + ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false); + ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false); + ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false); + ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false); + ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0); + ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0); + ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0); + ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0); + ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0); + ctx->f64_1 = LLVMConstReal(ctx->f64, 1.0); + + ctx->i1false = LLVMConstInt(ctx->i1, 0, false); + ctx->i1true = LLVMConstInt(ctx->i1, 1, false); + + ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context, + "range", 5); + + ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context, + "invariant.load", 14); + + ctx->fpmath_md_kind = LLVMGetMDKindIDInContext(ctx->context, "fpmath", 6); + + args[0] = LLVMConstReal(ctx->f32, 2.5); + ctx->fpmath_md_2p5_ulp = LLVMMDNodeInContext(ctx->context, args, 1); + + ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context, + "amdgpu.uniform", 14); + + ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0); + ctx->flow = calloc(1, sizeof(*ctx->flow)); +} + +void +ac_llvm_context_dispose(struct ac_llvm_context *ctx) +{ + free(ctx->flow->stack); + free(ctx->flow); + ctx->flow = NULL; +} + +int +ac_get_llvm_num_components(LLVMValueRef value) +{ + LLVMTypeRef type = LLVMTypeOf(value); + unsigned num_components = LLVMGetTypeKind(type) == LLVMVectorTypeKind + ? 
LLVMGetVectorSize(type) + : 1; + return num_components; +} + +LLVMValueRef +ac_llvm_extract_elem(struct ac_llvm_context *ac, + LLVMValueRef value, + int index) +{ + if (LLVMGetTypeKind(LLVMTypeOf(value)) != LLVMVectorTypeKind) { + assert(index == 0); + return value; + } + + return LLVMBuildExtractElement(ac->builder, value, + LLVMConstInt(ac->i32, index, false), ""); +} + +int +ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type) +{ + if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) + type = LLVMGetElementType(type); + + if (LLVMGetTypeKind(type) == LLVMIntegerTypeKind) + return LLVMGetIntTypeWidth(type); + + if (type == ctx->f16) + return 16; + if (type == ctx->f32) + return 32; + if (type == ctx->f64) + return 64; + + unreachable("Unhandled type kind in get_elem_bits"); +} + +unsigned +ac_get_type_size(LLVMTypeRef type) +{ + LLVMTypeKind kind = LLVMGetTypeKind(type); + + switch (kind) { + case LLVMIntegerTypeKind: + return LLVMGetIntTypeWidth(type) / 8; + case LLVMHalfTypeKind: + return 2; + case LLVMFloatTypeKind: + return 4; + case LLVMDoubleTypeKind: + return 8; + case LLVMPointerTypeKind: + if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_CONST_32BIT) + return 4; + return 8; + case LLVMVectorTypeKind: + return LLVMGetVectorSize(type) * + ac_get_type_size(LLVMGetElementType(type)); + case LLVMArrayTypeKind: + return LLVMGetArrayLength(type) * + ac_get_type_size(LLVMGetElementType(type)); + default: + assert(0); + return 0; + } +} + +static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t) +{ + if (t == ctx->i8) + return ctx->i8; + else if (t == ctx->f16 || t == ctx->i16) + return ctx->i16; + else if (t == ctx->f32 || t == ctx->i32) + return ctx->i32; + else if (t == ctx->f64 || t == ctx->i64) + return ctx->i64; + else + unreachable("Unhandled integer size"); +} + +LLVMTypeRef +ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t) +{ + if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) { + LLVMTypeRef elem_type = LLVMGetElementType(t); + return LLVMVectorType(to_integer_type_scalar(ctx, elem_type), + LLVMGetVectorSize(t)); + } + if (LLVMGetTypeKind(t) == LLVMPointerTypeKind) { + switch (LLVMGetPointerAddressSpace(t)) { + case AC_ADDR_SPACE_GLOBAL: + return ctx->i64; + case AC_ADDR_SPACE_LDS: + return ctx->i32; + default: + unreachable("unhandled address space"); + } + } + return to_integer_type_scalar(ctx, t); +} + +LLVMValueRef +ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v) +{ + LLVMTypeRef type = LLVMTypeOf(v); + if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) { + return LLVMBuildPtrToInt(ctx->builder, v, ac_to_integer_type(ctx, type), ""); + } + return LLVMBuildBitCast(ctx->builder, v, ac_to_integer_type(ctx, type), ""); +} + +LLVMValueRef +ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v) +{ + LLVMTypeRef type = LLVMTypeOf(v); + if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) + return v; + return ac_to_integer(ctx, v); +} + +static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t) +{ + if (t == ctx->i8) + return ctx->i8; + else if (t == ctx->i16 || t == ctx->f16) + return ctx->f16; + else if (t == ctx->i32 || t == ctx->f32) + return ctx->f32; + else if (t == ctx->i64 || t == ctx->f64) + return ctx->f64; + else + unreachable("Unhandled float size"); +} + +LLVMTypeRef +ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t) +{ + if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) { + LLVMTypeRef elem_type = LLVMGetElementType(t); + return 
LLVMVectorType(to_float_type_scalar(ctx, elem_type), + LLVMGetVectorSize(t)); + } + return to_float_type_scalar(ctx, t); +} + +LLVMValueRef +ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v) +{ + LLVMTypeRef type = LLVMTypeOf(v); + return LLVMBuildBitCast(ctx->builder, v, ac_to_float_type(ctx, type), ""); +} + + +LLVMValueRef +ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name, + LLVMTypeRef return_type, LLVMValueRef *params, + unsigned param_count, unsigned attrib_mask) +{ + LLVMValueRef function, call; + bool set_callsite_attrs = !(attrib_mask & AC_FUNC_ATTR_LEGACY); + + function = LLVMGetNamedFunction(ctx->module, name); + if (!function) { + LLVMTypeRef param_types[32], function_type; + unsigned i; + + assert(param_count <= 32); + + for (i = 0; i < param_count; ++i) { + assert(params[i]); + param_types[i] = LLVMTypeOf(params[i]); + } + function_type = + LLVMFunctionType(return_type, param_types, param_count, 0); + function = LLVMAddFunction(ctx->module, name, function_type); + + LLVMSetFunctionCallConv(function, LLVMCCallConv); + LLVMSetLinkage(function, LLVMExternalLinkage); + + if (!set_callsite_attrs) + ac_add_func_attributes(ctx->context, function, attrib_mask); + } + + call = LLVMBuildCall(ctx->builder, function, params, param_count, ""); + if (set_callsite_attrs) + ac_add_func_attributes(ctx->context, call, attrib_mask); + return call; +} + +/** + * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with + * intrinsic names). + */ +void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize) +{ + LLVMTypeRef elem_type = type; + + assert(bufsize >= 8); + + if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) { + int ret = snprintf(buf, bufsize, "v%u", + LLVMGetVectorSize(type)); + if (ret < 0) { + char *type_name = LLVMPrintTypeToString(type); + fprintf(stderr, "Error building type name for: %s\n", + type_name); + LLVMDisposeMessage(type_name); + return; + } + elem_type = LLVMGetElementType(type); + buf += ret; + bufsize -= ret; + } + switch (LLVMGetTypeKind(elem_type)) { + default: break; + case LLVMIntegerTypeKind: + snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type)); + break; + case LLVMHalfTypeKind: + snprintf(buf, bufsize, "f16"); + break; + case LLVMFloatTypeKind: + snprintf(buf, bufsize, "f32"); + break; + case LLVMDoubleTypeKind: + snprintf(buf, bufsize, "f64"); + break; + } +} + +/** + * Helper function that builds an LLVM IR PHI node and immediately adds + * incoming edges. + */ +LLVMValueRef +ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type, + unsigned count_incoming, LLVMValueRef *values, + LLVMBasicBlockRef *blocks) +{ + LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, ""); + LLVMAddIncoming(phi, values, blocks, count_incoming); + return phi; +} + +void ac_build_s_barrier(struct ac_llvm_context *ctx) +{ + ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL, + 0, AC_FUNC_ATTR_CONVERGENT); +} + +/* Prevent optimizations (at least of memory accesses) across the current + * point in the program by emitting empty inline assembly that is marked as + * having side effects. + * + * Optionally, a value can be passed through the inline assembly to prevent + * LLVM from hoisting calls to ReadNone functions. 
+ */ +void +ac_build_optimization_barrier(struct ac_llvm_context *ctx, + LLVMValueRef *pvgpr) +{ + static int counter = 0; + + LLVMBuilderRef builder = ctx->builder; + char code[16]; + + snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter)); + + if (!pvgpr) { + LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false); + LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false); + LLVMBuildCall(builder, inlineasm, NULL, 0, ""); + } else { + LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false); + LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false); + LLVMValueRef vgpr = *pvgpr; + LLVMTypeRef vgpr_type = LLVMTypeOf(vgpr); + unsigned vgpr_size = ac_get_type_size(vgpr_type); + LLVMValueRef vgpr0; + + assert(vgpr_size % 4 == 0); + + vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), ""); + vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, ""); + vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, ""); + vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, ""); + vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, ""); + + *pvgpr = vgpr; + } +} + +LLVMValueRef +ac_build_shader_clock(struct ac_llvm_context *ctx) +{ + const char *intr = LLVM_VERSION_MAJOR >= 9 && ctx->chip_class >= GFX8 ? + "llvm.amdgcn.s.memrealtime" : "llvm.readcyclecounter"; + LLVMValueRef tmp = ac_build_intrinsic(ctx, intr, ctx->i64, NULL, 0, 0); + return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, ""); +} + +LLVMValueRef +ac_build_ballot(struct ac_llvm_context *ctx, + LLVMValueRef value) +{ + const char *name; + + if (LLVM_VERSION_MAJOR >= 9) { + if (ctx->wave_size == 64) + name = "llvm.amdgcn.icmp.i64.i32"; + else + name = "llvm.amdgcn.icmp.i32.i32"; + } else { + name = "llvm.amdgcn.icmp.i32"; + } + LLVMValueRef args[3] = { + value, + ctx->i32_0, + LLVMConstInt(ctx->i32, LLVMIntNE, 0) + }; + + /* We currently have no other way to prevent LLVM from lifting the icmp + * calls to a dominating basic block. + */ + ac_build_optimization_barrier(ctx, &args[0]); + + args[0] = ac_to_integer(ctx, args[0]); + + return ac_build_intrinsic(ctx, name, ctx->iN_wavemask, args, 3, + AC_FUNC_ATTR_NOUNWIND | + AC_FUNC_ATTR_READNONE | + AC_FUNC_ATTR_CONVERGENT); +} + +LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx, + LLVMValueRef value) +{ + const char *name = LLVM_VERSION_MAJOR >= 9 ? 
"llvm.amdgcn.icmp.i64.i1" : "llvm.amdgcn.icmp.i1"; + LLVMValueRef args[3] = { + value, + ctx->i1false, + LLVMConstInt(ctx->i32, LLVMIntNE, 0), + }; + + return ac_build_intrinsic(ctx, name, ctx->i64, args, 3, + AC_FUNC_ATTR_NOUNWIND | + AC_FUNC_ATTR_READNONE | + AC_FUNC_ATTR_CONVERGENT); +} + +LLVMValueRef +ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value) +{ + LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1); + LLVMValueRef vote_set = ac_build_ballot(ctx, value); + return LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, active_set, ""); +} + +LLVMValueRef +ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value) +{ + LLVMValueRef vote_set = ac_build_ballot(ctx, value); + return LLVMBuildICmp(ctx->builder, LLVMIntNE, vote_set, + LLVMConstInt(ctx->iN_wavemask, 0, 0), ""); +} + +LLVMValueRef +ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value) +{ + LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1); + LLVMValueRef vote_set = ac_build_ballot(ctx, value); + + LLVMValueRef all = LLVMBuildICmp(ctx->builder, LLVMIntEQ, + vote_set, active_set, ""); + LLVMValueRef none = LLVMBuildICmp(ctx->builder, LLVMIntEQ, + vote_set, + LLVMConstInt(ctx->iN_wavemask, 0, 0), ""); + return LLVMBuildOr(ctx->builder, all, none, ""); +} + +LLVMValueRef +ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values, + unsigned value_count, unsigned component) +{ + LLVMValueRef vec = NULL; + + if (value_count == 1) { + return values[component]; + } else if (!value_count) + unreachable("value_count is 0"); + + for (unsigned i = component; i < value_count + component; i++) { + LLVMValueRef value = values[i]; + + if (i == component) + vec = LLVMGetUndef( LLVMVectorType(LLVMTypeOf(value), value_count)); + LLVMValueRef index = LLVMConstInt(ctx->i32, i - component, false); + vec = LLVMBuildInsertElement(ctx->builder, vec, value, index, ""); + } + return vec; +} + +LLVMValueRef +ac_build_gather_values_extended(struct ac_llvm_context *ctx, + LLVMValueRef *values, + unsigned value_count, + unsigned value_stride, + bool load, + bool always_vector) +{ + LLVMBuilderRef builder = ctx->builder; + LLVMValueRef vec = NULL; + unsigned i; + + if (value_count == 1 && !always_vector) { + if (load) + return LLVMBuildLoad(builder, values[0], ""); + return values[0]; + } else if (!value_count) + unreachable("value_count is 0"); + + for (i = 0; i < value_count; i++) { + LLVMValueRef value = values[i * value_stride]; + if (load) + value = LLVMBuildLoad(builder, value, ""); + + if (!i) + vec = LLVMGetUndef( LLVMVectorType(LLVMTypeOf(value), value_count)); + LLVMValueRef index = LLVMConstInt(ctx->i32, i, false); + vec = LLVMBuildInsertElement(builder, vec, value, index, ""); + } + return vec; +} + +LLVMValueRef +ac_build_gather_values(struct ac_llvm_context *ctx, + LLVMValueRef *values, + unsigned value_count) +{ + return ac_build_gather_values_extended(ctx, values, value_count, 1, false, false); +} + +/* Expand a scalar or vector to <dst_channels x type> by filling the remaining + * channels with undef. Extract at most src_channels components from the input. 
+ */ +static LLVMValueRef +ac_build_expand(struct ac_llvm_context *ctx, + LLVMValueRef value, + unsigned src_channels, + unsigned dst_channels) +{ + LLVMTypeRef elemtype; + LLVMValueRef chan[dst_channels]; + + if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) { + unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value)); + + if (src_channels == dst_channels && vec_size == dst_channels) + return value; + + src_channels = MIN2(src_channels, vec_size); + + for (unsigned i = 0; i < src_channels; i++) + chan[i] = ac_llvm_extract_elem(ctx, value, i); + + elemtype = LLVMGetElementType(LLVMTypeOf(value)); + } else { + if (src_channels) { + assert(src_channels == 1); + chan[0] = value; + } + elemtype = LLVMTypeOf(value); + } + + for (unsigned i = src_channels; i < dst_channels; i++) + chan[i] = LLVMGetUndef(elemtype); + + return ac_build_gather_values(ctx, chan, dst_channels); +} + +/* Extract components [start, start + channels) from a vector. + */ +LLVMValueRef +ac_extract_components(struct ac_llvm_context *ctx, + LLVMValueRef value, + unsigned start, + unsigned channels) +{ + LLVMValueRef chan[channels]; + + for (unsigned i = 0; i < channels; i++) + chan[i] = ac_llvm_extract_elem(ctx, value, i + start); + + return ac_build_gather_values(ctx, chan, channels); +} + +/* Expand a scalar or vector to <4 x type> by filling the remaining channels + * with undef. Extract at most num_channels components from the input. + */ +LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx, + LLVMValueRef value, + unsigned num_channels) +{ + return ac_build_expand(ctx, value, num_channels, 4); +} + +LLVMValueRef ac_build_round(struct ac_llvm_context *ctx, LLVMValueRef value) +{ + unsigned type_size = ac_get_type_size(LLVMTypeOf(value)); + const char *name; + + if (type_size == 2) + name = "llvm.rint.f16"; + else if (type_size == 4) + name = "llvm.rint.f32"; + else + name = "llvm.rint.f64"; + + return ac_build_intrinsic(ctx, name, LLVMTypeOf(value), &value, 1, + AC_FUNC_ATTR_READNONE); +} + +LLVMValueRef +ac_build_fdiv(struct ac_llvm_context *ctx, + LLVMValueRef num, + LLVMValueRef den) +{ + /* If we do (num / den), LLVM >= 7.0 does: + * return num * v_rcp_f32(den * (fabs(den) > 0x1.0p+96f ? 0x1.0p-32f : 1.0f)); + * + * If we do (num * (1 / den)), LLVM does: + * return num * v_rcp_f32(den); + */ + LLVMValueRef one = LLVMConstReal(LLVMTypeOf(num), 1.0); + LLVMValueRef rcp = LLVMBuildFDiv(ctx->builder, one, den, ""); + LLVMValueRef ret = LLVMBuildFMul(ctx->builder, num, rcp, ""); + + /* Use v_rcp_f32 instead of precise division. */ + if (!LLVMIsConstant(ret)) + LLVMSetMetadata(ret, ctx->fpmath_md_kind, ctx->fpmath_md_2p5_ulp); + return ret; +} + +/* See fast_idiv_by_const.h. */ +/* Set: increment = util_fast_udiv_info::increment ? multiplier : 0; */ +LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx, + LLVMValueRef num, + LLVMValueRef multiplier, + LLVMValueRef pre_shift, + LLVMValueRef post_shift, + LLVMValueRef increment) +{ + LLVMBuilderRef builder = ctx->builder; + + num = LLVMBuildLShr(builder, num, pre_shift, ""); + num = LLVMBuildMul(builder, + LLVMBuildZExt(builder, num, ctx->i64, ""), + LLVMBuildZExt(builder, multiplier, ctx->i64, ""), ""); + num = LLVMBuildAdd(builder, num, + LLVMBuildZExt(builder, increment, ctx->i64, ""), ""); + num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), ""); + num = LLVMBuildTrunc(builder, num, ctx->i32, ""); + return LLVMBuildLShr(builder, num, post_shift, ""); +} + +/* See fast_idiv_by_const.h. 
*/ +/* If num != UINT_MAX, this more efficient version can be used. */ +/* Set: increment = util_fast_udiv_info::increment; */ +LLVMValueRef ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx, + LLVMValueRef num, + LLVMValueRef multiplier, + LLVMValueRef pre_shift, + LLVMValueRef post_shift, + LLVMValueRef increment) +{ + LLVMBuilderRef builder = ctx->builder; + + num = LLVMBuildLShr(builder, num, pre_shift, ""); + num = LLVMBuildNUWAdd(builder, num, increment, ""); + num = LLVMBuildMul(builder, + LLVMBuildZExt(builder, num, ctx->i64, ""), + LLVMBuildZExt(builder, multiplier, ctx->i64, ""), ""); + num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), ""); + num = LLVMBuildTrunc(builder, num, ctx->i32, ""); + return LLVMBuildLShr(builder, num, post_shift, ""); +} + +/* See fast_idiv_by_const.h. */ +/* Both operands must fit in 31 bits and the divisor must not be 1. */ +LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx, + LLVMValueRef num, + LLVMValueRef multiplier, + LLVMValueRef post_shift) +{ + LLVMBuilderRef builder = ctx->builder; + + num = LLVMBuildMul(builder, + LLVMBuildZExt(builder, num, ctx->i64, ""), + LLVMBuildZExt(builder, multiplier, ctx->i64, ""), ""); + num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), ""); + num = LLVMBuildTrunc(builder, num, ctx->i32, ""); + return LLVMBuildLShr(builder, num, post_shift, ""); +} + +/* Coordinates for cube map selection. sc, tc, and ma are as in Table 8.27 + * of the OpenGL 4.5 (Compatibility Profile) specification, except ma is + * already multiplied by two. id is the cube face number. + */ +struct cube_selection_coords { + LLVMValueRef stc[2]; + LLVMValueRef ma; + LLVMValueRef id; +}; + +static void +build_cube_intrinsic(struct ac_llvm_context *ctx, + LLVMValueRef in[3], + struct cube_selection_coords *out) +{ + LLVMTypeRef f32 = ctx->f32; + + out->stc[1] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubetc", + f32, in, 3, AC_FUNC_ATTR_READNONE); + out->stc[0] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubesc", + f32, in, 3, AC_FUNC_ATTR_READNONE); + out->ma = ac_build_intrinsic(ctx, "llvm.amdgcn.cubema", + f32, in, 3, AC_FUNC_ATTR_READNONE); + out->id = ac_build_intrinsic(ctx, "llvm.amdgcn.cubeid", + f32, in, 3, AC_FUNC_ATTR_READNONE); +} + +/** + * Build a manual selection sequence for cube face sc/tc coordinates and + * major axis vector (multiplied by 2 for consistency) for the given + * vec3 \p coords, for the face implied by \p selcoords. + * + * For the major axis, we always adjust the sign to be in the direction of + * selcoords.ma; i.e., a positive out_ma means that coords is pointed towards + * the selcoords major axis. 
+ */ +static void build_cube_select(struct ac_llvm_context *ctx, + const struct cube_selection_coords *selcoords, + const LLVMValueRef *coords, + LLVMValueRef *out_st, + LLVMValueRef *out_ma) +{ + LLVMBuilderRef builder = ctx->builder; + LLVMTypeRef f32 = LLVMTypeOf(coords[0]); + LLVMValueRef is_ma_positive; + LLVMValueRef sgn_ma; + LLVMValueRef is_ma_z, is_not_ma_z; + LLVMValueRef is_ma_y; + LLVMValueRef is_ma_x; + LLVMValueRef sgn; + LLVMValueRef tmp; + + is_ma_positive = LLVMBuildFCmp(builder, LLVMRealUGE, + selcoords->ma, LLVMConstReal(f32, 0.0), ""); + sgn_ma = LLVMBuildSelect(builder, is_ma_positive, + LLVMConstReal(f32, 1.0), LLVMConstReal(f32, -1.0), ""); + + is_ma_z = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 4.0), ""); + is_not_ma_z = LLVMBuildNot(builder, is_ma_z, ""); + is_ma_y = LLVMBuildAnd(builder, is_not_ma_z, + LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 2.0), ""), ""); + is_ma_x = LLVMBuildAnd(builder, is_not_ma_z, LLVMBuildNot(builder, is_ma_y, ""), ""); + + /* Select sc */ + tmp = LLVMBuildSelect(builder, is_ma_x, coords[2], coords[0], ""); + sgn = LLVMBuildSelect(builder, is_ma_y, LLVMConstReal(f32, 1.0), + LLVMBuildSelect(builder, is_ma_z, sgn_ma, + LLVMBuildFNeg(builder, sgn_ma, ""), ""), ""); + out_st[0] = LLVMBuildFMul(builder, tmp, sgn, ""); + + /* Select tc */ + tmp = LLVMBuildSelect(builder, is_ma_y, coords[2], coords[1], ""); + sgn = LLVMBuildSelect(builder, is_ma_y, sgn_ma, + LLVMConstReal(f32, -1.0), ""); + out_st[1] = LLVMBuildFMul(builder, tmp, sgn, ""); + + /* Select ma */ + tmp = LLVMBuildSelect(builder, is_ma_z, coords[2], + LLVMBuildSelect(builder, is_ma_y, coords[1], coords[0], ""), ""); + tmp = ac_build_intrinsic(ctx, "llvm.fabs.f32", + ctx->f32, &tmp, 1, AC_FUNC_ATTR_READNONE); + *out_ma = LLVMBuildFMul(builder, tmp, LLVMConstReal(f32, 2.0), ""); +} + +void +ac_prepare_cube_coords(struct ac_llvm_context *ctx, + bool is_deriv, bool is_array, bool is_lod, + LLVMValueRef *coords_arg, + LLVMValueRef *derivs_arg) +{ + + LLVMBuilderRef builder = ctx->builder; + struct cube_selection_coords selcoords; + LLVMValueRef coords[3]; + LLVMValueRef invma; + + if (is_array && !is_lod) { + LLVMValueRef tmp = ac_build_round(ctx, coords_arg[3]); + + /* Section 8.9 (Texture Functions) of the GLSL 4.50 spec says: + * + * "For Array forms, the array layer used will be + * + * max(0, min(d−1, floor(layer+0.5))) + * + * where d is the depth of the texture array and layer + * comes from the component indicated in the tables below. + * Workaroudn for an issue where the layer is taken from a + * helper invocation which happens to fall on a different + * layer due to extrapolation." + * + * GFX8 and earlier attempt to implement this in hardware by + * clamping the value of coords[2] = (8 * layer) + face. + * Unfortunately, this means that the we end up with the wrong + * face when clamping occurs. + * + * Clamp the layer earlier to work around the issue. 
+ */ + if (ctx->chip_class <= GFX8) { + LLVMValueRef ge0; + ge0 = LLVMBuildFCmp(builder, LLVMRealOGE, tmp, ctx->f32_0, ""); + tmp = LLVMBuildSelect(builder, ge0, tmp, ctx->f32_0, ""); + } + + coords_arg[3] = tmp; + } + + build_cube_intrinsic(ctx, coords_arg, &selcoords); + + invma = ac_build_intrinsic(ctx, "llvm.fabs.f32", + ctx->f32, &selcoords.ma, 1, AC_FUNC_ATTR_READNONE); + invma = ac_build_fdiv(ctx, LLVMConstReal(ctx->f32, 1.0), invma); + + for (int i = 0; i < 2; ++i) + coords[i] = LLVMBuildFMul(builder, selcoords.stc[i], invma, ""); + + coords[2] = selcoords.id; + + if (is_deriv && derivs_arg) { + LLVMValueRef derivs[4]; + int axis; + + /* Convert cube derivatives to 2D derivatives. */ + for (axis = 0; axis < 2; axis++) { + LLVMValueRef deriv_st[2]; + LLVMValueRef deriv_ma; + + /* Transform the derivative alongside the texture + * coordinate. Mathematically, the correct formula is + * as follows. Assume we're projecting onto the +Z face + * and denote by dx/dh the derivative of the (original) + * X texture coordinate with respect to horizontal + * window coordinates. The projection onto the +Z face + * plane is: + * + * f(x,z) = x/z + * + * Then df/dh = df/dx * dx/dh + df/dz * dz/dh + * = 1/z * dx/dh - x/z * 1/z * dz/dh. + * + * This motivatives the implementation below. + * + * Whether this actually gives the expected results for + * apps that might feed in derivatives obtained via + * finite differences is anyone's guess. The OpenGL spec + * seems awfully quiet about how textureGrad for cube + * maps should be handled. + */ + build_cube_select(ctx, &selcoords, &derivs_arg[axis * 3], + deriv_st, &deriv_ma); + + deriv_ma = LLVMBuildFMul(builder, deriv_ma, invma, ""); + + for (int i = 0; i < 2; ++i) + derivs[axis * 2 + i] = + LLVMBuildFSub(builder, + LLVMBuildFMul(builder, deriv_st[i], invma, ""), + LLVMBuildFMul(builder, deriv_ma, coords[i], ""), ""); + } + + memcpy(derivs_arg, derivs, sizeof(derivs)); + } + + /* Shift the texture coordinate. This must be applied after the + * derivative calculation. 
+ */ + for (int i = 0; i < 2; ++i) + coords[i] = LLVMBuildFAdd(builder, coords[i], LLVMConstReal(ctx->f32, 1.5), ""); + + if (is_array) { + /* for cube arrays coord.z = coord.w(array_index) * 8 + face */ + /* coords_arg.w component - array_index for cube arrays */ + coords[2] = ac_build_fmad(ctx, coords_arg[3], LLVMConstReal(ctx->f32, 8.0), coords[2]); + } + + memcpy(coords_arg, coords, sizeof(coords)); +} + + +LLVMValueRef +ac_build_fs_interp(struct ac_llvm_context *ctx, + LLVMValueRef llvm_chan, + LLVMValueRef attr_number, + LLVMValueRef params, + LLVMValueRef i, + LLVMValueRef j) +{ + LLVMValueRef args[5]; + LLVMValueRef p1; + + args[0] = i; + args[1] = llvm_chan; + args[2] = attr_number; + args[3] = params; + + p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1", + ctx->f32, args, 4, AC_FUNC_ATTR_READNONE); + + args[0] = p1; + args[1] = j; + args[2] = llvm_chan; + args[3] = attr_number; + args[4] = params; + + return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2", + ctx->f32, args, 5, AC_FUNC_ATTR_READNONE); +} + +LLVMValueRef +ac_build_fs_interp_f16(struct ac_llvm_context *ctx, + LLVMValueRef llvm_chan, + LLVMValueRef attr_number, + LLVMValueRef params, + LLVMValueRef i, + LLVMValueRef j) +{ + LLVMValueRef args[6]; + LLVMValueRef p1; + + args[0] = i; + args[1] = llvm_chan; + args[2] = attr_number; + args[3] = ctx->i1false; + args[4] = params; + + p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16", + ctx->f32, args, 5, AC_FUNC_ATTR_READNONE); + + args[0] = p1; + args[1] = j; + args[2] = llvm_chan; + args[3] = attr_number; + args[4] = ctx->i1false; + args[5] = params; + + return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16", + ctx->f16, args, 6, AC_FUNC_ATTR_READNONE); +} + +LLVMValueRef +ac_build_fs_interp_mov(struct ac_llvm_context *ctx, + LLVMValueRef parameter, + LLVMValueRef llvm_chan, + LLVMValueRef attr_number, + LLVMValueRef params) +{ + LLVMValueRef args[4]; + + args[0] = parameter; + args[1] = llvm_chan; + args[2] = attr_number; + args[3] = params; + + return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.mov", + ctx->f32, args, 4, AC_FUNC_ATTR_READNONE); +} + +LLVMValueRef +ac_build_gep_ptr(struct ac_llvm_context *ctx, + LLVMValueRef base_ptr, + LLVMValueRef index) +{ + return LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, ""); +} + +LLVMValueRef +ac_build_gep0(struct ac_llvm_context *ctx, + LLVMValueRef base_ptr, + LLVMValueRef index) +{ + LLVMValueRef indices[2] = { + ctx->i32_0, + index, + }; + return LLVMBuildGEP(ctx->builder, base_ptr, indices, 2, ""); +} + +LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMValueRef ptr, + LLVMValueRef index) +{ + return LLVMBuildPointerCast(ctx->builder, + LLVMBuildGEP(ctx->builder, ptr, &index, 1, ""), + LLVMTypeOf(ptr), ""); +} + +void +ac_build_indexed_store(struct ac_llvm_context *ctx, + LLVMValueRef base_ptr, LLVMValueRef index, + LLVMValueRef value) +{ + LLVMBuildStore(ctx->builder, value, + ac_build_gep0(ctx, base_ptr, index)); +} + +/** + * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad. + * It's equivalent to doing a load from &base_ptr[index]. + * + * \param base_ptr Where the array starts. + * \param index The element index into the array. + * \param uniform Whether the base_ptr and index can be assumed to be + * dynamically uniform (i.e. 
load to an SGPR) + * \param invariant Whether the load is invariant (no other opcodes affect it) + * \param no_unsigned_wraparound + * For all possible re-associations and re-distributions of an expression + * "base_ptr + index * elemsize" into "addr + offset" (excluding GEPs + * without inbounds in base_ptr), this parameter is true if "addr + offset" + * does not result in an unsigned integer wraparound. This is used for + * optimal code generation of 32-bit pointer arithmetic. + * + * For example, a 32-bit immediate offset that causes a 32-bit unsigned + * integer wraparound can't be an imm offset in s_load_dword, because + * the instruction performs "addr + offset" in 64 bits. + * + * Expected usage for bindless textures by chaining GEPs: + * // possible unsigned wraparound, don't use InBounds: + * ptr1 = LLVMBuildGEP(base_ptr, index); + * image = load(ptr1); // becomes "s_load ptr1, 0" + * + * ptr2 = LLVMBuildInBoundsGEP(ptr1, 32 / elemsize); + * sampler = load(ptr2); // becomes "s_load ptr1, 32" thanks to InBounds + */ +static LLVMValueRef +ac_build_load_custom(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, + LLVMValueRef index, bool uniform, bool invariant, + bool no_unsigned_wraparound) +{ + LLVMValueRef pointer, result; + + if (no_unsigned_wraparound && + LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_ADDR_SPACE_CONST_32BIT) + pointer = LLVMBuildInBoundsGEP(ctx->builder, base_ptr, &index, 1, ""); + else + pointer = LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, ""); + + if (uniform) + LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md); + result = LLVMBuildLoad(ctx->builder, pointer, ""); + if (invariant) + LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md); + return result; +} + +LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, + LLVMValueRef index) +{ + return ac_build_load_custom(ctx, base_ptr, index, false, false, false); +} + +LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx, + LLVMValueRef base_ptr, LLVMValueRef index) +{ + return ac_build_load_custom(ctx, base_ptr, index, false, true, false); +} + +/* This assumes that there is no unsigned integer wraparound during the address + * computation, excluding all GEPs within base_ptr. */ +LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx, + LLVMValueRef base_ptr, LLVMValueRef index) +{ + return ac_build_load_custom(ctx, base_ptr, index, true, true, true); +} + +/* See ac_build_load_custom() documentation. */ +LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx, + LLVMValueRef base_ptr, LLVMValueRef index) +{ + return ac_build_load_custom(ctx, base_ptr, index, true, true, false); +} + +static unsigned get_load_cache_policy(struct ac_llvm_context *ctx, + unsigned cache_policy) +{ + return cache_policy | + (ctx->chip_class >= GFX10 && cache_policy & ac_glc ? ac_dlc : 0); +} + +static void +ac_build_buffer_store_common(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef data, + LLVMValueRef vindex, + LLVMValueRef voffset, + LLVMValueRef soffset, + unsigned num_channels, + LLVMTypeRef return_channel_type, + unsigned cache_policy, + bool use_format, + bool structurized) +{ + LLVMValueRef args[6]; + int idx = 0; + args[idx++] = data; + args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""); + if (structurized) + args[idx++] = vindex ? vindex : ctx->i32_0; + args[idx++] = voffset ? voffset : ctx->i32_0; + args[idx++] = soffset ? 
soffset : ctx->i32_0; + args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0); + unsigned func = !ac_has_vec3_support(ctx->chip_class, use_format) && num_channels == 3 ? 4 : num_channels; + const char *indexing_kind = structurized ? "struct" : "raw"; + char name[256], type_name[8]; + + LLVMTypeRef type = func > 1 ? LLVMVectorType(return_channel_type, func) : return_channel_type; + ac_build_type_name_for_intr(type, type_name, sizeof(type_name)); + + if (use_format) { + snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.format.%s", + indexing_kind, type_name); + } else { + snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.%s", + indexing_kind, type_name); + } + + ac_build_intrinsic(ctx, name, ctx->voidt, args, idx, + AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY); +} + +void +ac_build_buffer_store_format(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef data, + LLVMValueRef vindex, + LLVMValueRef voffset, + unsigned num_channels, + unsigned cache_policy) +{ + ac_build_buffer_store_common(ctx, rsrc, data, vindex, + voffset, NULL, num_channels, + ctx->f32, cache_policy, + true, true); +} + +/* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4. + * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2), + * or v4i32 (num_channels=3,4). + */ +void +ac_build_buffer_store_dword(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef vdata, + unsigned num_channels, + LLVMValueRef voffset, + LLVMValueRef soffset, + unsigned inst_offset, + unsigned cache_policy, + bool swizzle_enable_hint) +{ + /* Split 3 channel stores, because only LLVM 9+ support 3-channel + * intrinsics. */ + if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false)) { + LLVMValueRef v[3], v01; + + for (int i = 0; i < 3; i++) { + v[i] = LLVMBuildExtractElement(ctx->builder, vdata, + LLVMConstInt(ctx->i32, i, 0), ""); + } + v01 = ac_build_gather_values(ctx, v, 2); + + ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset, + soffset, inst_offset, cache_policy, + swizzle_enable_hint); + ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset, + soffset, inst_offset + 8, + cache_policy, + swizzle_enable_hint); + return; + } + + /* SWIZZLE_ENABLE requires that soffset isn't folded into voffset + * (voffset is swizzled, but soffset isn't swizzled). + * llvm.amdgcn.buffer.store doesn't have a separate soffset parameter. 
+ */ + if (!swizzle_enable_hint) { + LLVMValueRef offset = soffset; + + if (inst_offset) + offset = LLVMBuildAdd(ctx->builder, offset, + LLVMConstInt(ctx->i32, inst_offset, 0), ""); + + ac_build_buffer_store_common(ctx, rsrc, ac_to_float(ctx, vdata), + ctx->i32_0, voffset, offset, + num_channels, ctx->f32, + cache_policy, false, false); + return; + } + + static const unsigned dfmts[] = { + V_008F0C_BUF_DATA_FORMAT_32, + V_008F0C_BUF_DATA_FORMAT_32_32, + V_008F0C_BUF_DATA_FORMAT_32_32_32, + V_008F0C_BUF_DATA_FORMAT_32_32_32_32 + }; + unsigned dfmt = dfmts[num_channels - 1]; + unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT; + LLVMValueRef immoffset = LLVMConstInt(ctx->i32, inst_offset, 0); + + ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset, + immoffset, num_channels, dfmt, nfmt, cache_policy); +} + +static LLVMValueRef +ac_build_buffer_load_common(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef vindex, + LLVMValueRef voffset, + LLVMValueRef soffset, + unsigned num_channels, + LLVMTypeRef channel_type, + unsigned cache_policy, + bool can_speculate, + bool use_format, + bool structurized) +{ + LLVMValueRef args[5]; + int idx = 0; + args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""); + if (structurized) + args[idx++] = vindex ? vindex : ctx->i32_0; + args[idx++] = voffset ? voffset : ctx->i32_0; + args[idx++] = soffset ? soffset : ctx->i32_0; + args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0); + unsigned func = !ac_has_vec3_support(ctx->chip_class, use_format) && num_channels == 3 ? 4 : num_channels; + const char *indexing_kind = structurized ? "struct" : "raw"; + char name[256], type_name[8]; + + LLVMTypeRef type = func > 1 ? LLVMVectorType(channel_type, func) : channel_type; + ac_build_type_name_for_intr(type, type_name, sizeof(type_name)); + + if (use_format) { + snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.format.%s", + indexing_kind, type_name); + } else { + snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.%s", + indexing_kind, type_name); + } + + return ac_build_intrinsic(ctx, name, type, args, idx, + ac_get_load_intr_attribs(can_speculate)); +} + +LLVMValueRef +ac_build_buffer_load(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + int num_channels, + LLVMValueRef vindex, + LLVMValueRef voffset, + LLVMValueRef soffset, + unsigned inst_offset, + unsigned cache_policy, + bool can_speculate, + bool allow_smem) +{ + LLVMValueRef offset = LLVMConstInt(ctx->i32, inst_offset, 0); + if (voffset) + offset = LLVMBuildAdd(ctx->builder, offset, voffset, ""); + if (soffset) + offset = LLVMBuildAdd(ctx->builder, offset, soffset, ""); + + if (allow_smem && !(cache_policy & ac_slc) && + (!(cache_policy & ac_glc) || ctx->chip_class >= GFX8)) { + assert(vindex == NULL); + + LLVMValueRef result[8]; + + for (int i = 0; i < num_channels; i++) { + if (i) { + offset = LLVMBuildAdd(ctx->builder, offset, + LLVMConstInt(ctx->i32, 4, 0), ""); + } + LLVMValueRef args[3] = { + rsrc, + offset, + LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0), + }; + result[i] = ac_build_intrinsic(ctx, + "llvm.amdgcn.s.buffer.load.f32", + ctx->f32, args, 3, + AC_FUNC_ATTR_READNONE); + } + if (num_channels == 1) + return result[0]; + + if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false)) + result[num_channels++] = LLVMGetUndef(ctx->f32); + return ac_build_gather_values(ctx, result, num_channels); + } + + return ac_build_buffer_load_common(ctx, rsrc, vindex, + offset, ctx->i32_0, + num_channels, 
ctx->f32, + cache_policy, + can_speculate, false, false); +} + +LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef vindex, + LLVMValueRef voffset, + unsigned num_channels, + unsigned cache_policy, + bool can_speculate) +{ + return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, + ctx->i32_0, num_channels, ctx->f32, + cache_policy, can_speculate, + true, true); +} + +static LLVMValueRef +ac_build_tbuffer_load(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef vindex, + LLVMValueRef voffset, + LLVMValueRef soffset, + LLVMValueRef immoffset, + unsigned num_channels, + unsigned dfmt, + unsigned nfmt, + unsigned cache_policy, + bool can_speculate, + bool structurized) +{ + voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, ""); + + LLVMValueRef args[6]; + int idx = 0; + args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""); + if (structurized) + args[idx++] = vindex ? vindex : ctx->i32_0; + args[idx++] = voffset ? voffset : ctx->i32_0; + args[idx++] = soffset ? soffset : ctx->i32_0; + args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx->chip_class, dfmt, nfmt), 0); + args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0); + unsigned func = !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels; + const char *indexing_kind = structurized ? "struct" : "raw"; + char name[256], type_name[8]; + + LLVMTypeRef type = func > 1 ? LLVMVectorType(ctx->i32, func) : ctx->i32; + ac_build_type_name_for_intr(type, type_name, sizeof(type_name)); + + snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.load.%s", + indexing_kind, type_name); + + return ac_build_intrinsic(ctx, name, type, args, idx, + ac_get_load_intr_attribs(can_speculate)); +} + +LLVMValueRef +ac_build_struct_tbuffer_load(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef vindex, + LLVMValueRef voffset, + LLVMValueRef soffset, + LLVMValueRef immoffset, + unsigned num_channels, + unsigned dfmt, + unsigned nfmt, + unsigned cache_policy, + bool can_speculate) +{ + return ac_build_tbuffer_load(ctx, rsrc, vindex, voffset, soffset, + immoffset, num_channels, dfmt, nfmt, + cache_policy, can_speculate, true); +} + +LLVMValueRef +ac_build_raw_tbuffer_load(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef voffset, + LLVMValueRef soffset, + LLVMValueRef immoffset, + unsigned num_channels, + unsigned dfmt, + unsigned nfmt, + unsigned cache_policy, + bool can_speculate) +{ + return ac_build_tbuffer_load(ctx, rsrc, NULL, voffset, soffset, + immoffset, num_channels, dfmt, nfmt, + cache_policy, can_speculate, false); +} + +LLVMValueRef +ac_build_tbuffer_load_short(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef voffset, + LLVMValueRef soffset, + LLVMValueRef immoffset, + unsigned cache_policy) +{ + LLVMValueRef res; + + if (LLVM_VERSION_MAJOR >= 9) { + voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, ""); + + /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. 
*/ + res = ac_build_buffer_load_common(ctx, rsrc, NULL, + voffset, soffset, + 1, ctx->i16, cache_policy, + false, false, false); + } else { + unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16; + unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT; + + res = ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset, + immoffset, 1, dfmt, nfmt, cache_policy, + false); + + res = LLVMBuildTrunc(ctx->builder, res, ctx->i16, ""); + } + + return res; +} + +LLVMValueRef +ac_build_tbuffer_load_byte(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef voffset, + LLVMValueRef soffset, + LLVMValueRef immoffset, + unsigned cache_policy) +{ + LLVMValueRef res; + + if (LLVM_VERSION_MAJOR >= 9) { + voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, ""); + + /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */ + res = ac_build_buffer_load_common(ctx, rsrc, NULL, + voffset, soffset, + 1, ctx->i8, cache_policy, + false, false, false); + } else { + unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8; + unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT; + + res = ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset, + immoffset, 1, dfmt, nfmt, cache_policy, + false); + + res = LLVMBuildTrunc(ctx->builder, res, ctx->i8, ""); + } + + return res; +} + +/** + * Convert an 11- or 10-bit unsigned floating point number to an f32. + * + * The input exponent is expected to be biased analogous to IEEE-754, i.e. by + * 2^(exp_bits-1) - 1 (as defined in OpenGL and other graphics APIs). + */ +static LLVMValueRef +ac_ufN_to_float(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned exp_bits, unsigned mant_bits) +{ + assert(LLVMTypeOf(src) == ctx->i32); + + LLVMValueRef tmp; + LLVMValueRef mantissa; + mantissa = LLVMBuildAnd(ctx->builder, src, LLVMConstInt(ctx->i32, (1 << mant_bits) - 1, false), ""); + + /* Converting normal numbers is just a shift + correcting the exponent bias */ + unsigned normal_shift = 23 - mant_bits; + unsigned bias_shift = 127 - ((1 << (exp_bits - 1)) - 1); + LLVMValueRef shifted, normal; + + shifted = LLVMBuildShl(ctx->builder, src, LLVMConstInt(ctx->i32, normal_shift, false), ""); + normal = LLVMBuildAdd(ctx->builder, shifted, LLVMConstInt(ctx->i32, bias_shift << 23, false), ""); + + /* Converting nan/inf numbers is the same, but with a different exponent update */ + LLVMValueRef naninf; + naninf = LLVMBuildOr(ctx->builder, normal, LLVMConstInt(ctx->i32, 0xff << 23, false), ""); + + /* Converting denormals is the complex case: determine the leading zeros of the + * mantissa to obtain the correct shift for the mantissa and exponent correction. + */ + LLVMValueRef denormal; + LLVMValueRef params[2] = { + mantissa, + ctx->i1true, /* result can be undef when arg is 0 */ + }; + LLVMValueRef ctlz = ac_build_intrinsic(ctx, "llvm.ctlz.i32", ctx->i32, + params, 2, AC_FUNC_ATTR_READNONE); + + /* Shift such that the leading 1 ends up as the LSB of the exponent field. */ + tmp = LLVMBuildSub(ctx->builder, ctlz, LLVMConstInt(ctx->i32, 8, false), ""); + denormal = LLVMBuildShl(ctx->builder, mantissa, tmp, ""); + + unsigned denormal_exp = bias_shift + (32 - mant_bits) - 1; + tmp = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, denormal_exp, false), ctlz, ""); + tmp = LLVMBuildShl(ctx->builder, tmp, LLVMConstInt(ctx->i32, 23, false), ""); + denormal = LLVMBuildAdd(ctx->builder, denormal, tmp, ""); + + /* Select the final result. 
*/ + LLVMValueRef result; + + tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src, + LLVMConstInt(ctx->i32, ((1 << exp_bits) - 1) << mant_bits, false), ""); + result = LLVMBuildSelect(ctx->builder, tmp, naninf, normal, ""); + + tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src, + LLVMConstInt(ctx->i32, 1 << mant_bits, false), ""); + result = LLVMBuildSelect(ctx->builder, tmp, result, denormal, ""); + + tmp = LLVMBuildICmp(ctx->builder, LLVMIntNE, src, ctx->i32_0, ""); + result = LLVMBuildSelect(ctx->builder, tmp, result, ctx->i32_0, ""); + + return ac_to_float(ctx, result); +} + +/** + * Generate a fully general open coded buffer format fetch with all required + * fixups suitable for vertex fetch, using non-format buffer loads. + * + * Some combinations of argument values have special interpretations: + * - size = 8 bytes, format = fixed indicates PIPE_FORMAT_R11G11B10_FLOAT + * - size = 8 bytes, format != {float,fixed} indicates a 2_10_10_10 data format + * + * \param log_size log(size of channel in bytes) + * \param num_channels number of channels (1 to 4) + * \param format AC_FETCH_FORMAT_xxx value + * \param reverse whether XYZ channels are reversed + * \param known_aligned whether the source is known to be aligned to hardware's + * effective element size for loading the given format + * (note: this means dword alignment for 8_8_8_8, 16_16, etc.) + * \param rsrc buffer resource descriptor + * \return the resulting vector of floats or integers bitcast to <4 x i32> + */ +LLVMValueRef +ac_build_opencoded_load_format(struct ac_llvm_context *ctx, + unsigned log_size, + unsigned num_channels, + unsigned format, + bool reverse, + bool known_aligned, + LLVMValueRef rsrc, + LLVMValueRef vindex, + LLVMValueRef voffset, + LLVMValueRef soffset, + unsigned cache_policy, + bool can_speculate) +{ + LLVMValueRef tmp; + unsigned load_log_size = log_size; + unsigned load_num_channels = num_channels; + if (log_size == 3) { + load_log_size = 2; + if (format == AC_FETCH_FORMAT_FLOAT) { + load_num_channels = 2 * num_channels; + } else { + load_num_channels = 1; /* 10_11_11 or 2_10_10_10 */ + } + } + + int log_recombine = 0; + if (ctx->chip_class == GFX6 && !known_aligned) { + /* Avoid alignment restrictions by loading one byte at a time. */ + load_num_channels <<= load_log_size; + log_recombine = load_log_size; + load_log_size = 0; + } else if (load_num_channels == 2 || load_num_channels == 4) { + log_recombine = -util_logbase2(load_num_channels); + load_num_channels = 1; + load_log_size += -log_recombine; + } + + assert(load_log_size >= 2 || LLVM_VERSION_MAJOR >= 9); + + LLVMValueRef loads[32]; /* up to 32 bytes */ + for (unsigned i = 0; i < load_num_channels; ++i) { + tmp = LLVMBuildAdd(ctx->builder, soffset, + LLVMConstInt(ctx->i32, i << load_log_size, false), ""); + LLVMTypeRef channel_type = load_log_size == 0 ? ctx->i8 : + load_log_size == 1 ? ctx->i16 : ctx->i32; + unsigned num_channels = 1 << (MAX2(load_log_size, 2) - 2); + loads[i] = ac_build_buffer_load_common( + ctx, rsrc, vindex, voffset, tmp, + num_channels, channel_type, cache_policy, + can_speculate, false, true); + if (load_log_size >= 2) + loads[i] = ac_to_integer(ctx, loads[i]); + } + + if (log_recombine > 0) { + /* Recombine bytes if necessary (GFX6 only) */ + LLVMTypeRef dst_type = log_recombine == 2 ? 
ctx->i32 : ctx->i16; + + for (unsigned src = 0, dst = 0; src < load_num_channels; ++dst) { + LLVMValueRef accum = NULL; + for (unsigned i = 0; i < (1 << log_recombine); ++i, ++src) { + tmp = LLVMBuildZExt(ctx->builder, loads[src], dst_type, ""); + if (i == 0) { + accum = tmp; + } else { + tmp = LLVMBuildShl(ctx->builder, tmp, + LLVMConstInt(dst_type, 8 * i, false), ""); + accum = LLVMBuildOr(ctx->builder, accum, tmp, ""); + } + } + loads[dst] = accum; + } + } else if (log_recombine < 0) { + /* Split vectors of dwords */ + if (load_log_size > 2) { + assert(load_num_channels == 1); + LLVMValueRef loaded = loads[0]; + unsigned log_split = load_log_size - 2; + log_recombine += log_split; + load_num_channels = 1 << log_split; + load_log_size = 2; + for (unsigned i = 0; i < load_num_channels; ++i) { + tmp = LLVMConstInt(ctx->i32, i, false); + loads[i] = LLVMBuildExtractElement(ctx->builder, loaded, tmp, ""); + } + } + + /* Further split dwords and shorts if required */ + if (log_recombine < 0) { + for (unsigned src = load_num_channels, + dst = load_num_channels << -log_recombine; + src > 0; --src) { + unsigned dst_bits = 1 << (3 + load_log_size + log_recombine); + LLVMTypeRef dst_type = LLVMIntTypeInContext(ctx->context, dst_bits); + LLVMValueRef loaded = loads[src - 1]; + LLVMTypeRef loaded_type = LLVMTypeOf(loaded); + for (unsigned i = 1 << -log_recombine; i > 0; --i, --dst) { + tmp = LLVMConstInt(loaded_type, dst_bits * (i - 1), false); + tmp = LLVMBuildLShr(ctx->builder, loaded, tmp, ""); + loads[dst - 1] = LLVMBuildTrunc(ctx->builder, tmp, dst_type, ""); + } + } + } + } + + if (log_size == 3) { + if (format == AC_FETCH_FORMAT_FLOAT) { + for (unsigned i = 0; i < num_channels; ++i) { + tmp = ac_build_gather_values(ctx, &loads[2 * i], 2); + loads[i] = LLVMBuildBitCast(ctx->builder, tmp, ctx->f64, ""); + } + } else if (format == AC_FETCH_FORMAT_FIXED) { + /* 10_11_11_FLOAT */ + LLVMValueRef data = loads[0]; + LLVMValueRef i32_2047 = LLVMConstInt(ctx->i32, 2047, false); + LLVMValueRef r = LLVMBuildAnd(ctx->builder, data, i32_2047, ""); + tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 11, false), ""); + LLVMValueRef g = LLVMBuildAnd(ctx->builder, tmp, i32_2047, ""); + LLVMValueRef b = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 22, false), ""); + + loads[0] = ac_to_integer(ctx, ac_ufN_to_float(ctx, r, 5, 6)); + loads[1] = ac_to_integer(ctx, ac_ufN_to_float(ctx, g, 5, 6)); + loads[2] = ac_to_integer(ctx, ac_ufN_to_float(ctx, b, 5, 5)); + + num_channels = 3; + log_size = 2; + format = AC_FETCH_FORMAT_FLOAT; + } else { + /* 2_10_10_10 data formats */ + LLVMValueRef data = loads[0]; + LLVMTypeRef i10 = LLVMIntTypeInContext(ctx->context, 10); + LLVMTypeRef i2 = LLVMIntTypeInContext(ctx->context, 2); + loads[0] = LLVMBuildTrunc(ctx->builder, data, i10, ""); + tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 10, false), ""); + loads[1] = LLVMBuildTrunc(ctx->builder, tmp, i10, ""); + tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 20, false), ""); + loads[2] = LLVMBuildTrunc(ctx->builder, tmp, i10, ""); + tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 30, false), ""); + loads[3] = LLVMBuildTrunc(ctx->builder, tmp, i2, ""); + + num_channels = 4; + } + } + + if (format == AC_FETCH_FORMAT_FLOAT) { + if (log_size != 2) { + for (unsigned chan = 0; chan < num_channels; ++chan) { + tmp = ac_to_float(ctx, loads[chan]); + if (log_size == 3) + tmp = LLVMBuildFPTrunc(ctx->builder, tmp, ctx->f32, ""); + else if (log_size == 1) + tmp = 
LLVMBuildFPExt(ctx->builder, tmp, ctx->f32, ""); + loads[chan] = ac_to_integer(ctx, tmp); + } + } + } else if (format == AC_FETCH_FORMAT_UINT) { + if (log_size != 2) { + for (unsigned chan = 0; chan < num_channels; ++chan) + loads[chan] = LLVMBuildZExt(ctx->builder, loads[chan], ctx->i32, ""); + } + } else if (format == AC_FETCH_FORMAT_SINT) { + if (log_size != 2) { + for (unsigned chan = 0; chan < num_channels; ++chan) + loads[chan] = LLVMBuildSExt(ctx->builder, loads[chan], ctx->i32, ""); + } + } else { + bool unsign = format == AC_FETCH_FORMAT_UNORM || + format == AC_FETCH_FORMAT_USCALED || + format == AC_FETCH_FORMAT_UINT; + + for (unsigned chan = 0; chan < num_channels; ++chan) { + if (unsign) { + tmp = LLVMBuildUIToFP(ctx->builder, loads[chan], ctx->f32, ""); + } else { + tmp = LLVMBuildSIToFP(ctx->builder, loads[chan], ctx->f32, ""); + } + + LLVMValueRef scale = NULL; + if (format == AC_FETCH_FORMAT_FIXED) { + assert(log_size == 2); + scale = LLVMConstReal(ctx->f32, 1.0 / 0x10000); + } else if (format == AC_FETCH_FORMAT_UNORM) { + unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan])); + scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << bits) - 1)); + } else if (format == AC_FETCH_FORMAT_SNORM) { + unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan])); + scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << (bits - 1)) - 1)); + } + if (scale) + tmp = LLVMBuildFMul(ctx->builder, tmp, scale, ""); + + if (format == AC_FETCH_FORMAT_SNORM) { + /* Clamp to [-1, 1] */ + LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0); + LLVMValueRef clamp = + LLVMBuildFCmp(ctx->builder, LLVMRealULT, tmp, neg_one, ""); + tmp = LLVMBuildSelect(ctx->builder, clamp, neg_one, tmp, ""); + } + + loads[chan] = ac_to_integer(ctx, tmp); + } + } + + while (num_channels < 4) { + if (format == AC_FETCH_FORMAT_UINT || format == AC_FETCH_FORMAT_SINT) { + loads[num_channels] = num_channels == 3 ? ctx->i32_1 : ctx->i32_0; + } else { + loads[num_channels] = ac_to_integer(ctx, num_channels == 3 ? ctx->f32_1 : ctx->f32_0); + } + num_channels++; + } + + if (reverse) { + tmp = loads[0]; + loads[0] = loads[2]; + loads[2] = tmp; + } + + return ac_build_gather_values(ctx, loads, 4); +} + +static void +ac_build_tbuffer_store(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef vdata, + LLVMValueRef vindex, + LLVMValueRef voffset, + LLVMValueRef soffset, + LLVMValueRef immoffset, + unsigned num_channels, + unsigned dfmt, + unsigned nfmt, + unsigned cache_policy, + bool structurized) +{ + voffset = LLVMBuildAdd(ctx->builder, voffset ? voffset : ctx->i32_0, + immoffset, ""); + + LLVMValueRef args[7]; + int idx = 0; + args[idx++] = vdata; + args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""); + if (structurized) + args[idx++] = vindex ? vindex : ctx->i32_0; + args[idx++] = voffset ? voffset : ctx->i32_0; + args[idx++] = soffset ? soffset : ctx->i32_0; + args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx->chip_class, dfmt, nfmt), 0); + args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0); + unsigned func = !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels; + const char *indexing_kind = structurized ? "struct" : "raw"; + char name[256], type_name[8]; + + LLVMTypeRef type = func > 1 ? 
LLVMVectorType(ctx->i32, func) : ctx->i32; + ac_build_type_name_for_intr(type, type_name, sizeof(type_name)); + + snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.store.%s", + indexing_kind, type_name); + + ac_build_intrinsic(ctx, name, ctx->voidt, args, idx, + AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY); +} + +void +ac_build_struct_tbuffer_store(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef vdata, + LLVMValueRef vindex, + LLVMValueRef voffset, + LLVMValueRef soffset, + LLVMValueRef immoffset, + unsigned num_channels, + unsigned dfmt, + unsigned nfmt, + unsigned cache_policy) +{ + ac_build_tbuffer_store(ctx, rsrc, vdata, vindex, voffset, soffset, + immoffset, num_channels, dfmt, nfmt, cache_policy, + true); +} + +void +ac_build_raw_tbuffer_store(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef vdata, + LLVMValueRef voffset, + LLVMValueRef soffset, + LLVMValueRef immoffset, + unsigned num_channels, + unsigned dfmt, + unsigned nfmt, + unsigned cache_policy) +{ + ac_build_tbuffer_store(ctx, rsrc, vdata, NULL, voffset, soffset, + immoffset, num_channels, dfmt, nfmt, cache_policy, + false); +} + +void +ac_build_tbuffer_store_short(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef vdata, + LLVMValueRef voffset, + LLVMValueRef soffset, + unsigned cache_policy) +{ + vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, ""); + + if (LLVM_VERSION_MAJOR >= 9) { + /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */ + ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, + voffset, soffset, 1, + ctx->i16, cache_policy, + false, false); + } else { + unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16; + unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT; + + vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, ""); + + ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset, + ctx->i32_0, 1, dfmt, nfmt, cache_policy); + } +} + +void +ac_build_tbuffer_store_byte(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef vdata, + LLVMValueRef voffset, + LLVMValueRef soffset, + unsigned cache_policy) +{ + vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i8, ""); + + if (LLVM_VERSION_MAJOR >= 9) { + /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */ + ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, + voffset, soffset, 1, + ctx->i8, cache_policy, + false, false); + } else { + unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8; + unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT; + + vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, ""); + + ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset, + ctx->i32_0, 1, dfmt, nfmt, cache_policy); + } +} +/** + * Set range metadata on an instruction. This can only be used on load and + * call instructions. If you know an instruction can only produce the values + * 0, 1, 2, you would do set_range_metadata(value, 0, 3); + * \p lo is the minimum value inclusive. + * \p hi is the maximum value exclusive. 
+ */ +static void set_range_metadata(struct ac_llvm_context *ctx, + LLVMValueRef value, unsigned lo, unsigned hi) +{ + LLVMValueRef range_md, md_args[2]; + LLVMTypeRef type = LLVMTypeOf(value); + LLVMContextRef context = LLVMGetTypeContext(type); + + md_args[0] = LLVMConstInt(type, lo, false); + md_args[1] = LLVMConstInt(type, hi, false); + range_md = LLVMMDNodeInContext(context, md_args, 2); + LLVMSetMetadata(value, ctx->range_md_kind, range_md); +} + +LLVMValueRef +ac_get_thread_id(struct ac_llvm_context *ctx) +{ + LLVMValueRef tid; + + LLVMValueRef tid_args[2]; + tid_args[0] = LLVMConstInt(ctx->i32, 0xffffffff, false); + tid_args[1] = ctx->i32_0; + tid_args[1] = ac_build_intrinsic(ctx, + "llvm.amdgcn.mbcnt.lo", ctx->i32, + tid_args, 2, AC_FUNC_ATTR_READNONE); + + if (ctx->wave_size == 32) { + tid = tid_args[1]; + } else { + tid = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", + ctx->i32, tid_args, + 2, AC_FUNC_ATTR_READNONE); + } + set_range_metadata(ctx, tid, 0, ctx->wave_size); + return tid; +} + +/* + * AMD GCN implements derivatives using the local data store (LDS) + * All writes to the LDS happen in all executing threads at + * the same time. TID is the Thread ID for the current + * thread and is a value between 0 and 63, representing + * the thread's position in the wavefront. + * + * For the pixel shader threads are grouped into quads of four pixels. + * The TIDs of the pixels of a quad are: + * + * +------+------+ + * |4n + 0|4n + 1| + * +------+------+ + * |4n + 2|4n + 3| + * +------+------+ + * + * So, masking the TID with 0xfffffffc yields the TID of the top left pixel + * of the quad, masking with 0xfffffffd yields the TID of the top pixel of + * the current pixel's column, and masking with 0xfffffffe yields the TID + * of the left pixel of the current pixel's row. + * + * Adding 1 yields the TID of the pixel to the right of the left pixel, and + * adding 2 yields the TID of the pixel below the top pixel. 
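+ * For example, a horizontal (ddx-style) derivative passes mask 0xfffffffe and idx 1, so the lane tables computed below work out to tl_lanes = {0, 0, 2, 2} and trbl_lanes = {1, 1, 3, 3}, i.e. each pixel subtracts its row's left pixel from the pixel to its right; a vertical (ddy-style) derivative passes mask 0xfffffffd and idx 2, giving tl_lanes = {0, 1, 0, 1} and trbl_lanes = {2, 3, 2, 3}.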
+ */ +LLVMValueRef +ac_build_ddxy(struct ac_llvm_context *ctx, + uint32_t mask, + int idx, + LLVMValueRef val) +{ + unsigned tl_lanes[4], trbl_lanes[4]; + char name[32], type[8]; + LLVMValueRef tl, trbl; + LLVMTypeRef result_type; + LLVMValueRef result; + + result_type = ac_to_float_type(ctx, LLVMTypeOf(val)); + + if (result_type == ctx->f16) + val = LLVMBuildZExt(ctx->builder, val, ctx->i32, ""); + + for (unsigned i = 0; i < 4; ++i) { + tl_lanes[i] = i & mask; + trbl_lanes[i] = (i & mask) + idx; + } + + tl = ac_build_quad_swizzle(ctx, val, + tl_lanes[0], tl_lanes[1], + tl_lanes[2], tl_lanes[3]); + trbl = ac_build_quad_swizzle(ctx, val, + trbl_lanes[0], trbl_lanes[1], + trbl_lanes[2], trbl_lanes[3]); + + if (result_type == ctx->f16) { + tl = LLVMBuildTrunc(ctx->builder, tl, ctx->i16, ""); + trbl = LLVMBuildTrunc(ctx->builder, trbl, ctx->i16, ""); + } + + tl = LLVMBuildBitCast(ctx->builder, tl, result_type, ""); + trbl = LLVMBuildBitCast(ctx->builder, trbl, result_type, ""); + result = LLVMBuildFSub(ctx->builder, trbl, tl, ""); + + ac_build_type_name_for_intr(result_type, type, sizeof(type)); + snprintf(name, sizeof(name), "llvm.amdgcn.wqm.%s", type); + + return ac_build_intrinsic(ctx, name, result_type, &result, 1, 0); +} + +void +ac_build_sendmsg(struct ac_llvm_context *ctx, + uint32_t msg, + LLVMValueRef wave_id) +{ + LLVMValueRef args[2]; + args[0] = LLVMConstInt(ctx->i32, msg, false); + args[1] = wave_id; + ac_build_intrinsic(ctx, "llvm.amdgcn.s.sendmsg", ctx->voidt, args, 2, 0); +} + +LLVMValueRef +ac_build_imsb(struct ac_llvm_context *ctx, + LLVMValueRef arg, + LLVMTypeRef dst_type) +{ + LLVMValueRef msb = ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32", + dst_type, &arg, 1, + AC_FUNC_ATTR_READNONE); + + /* The HW returns the last bit index from MSB, but NIR/TGSI wants + * the index from LSB. Invert it by doing "31 - msb". */ + msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false), + msb, ""); + + LLVMValueRef all_ones = LLVMConstInt(ctx->i32, -1, true); + LLVMValueRef cond = LLVMBuildOr(ctx->builder, + LLVMBuildICmp(ctx->builder, LLVMIntEQ, + arg, ctx->i32_0, ""), + LLVMBuildICmp(ctx->builder, LLVMIntEQ, + arg, all_ones, ""), ""); + + return LLVMBuildSelect(ctx->builder, cond, all_ones, msb, ""); +} + +LLVMValueRef +ac_build_umsb(struct ac_llvm_context *ctx, + LLVMValueRef arg, + LLVMTypeRef dst_type) +{ + const char *intrin_name; + LLVMTypeRef type; + LLVMValueRef highest_bit; + LLVMValueRef zero; + unsigned bitsize; + + bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(arg)); + switch (bitsize) { + case 64: + intrin_name = "llvm.ctlz.i64"; + type = ctx->i64; + highest_bit = LLVMConstInt(ctx->i64, 63, false); + zero = ctx->i64_0; + break; + case 32: + intrin_name = "llvm.ctlz.i32"; + type = ctx->i32; + highest_bit = LLVMConstInt(ctx->i32, 31, false); + zero = ctx->i32_0; + break; + case 16: + intrin_name = "llvm.ctlz.i16"; + type = ctx->i16; + highest_bit = LLVMConstInt(ctx->i16, 15, false); + zero = ctx->i16_0; + break; + case 8: + intrin_name = "llvm.ctlz.i8"; + type = ctx->i8; + highest_bit = LLVMConstInt(ctx->i8, 7, false); + zero = ctx->i8_0; + break; + default: + unreachable(!"invalid bitsize"); + break; + } + + LLVMValueRef params[2] = { + arg, + ctx->i1true, + }; + + LLVMValueRef msb = ac_build_intrinsic(ctx, intrin_name, type, + params, 2, + AC_FUNC_ATTR_READNONE); + + /* The HW returns the last bit index from MSB, but TGSI/NIR wants + * the index from LSB. Invert it by doing "31 - msb". 
*/ + msb = LLVMBuildSub(ctx->builder, highest_bit, msb, ""); + + if (bitsize == 64) { + msb = LLVMBuildTrunc(ctx->builder, msb, ctx->i32, ""); + } else if (bitsize < 32) { + msb = LLVMBuildSExt(ctx->builder, msb, ctx->i32, ""); + } + + /* check for zero */ + return LLVMBuildSelect(ctx->builder, + LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, zero, ""), + LLVMConstInt(ctx->i32, -1, true), msb, ""); +} + +LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a, + LLVMValueRef b) +{ + char name[64]; + snprintf(name, sizeof(name), "llvm.minnum.f%d", ac_get_elem_bits(ctx, LLVMTypeOf(a))); + LLVMValueRef args[2] = {a, b}; + return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2, + AC_FUNC_ATTR_READNONE); +} + +LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a, + LLVMValueRef b) +{ + char name[64]; + snprintf(name, sizeof(name), "llvm.maxnum.f%d", ac_get_elem_bits(ctx, LLVMTypeOf(a))); + LLVMValueRef args[2] = {a, b}; + return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2, + AC_FUNC_ATTR_READNONE); +} + +LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a, + LLVMValueRef b) +{ + LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSLE, a, b, ""); + return LLVMBuildSelect(ctx->builder, cmp, a, b, ""); +} + +LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a, + LLVMValueRef b) +{ + LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, a, b, ""); + return LLVMBuildSelect(ctx->builder, cmp, a, b, ""); +} + +LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a, + LLVMValueRef b) +{ + LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntULE, a, b, ""); + return LLVMBuildSelect(ctx->builder, cmp, a, b, ""); +} + +LLVMValueRef ac_build_umax(struct ac_llvm_context *ctx, LLVMValueRef a, + LLVMValueRef b) +{ + LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, a, b, ""); + return LLVMBuildSelect(ctx->builder, cmp, a, b, ""); +} + +LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value) +{ + LLVMTypeRef t = LLVMTypeOf(value); + return ac_build_fmin(ctx, ac_build_fmax(ctx, value, LLVMConstReal(t, 0.0)), + LLVMConstReal(t, 1.0)); +} + +void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a) +{ + LLVMValueRef args[9]; + + args[0] = LLVMConstInt(ctx->i32, a->target, 0); + args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0); + + if (a->compr) { + LLVMTypeRef i16 = LLVMInt16TypeInContext(ctx->context); + LLVMTypeRef v2i16 = LLVMVectorType(i16, 2); + + args[2] = LLVMBuildBitCast(ctx->builder, a->out[0], + v2i16, ""); + args[3] = LLVMBuildBitCast(ctx->builder, a->out[1], + v2i16, ""); + args[4] = LLVMConstInt(ctx->i1, a->done, 0); + args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0); + + ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16", + ctx->voidt, args, 6, 0); + } else { + args[2] = a->out[0]; + args[3] = a->out[1]; + args[4] = a->out[2]; + args[5] = a->out[3]; + args[6] = LLVMConstInt(ctx->i1, a->done, 0); + args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0); + + ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32", + ctx->voidt, args, 8, 0); + } +} + +void ac_build_export_null(struct ac_llvm_context *ctx) +{ + struct ac_export_args args; + + args.enabled_channels = 0x0; /* enabled channels */ + args.valid_mask = 1; /* whether the EXEC mask is valid */ + args.done = 1; /* DONE bit */ + args.target = V_008DFC_SQ_EXP_NULL; + args.compr = 0; /* COMPR flag (0 = 32-bit export) */ + args.out[0] = LLVMGetUndef(ctx->f32); /* R */ + args.out[1] = 
LLVMGetUndef(ctx->f32); /* G */ + args.out[2] = LLVMGetUndef(ctx->f32); /* B */ + args.out[3] = LLVMGetUndef(ctx->f32); /* A */ + + ac_build_export(ctx, &args); +} + +static unsigned ac_num_coords(enum ac_image_dim dim) +{ + switch (dim) { + case ac_image_1d: + return 1; + case ac_image_2d: + case ac_image_1darray: + return 2; + case ac_image_3d: + case ac_image_cube: + case ac_image_2darray: + case ac_image_2dmsaa: + return 3; + case ac_image_2darraymsaa: + return 4; + default: + unreachable("ac_num_coords: bad dim"); + } +} + +static unsigned ac_num_derivs(enum ac_image_dim dim) +{ + switch (dim) { + case ac_image_1d: + case ac_image_1darray: + return 2; + case ac_image_2d: + case ac_image_2darray: + case ac_image_cube: + return 4; + case ac_image_3d: + return 6; + case ac_image_2dmsaa: + case ac_image_2darraymsaa: + default: + unreachable("derivatives not supported"); + } +} + +static const char *get_atomic_name(enum ac_atomic_op op) +{ + switch (op) { + case ac_atomic_swap: return "swap"; + case ac_atomic_add: return "add"; + case ac_atomic_sub: return "sub"; + case ac_atomic_smin: return "smin"; + case ac_atomic_umin: return "umin"; + case ac_atomic_smax: return "smax"; + case ac_atomic_umax: return "umax"; + case ac_atomic_and: return "and"; + case ac_atomic_or: return "or"; + case ac_atomic_xor: return "xor"; + case ac_atomic_inc_wrap: return "inc"; + case ac_atomic_dec_wrap: return "dec"; + } + unreachable("bad atomic op"); +} + +LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, + struct ac_image_args *a) +{ + const char *overload[3] = { "", "", "" }; + unsigned num_overloads = 0; + LLVMValueRef args[18]; + unsigned num_args = 0; + enum ac_image_dim dim = a->dim; + + assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 || + !a->level_zero); + assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip && + a->opcode != ac_image_store_mip) || + a->lod); + assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 || + (!a->compare && !a->offset)); + assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 || + a->opcode == ac_image_get_lod) || + !a->bias); + assert((a->bias ? 1 : 0) + + (a->lod ? 1 : 0) + + (a->level_zero ? 1 : 0) + + (a->derivs[0] ? 1 : 0) <= 1); + + if (a->opcode == ac_image_get_lod) { + switch (dim) { + case ac_image_1darray: + dim = ac_image_1d; + break; + case ac_image_2darray: + case ac_image_cube: + dim = ac_image_2d; + break; + default: + break; + } + } + + bool sample = a->opcode == ac_image_sample || + a->opcode == ac_image_gather4 || + a->opcode == ac_image_get_lod; + bool atomic = a->opcode == ac_image_atomic || + a->opcode == ac_image_atomic_cmpswap; + bool load = a->opcode == ac_image_sample || + a->opcode == ac_image_gather4 || + a->opcode == ac_image_load || + a->opcode == ac_image_load_mip; + LLVMTypeRef coord_type = sample ? 
ctx->f32 : ctx->i32; + + if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) { + args[num_args++] = a->data[0]; + if (a->opcode == ac_image_atomic_cmpswap) + args[num_args++] = a->data[1]; + } + + if (!atomic) + args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, false); + + if (a->offset) + args[num_args++] = ac_to_integer(ctx, a->offset); + if (a->bias) { + args[num_args++] = ac_to_float(ctx, a->bias); + overload[num_overloads++] = ".f32"; + } + if (a->compare) + args[num_args++] = ac_to_float(ctx, a->compare); + if (a->derivs[0]) { + unsigned count = ac_num_derivs(dim); + for (unsigned i = 0; i < count; ++i) + args[num_args++] = ac_to_float(ctx, a->derivs[i]); + overload[num_overloads++] = ".f32"; + } + unsigned num_coords = + a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0; + for (unsigned i = 0; i < num_coords; ++i) + args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, ""); + if (a->lod) + args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, ""); + overload[num_overloads++] = sample ? ".f32" : ".i32"; + + args[num_args++] = a->resource; + if (sample) { + args[num_args++] = a->sampler; + args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false); + } + + args[num_args++] = ctx->i32_0; /* texfailctrl */ + args[num_args++] = LLVMConstInt(ctx->i32, + load ? get_load_cache_policy(ctx, a->cache_policy) : + a->cache_policy, false); + + const char *name; + const char *atomic_subop = ""; + switch (a->opcode) { + case ac_image_sample: name = "sample"; break; + case ac_image_gather4: name = "gather4"; break; + case ac_image_load: name = "load"; break; + case ac_image_load_mip: name = "load.mip"; break; + case ac_image_store: name = "store"; break; + case ac_image_store_mip: name = "store.mip"; break; + case ac_image_atomic: + name = "atomic."; + atomic_subop = get_atomic_name(a->atomic); + break; + case ac_image_atomic_cmpswap: + name = "atomic."; + atomic_subop = "cmpswap"; + break; + case ac_image_get_lod: name = "getlod"; break; + case ac_image_get_resinfo: name = "getresinfo"; break; + default: unreachable("invalid image opcode"); + } + + const char *dimname; + switch (dim) { + case ac_image_1d: dimname = "1d"; break; + case ac_image_2d: dimname = "2d"; break; + case ac_image_3d: dimname = "3d"; break; + case ac_image_cube: dimname = "cube"; break; + case ac_image_1darray: dimname = "1darray"; break; + case ac_image_2darray: dimname = "2darray"; break; + case ac_image_2dmsaa: dimname = "2dmsaa"; break; + case ac_image_2darraymsaa: dimname = "2darraymsaa"; break; + default: unreachable("invalid dim"); + } + + bool lod_suffix = + a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4); + char intr_name[96]; + snprintf(intr_name, sizeof(intr_name), + "llvm.amdgcn.image.%s%s" /* base name */ + "%s%s%s" /* sample/gather modifiers */ + ".%s.%s%s%s%s", /* dimension and type overloads */ + name, atomic_subop, + a->compare ? ".c" : "", + a->bias ? ".b" : + lod_suffix ? ".l" : + a->derivs[0] ? ".d" : + a->level_zero ? ".lz" : "", + a->offset ? ".o" : "", + dimname, + atomic ? 
"i32" : "v4f32", + overload[0], overload[1], overload[2]); + + LLVMTypeRef retty; + if (atomic) + retty = ctx->i32; + else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip) + retty = ctx->voidt; + else + retty = ctx->v4f32; + + LLVMValueRef result = + ac_build_intrinsic(ctx, intr_name, retty, args, num_args, + a->attributes); + if (!sample && retty == ctx->v4f32) { + result = LLVMBuildBitCast(ctx->builder, result, + ctx->v4i32, ""); + } + return result; +} + +LLVMValueRef ac_build_image_get_sample_count(struct ac_llvm_context *ctx, + LLVMValueRef rsrc) +{ + LLVMValueRef samples; + + /* Read the samples from the descriptor directly. + * Hardware doesn't have any instruction for this. + */ + samples = LLVMBuildExtractElement(ctx->builder, rsrc, + LLVMConstInt(ctx->i32, 3, 0), ""); + samples = LLVMBuildLShr(ctx->builder, samples, + LLVMConstInt(ctx->i32, 16, 0), ""); + samples = LLVMBuildAnd(ctx->builder, samples, + LLVMConstInt(ctx->i32, 0xf, 0), ""); + samples = LLVMBuildShl(ctx->builder, ctx->i32_1, + samples, ""); + return samples; +} + +LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx, + LLVMValueRef args[2]) +{ + LLVMTypeRef v2f16 = + LLVMVectorType(LLVMHalfTypeInContext(ctx->context), 2); + + return ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", v2f16, + args, 2, AC_FUNC_ATTR_READNONE); +} + +LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx, + LLVMValueRef args[2]) +{ + LLVMValueRef res = + ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.i16", + ctx->v2i16, args, 2, + AC_FUNC_ATTR_READNONE); + return LLVMBuildBitCast(ctx->builder, res, ctx->i32, ""); +} + +LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx, + LLVMValueRef args[2]) +{ + LLVMValueRef res = + ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.u16", + ctx->v2i16, args, 2, + AC_FUNC_ATTR_READNONE); + return LLVMBuildBitCast(ctx->builder, res, ctx->i32, ""); +} + +/* The 8-bit and 10-bit clamping is for HW workarounds. */ +LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx, + LLVMValueRef args[2], unsigned bits, bool hi) +{ + assert(bits == 8 || bits == 10 || bits == 16); + + LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, + bits == 8 ? 127 : bits == 10 ? 511 : 32767, 0); + LLVMValueRef min_rgb = LLVMConstInt(ctx->i32, + bits == 8 ? -128 : bits == 10 ? -512 : -32768, 0); + LLVMValueRef max_alpha = + bits != 10 ? max_rgb : ctx->i32_1; + LLVMValueRef min_alpha = + bits != 10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0); + + /* Clamp. */ + if (bits != 16) { + for (int i = 0; i < 2; i++) { + bool alpha = hi && i == 1; + args[i] = ac_build_imin(ctx, args[i], + alpha ? max_alpha : max_rgb); + args[i] = ac_build_imax(ctx, args[i], + alpha ? min_alpha : min_rgb); + } + } + + LLVMValueRef res = + ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16", + ctx->v2i16, args, 2, + AC_FUNC_ATTR_READNONE); + return LLVMBuildBitCast(ctx->builder, res, ctx->i32, ""); +} + +/* The 8-bit and 10-bit clamping is for HW workarounds. */ +LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx, + LLVMValueRef args[2], unsigned bits, bool hi) +{ + assert(bits == 8 || bits == 10 || bits == 16); + + LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, + bits == 8 ? 255 : bits == 10 ? 1023 : 65535, 0); + LLVMValueRef max_alpha = + bits != 10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0); + + /* Clamp. */ + if (bits != 16) { + for (int i = 0; i < 2; i++) { + bool alpha = hi && i == 1; + args[i] = ac_build_umin(ctx, args[i], + alpha ? 
max_alpha : max_rgb); + } + } + + LLVMValueRef res = + ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16", + ctx->v2i16, args, 2, + AC_FUNC_ATTR_READNONE); + return LLVMBuildBitCast(ctx->builder, res, ctx->i32, ""); +} + +LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1) +{ + return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.vote", ctx->i1, + &i1, 1, AC_FUNC_ATTR_READNONE); +} + +void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1) +{ + ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt, + &i1, 1, 0); +} + +LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input, + LLVMValueRef offset, LLVMValueRef width, + bool is_signed) +{ + LLVMValueRef args[] = { + input, + offset, + width, + }; + + return ac_build_intrinsic(ctx, is_signed ? "llvm.amdgcn.sbfe.i32" : + "llvm.amdgcn.ubfe.i32", + ctx->i32, args, 3, AC_FUNC_ATTR_READNONE); + +} + +LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0, + LLVMValueRef s1, LLVMValueRef s2) +{ + return LLVMBuildAdd(ctx->builder, + LLVMBuildMul(ctx->builder, s0, s1, ""), s2, ""); +} + +LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0, + LLVMValueRef s1, LLVMValueRef s2) +{ + /* FMA is better on GFX10, because it has FMA units instead of MUL-ADD units. */ + if (ctx->chip_class >= GFX10) { + return ac_build_intrinsic(ctx, "llvm.fma.f32", ctx->f32, + (LLVMValueRef []) {s0, s1, s2}, 3, + AC_FUNC_ATTR_READNONE); + } + + return LLVMBuildFAdd(ctx->builder, + LLVMBuildFMul(ctx->builder, s0, s1, ""), s2, ""); +} + +void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags) +{ + if (!wait_flags) + return; + + unsigned lgkmcnt = 63; + unsigned vmcnt = ctx->chip_class >= GFX9 ? 63 : 15; + unsigned vscnt = 63; + + if (wait_flags & AC_WAIT_LGKM) + lgkmcnt = 0; + if (wait_flags & AC_WAIT_VLOAD) + vmcnt = 0; + + if (wait_flags & AC_WAIT_VSTORE) { + if (ctx->chip_class >= GFX10) + vscnt = 0; + else + vmcnt = 0; + } + + /* There is no intrinsic for vscnt(0), so use a fence. 
*/ + if ((wait_flags & AC_WAIT_LGKM && + wait_flags & AC_WAIT_VLOAD && + wait_flags & AC_WAIT_VSTORE) || + vscnt == 0) { + LLVMBuildFence(ctx->builder, LLVMAtomicOrderingRelease, false, ""); + return; + } + + unsigned simm16 = (lgkmcnt << 8) | + (7 << 4) | /* expcnt */ + (vmcnt & 0xf) | + ((vmcnt >> 4) << 14); + + LLVMValueRef args[1] = { + LLVMConstInt(ctx->i32, simm16, false), + }; + ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt", + ctx->voidt, args, 1, 0); +} + +LLVMValueRef ac_build_fmed3(struct ac_llvm_context *ctx, LLVMValueRef src0, + LLVMValueRef src1, LLVMValueRef src2, + unsigned bitsize) +{ + LLVMTypeRef type; + char *intr; + + if (bitsize == 16) { + intr = "llvm.amdgcn.fmed3.f16"; + type = ctx->f16; + } else if (bitsize == 32) { + intr = "llvm.amdgcn.fmed3.f32"; + type = ctx->f32; + } else { + intr = "llvm.amdgcn.fmed3.f64"; + type = ctx->f64; + } + + LLVMValueRef params[] = { + src0, + src1, + src2, + }; + return ac_build_intrinsic(ctx, intr, type, params, 3, + AC_FUNC_ATTR_READNONE); +} + +LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0, + unsigned bitsize) +{ + LLVMTypeRef type; + char *intr; + + if (bitsize == 16) { + intr = "llvm.amdgcn.fract.f16"; + type = ctx->f16; + } else if (bitsize == 32) { + intr = "llvm.amdgcn.fract.f32"; + type = ctx->f32; + } else { + intr = "llvm.amdgcn.fract.f64"; + type = ctx->f64; + } + + LLVMValueRef params[] = { + src0, + }; + return ac_build_intrinsic(ctx, intr, type, params, 1, + AC_FUNC_ATTR_READNONE); +} + +LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0, + unsigned bitsize) +{ + LLVMTypeRef type = LLVMIntTypeInContext(ctx->context, bitsize); + LLVMValueRef zero = LLVMConstInt(type, 0, false); + LLVMValueRef one = LLVMConstInt(type, 1, false); + + LLVMValueRef cmp, val; + cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, src0, zero, ""); + val = LLVMBuildSelect(ctx->builder, cmp, one, src0, ""); + cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGE, val, zero, ""); + val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstInt(type, -1, true), ""); + return val; +} + +LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src0, + unsigned bitsize) +{ + LLVMValueRef cmp, val, zero, one; + LLVMTypeRef type; + + if (bitsize == 16) { + type = ctx->f16; + zero = ctx->f16_0; + one = ctx->f16_1; + } else if (bitsize == 32) { + type = ctx->f32; + zero = ctx->f32_0; + one = ctx->f32_1; + } else { + type = ctx->f64; + zero = ctx->f64_0; + one = ctx->f64_1; + } + + cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src0, zero, ""); + val = LLVMBuildSelect(ctx->builder, cmp, one, src0, ""); + cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGE, val, zero, ""); + val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstReal(type, -1.0), ""); + return val; +} + +LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0) +{ + LLVMValueRef result; + unsigned bitsize; + + bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0)); + + switch (bitsize) { + case 64: + result = ac_build_intrinsic(ctx, "llvm.ctpop.i64", ctx->i64, + (LLVMValueRef []) { src0 }, 1, + AC_FUNC_ATTR_READNONE); + + result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, ""); + break; + case 32: + result = ac_build_intrinsic(ctx, "llvm.ctpop.i32", ctx->i32, + (LLVMValueRef []) { src0 }, 1, + AC_FUNC_ATTR_READNONE); + break; + case 16: + result = ac_build_intrinsic(ctx, "llvm.ctpop.i16", ctx->i16, + (LLVMValueRef []) { src0 }, 1, + AC_FUNC_ATTR_READNONE); + + result = LLVMBuildZExt(ctx->builder, result, ctx->i32, ""); + break; 
+ case 8: + result = ac_build_intrinsic(ctx, "llvm.ctpop.i8", ctx->i8, + (LLVMValueRef []) { src0 }, 1, + AC_FUNC_ATTR_READNONE); + + result = LLVMBuildZExt(ctx->builder, result, ctx->i32, ""); + break; + default: + unreachable(!"invalid bitsize"); + break; + } + + return result; +} + +LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx, + LLVMValueRef src0) +{ + LLVMValueRef result; + unsigned bitsize; + + bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0)); + + switch (bitsize) { + case 64: + result = ac_build_intrinsic(ctx, "llvm.bitreverse.i64", ctx->i64, + (LLVMValueRef []) { src0 }, 1, + AC_FUNC_ATTR_READNONE); + + result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, ""); + break; + case 32: + result = ac_build_intrinsic(ctx, "llvm.bitreverse.i32", ctx->i32, + (LLVMValueRef []) { src0 }, 1, + AC_FUNC_ATTR_READNONE); + break; + case 16: + result = ac_build_intrinsic(ctx, "llvm.bitreverse.i16", ctx->i16, + (LLVMValueRef []) { src0 }, 1, + AC_FUNC_ATTR_READNONE); + + result = LLVMBuildZExt(ctx->builder, result, ctx->i32, ""); + break; + case 8: + result = ac_build_intrinsic(ctx, "llvm.bitreverse.i8", ctx->i8, + (LLVMValueRef []) { src0 }, 1, + AC_FUNC_ATTR_READNONE); + + result = LLVMBuildZExt(ctx->builder, result, ctx->i32, ""); + break; + default: + unreachable(!"invalid bitsize"); + break; + } + + return result; +} + +#define AC_EXP_TARGET 0 +#define AC_EXP_ENABLED_CHANNELS 1 +#define AC_EXP_OUT0 2 + +enum ac_ir_type { + AC_IR_UNDEF, + AC_IR_CONST, + AC_IR_VALUE, +}; + +struct ac_vs_exp_chan +{ + LLVMValueRef value; + float const_float; + enum ac_ir_type type; +}; + +struct ac_vs_exp_inst { + unsigned offset; + LLVMValueRef inst; + struct ac_vs_exp_chan chan[4]; +}; + +struct ac_vs_exports { + unsigned num; + struct ac_vs_exp_inst exp[VARYING_SLOT_MAX]; +}; + +/* Return true if the PARAM export has been eliminated. */ +static bool ac_eliminate_const_output(uint8_t *vs_output_param_offset, + uint32_t num_outputs, + struct ac_vs_exp_inst *exp) +{ + unsigned i, default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */ + bool is_zero[4] = {}, is_one[4] = {}; + + for (i = 0; i < 4; i++) { + /* It's a constant expression. Undef outputs are eliminated too. */ + if (exp->chan[i].type == AC_IR_UNDEF) { + is_zero[i] = true; + is_one[i] = true; + } else if (exp->chan[i].type == AC_IR_CONST) { + if (exp->chan[i].const_float == 0) + is_zero[i] = true; + else if (exp->chan[i].const_float == 1) + is_one[i] = true; + else + return false; /* other constant */ + } else + return false; + } + + /* Only certain combinations of 0 and 1 can be eliminated. */ + if (is_zero[0] && is_zero[1] && is_zero[2]) + default_val = is_zero[3] ? 0 : 1; + else if (is_one[0] && is_one[1] && is_one[2]) + default_val = is_zero[3] ? 2 : 3; + else + return false; + + /* The PARAM export can be represented as DEFAULT_VAL. Kill it. */ + LLVMInstructionEraseFromParent(exp->inst); + + /* Change OFFSET to DEFAULT_VAL. */ + for (i = 0; i < num_outputs; i++) { + if (vs_output_param_offset[i] == exp->offset) { + vs_output_param_offset[i] = + AC_EXP_PARAM_DEFAULT_VAL_0000 + default_val; + break; + } + } + return true; +} + +static bool ac_eliminate_duplicated_output(struct ac_llvm_context *ctx, + uint8_t *vs_output_param_offset, + uint32_t num_outputs, + struct ac_vs_exports *processed, + struct ac_vs_exp_inst *exp) +{ + unsigned p, copy_back_channels = 0; + + /* See if the output is already in the list of processed outputs. + * The LLVMValueRef comparison relies on SSA. 
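+ * (Because LLVM IR is in SSA form, a value is defined exactly once, so two equal LLVMValueRef handles are guaranteed to refer to the same computation; a plain pointer comparison is therefore sufficient to detect a duplicated channel.)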
+ */ + for (p = 0; p < processed->num; p++) { + bool different = false; + + for (unsigned j = 0; j < 4; j++) { + struct ac_vs_exp_chan *c1 = &processed->exp[p].chan[j]; + struct ac_vs_exp_chan *c2 = &exp->chan[j]; + + /* Treat undef as a match. */ + if (c2->type == AC_IR_UNDEF) + continue; + + /* If c1 is undef but c2 isn't, we can copy c2 to c1 + * and consider the instruction duplicated. + */ + if (c1->type == AC_IR_UNDEF) { + copy_back_channels |= 1 << j; + continue; + } + + /* Test whether the channels are not equal. */ + if (c1->type != c2->type || + (c1->type == AC_IR_CONST && + c1->const_float != c2->const_float) || + (c1->type == AC_IR_VALUE && + c1->value != c2->value)) { + different = true; + break; + } + } + if (!different) + break; + + copy_back_channels = 0; + } + if (p == processed->num) + return false; + + /* If a match was found, but the matching export has undef where the new + * one has a normal value, copy the normal value to the undef channel. + */ + struct ac_vs_exp_inst *match = &processed->exp[p]; + + /* Get current enabled channels mask. */ + LLVMValueRef arg = LLVMGetOperand(match->inst, AC_EXP_ENABLED_CHANNELS); + unsigned enabled_channels = LLVMConstIntGetZExtValue(arg); + + while (copy_back_channels) { + unsigned chan = u_bit_scan(&copy_back_channels); + + assert(match->chan[chan].type == AC_IR_UNDEF); + LLVMSetOperand(match->inst, AC_EXP_OUT0 + chan, + exp->chan[chan].value); + match->chan[chan] = exp->chan[chan]; + + /* Update number of enabled channels because the original mask + * is not always 0xf. + */ + enabled_channels |= (1 << chan); + LLVMSetOperand(match->inst, AC_EXP_ENABLED_CHANNELS, + LLVMConstInt(ctx->i32, enabled_channels, 0)); + } + + /* The PARAM export is duplicated. Kill it. */ + LLVMInstructionEraseFromParent(exp->inst); + + /* Change OFFSET to the matching export. */ + for (unsigned i = 0; i < num_outputs; i++) { + if (vs_output_param_offset[i] == exp->offset) { + vs_output_param_offset[i] = match->offset; + break; + } + } + return true; +} + +void ac_optimize_vs_outputs(struct ac_llvm_context *ctx, + LLVMValueRef main_fn, + uint8_t *vs_output_param_offset, + uint32_t num_outputs, + uint8_t *num_param_exports) +{ + LLVMBasicBlockRef bb; + bool removed_any = false; + struct ac_vs_exports exports; + + exports.num = 0; + + /* Process all LLVM instructions. */ + bb = LLVMGetFirstBasicBlock(main_fn); + while (bb) { + LLVMValueRef inst = LLVMGetFirstInstruction(bb); + + while (inst) { + LLVMValueRef cur = inst; + inst = LLVMGetNextInstruction(inst); + struct ac_vs_exp_inst exp; + + if (LLVMGetInstructionOpcode(cur) != LLVMCall) + continue; + + LLVMValueRef callee = ac_llvm_get_called_value(cur); + + if (!ac_llvm_is_function(callee)) + continue; + + const char *name = LLVMGetValueName(callee); + unsigned num_args = LLVMCountParams(callee); + + /* Check if this is an export instruction. */ + if ((num_args != 9 && num_args != 8) || + (strcmp(name, "llvm.SI.export") && + strcmp(name, "llvm.amdgcn.exp.f32"))) + continue; + + LLVMValueRef arg = LLVMGetOperand(cur, AC_EXP_TARGET); + unsigned target = LLVMConstIntGetZExtValue(arg); + + if (target < V_008DFC_SQ_EXP_PARAM) + continue; + + target -= V_008DFC_SQ_EXP_PARAM; + + /* Parse the instruction. 
*/ + memset(&exp, 0, sizeof(exp)); + exp.offset = target; + exp.inst = cur; + + for (unsigned i = 0; i < 4; i++) { + LLVMValueRef v = LLVMGetOperand(cur, AC_EXP_OUT0 + i); + + exp.chan[i].value = v; + + if (LLVMIsUndef(v)) { + exp.chan[i].type = AC_IR_UNDEF; + } else if (LLVMIsAConstantFP(v)) { + LLVMBool loses_info; + exp.chan[i].type = AC_IR_CONST; + exp.chan[i].const_float = + LLVMConstRealGetDouble(v, &loses_info); + } else { + exp.chan[i].type = AC_IR_VALUE; + } + } + + /* Eliminate constant and duplicated PARAM exports. */ + if (ac_eliminate_const_output(vs_output_param_offset, + num_outputs, &exp) || + ac_eliminate_duplicated_output(ctx, + vs_output_param_offset, + num_outputs, &exports, + &exp)) { + removed_any = true; + } else { + exports.exp[exports.num++] = exp; + } + } + bb = LLVMGetNextBasicBlock(bb); + } + + /* Remove holes in export memory due to removed PARAM exports. + * This is done by renumbering all PARAM exports. + */ + if (removed_any) { + uint8_t old_offset[VARYING_SLOT_MAX]; + unsigned out, i; + + /* Make a copy of the offsets. We need the old version while + * we are modifying some of them. */ + memcpy(old_offset, vs_output_param_offset, + sizeof(old_offset)); + + for (i = 0; i < exports.num; i++) { + unsigned offset = exports.exp[i].offset; + + /* Update vs_output_param_offset. Multiple outputs can + * have the same offset. + */ + for (out = 0; out < num_outputs; out++) { + if (old_offset[out] == offset) + vs_output_param_offset[out] = i; + } + + /* Change the PARAM offset in the instruction. */ + LLVMSetOperand(exports.exp[i].inst, AC_EXP_TARGET, + LLVMConstInt(ctx->i32, + V_008DFC_SQ_EXP_PARAM + i, 0)); + } + *num_param_exports = exports.num; + } +} + +void ac_init_exec_full_mask(struct ac_llvm_context *ctx) +{ + LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0); + ac_build_intrinsic(ctx, + "llvm.amdgcn.init.exec", ctx->voidt, + &full_mask, 1, AC_FUNC_ATTR_CONVERGENT); +} + +void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx) +{ + unsigned lds_size = ctx->chip_class >= GFX7 ? 65536 : 32768; + ctx->lds = LLVMBuildIntToPtr(ctx->builder, ctx->i32_0, + LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), AC_ADDR_SPACE_LDS), + "lds"); +} + +LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx, + LLVMValueRef dw_addr) +{ + return LLVMBuildLoad(ctx->builder, ac_build_gep0(ctx, ctx->lds, dw_addr), ""); +} + +void ac_lds_store(struct ac_llvm_context *ctx, + LLVMValueRef dw_addr, + LLVMValueRef value) +{ + value = ac_to_integer(ctx, value); + ac_build_indexed_store(ctx, ctx->lds, + dw_addr, value); +} + +LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx, + LLVMTypeRef dst_type, + LLVMValueRef src0) +{ + unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0)); + const char *intrin_name; + LLVMTypeRef type; + LLVMValueRef zero; + + switch (src0_bitsize) { + case 64: + intrin_name = "llvm.cttz.i64"; + type = ctx->i64; + zero = ctx->i64_0; + break; + case 32: + intrin_name = "llvm.cttz.i32"; + type = ctx->i32; + zero = ctx->i32_0; + break; + case 16: + intrin_name = "llvm.cttz.i16"; + type = ctx->i16; + zero = ctx->i16_0; + break; + case 8: + intrin_name = "llvm.cttz.i8"; + type = ctx->i8; + zero = ctx->i8_0; + break; + default: + unreachable(!"invalid bitsize"); + } + + LLVMValueRef params[2] = { + src0, + + /* The value of 1 means that ffs(x=0) = undef, so LLVM won't + * add special code to check for x=0. The reason is that + * the LLVM behavior for x=0 is different from what we + * need here. 
However, LLVM also assumes that ffs(x) is + * in [0, 31], but GLSL expects that ffs(0) = -1, so + * a conditional assignment to handle 0 is still required. + * + * The hardware already implements the correct behavior. + */ + ctx->i1true, + }; + + LLVMValueRef lsb = ac_build_intrinsic(ctx, intrin_name, type, + params, 2, + AC_FUNC_ATTR_READNONE); + + if (src0_bitsize == 64) { + lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, ""); + } else if (src0_bitsize < 32) { + lsb = LLVMBuildSExt(ctx->builder, lsb, ctx->i32, ""); + } + + /* TODO: We need an intrinsic to skip this conditional. */ + /* Check for zero: */ + return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, + LLVMIntEQ, src0, + zero, ""), + LLVMConstInt(ctx->i32, -1, 0), lsb, ""); +} + +LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type) +{ + return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST); +} + +LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type) +{ + return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST_32BIT); +} + +static struct ac_llvm_flow * +get_current_flow(struct ac_llvm_context *ctx) +{ + if (ctx->flow->depth > 0) + return &ctx->flow->stack[ctx->flow->depth - 1]; + return NULL; +} + +static struct ac_llvm_flow * +get_innermost_loop(struct ac_llvm_context *ctx) +{ + for (unsigned i = ctx->flow->depth; i > 0; --i) { + if (ctx->flow->stack[i - 1].loop_entry_block) + return &ctx->flow->stack[i - 1]; + } + return NULL; +} + +static struct ac_llvm_flow * +push_flow(struct ac_llvm_context *ctx) +{ + struct ac_llvm_flow *flow; + + if (ctx->flow->depth >= ctx->flow->depth_max) { + unsigned new_max = MAX2(ctx->flow->depth << 1, + AC_LLVM_INITIAL_CF_DEPTH); + + ctx->flow->stack = realloc(ctx->flow->stack, new_max * sizeof(*ctx->flow->stack)); + ctx->flow->depth_max = new_max; + } + + flow = &ctx->flow->stack[ctx->flow->depth]; + ctx->flow->depth++; + + flow->next_block = NULL; + flow->loop_entry_block = NULL; + return flow; +} + +static void set_basicblock_name(LLVMBasicBlockRef bb, const char *base, + int label_id) +{ + char buf[32]; + snprintf(buf, sizeof(buf), "%s%d", base, label_id); + LLVMSetValueName(LLVMBasicBlockAsValue(bb), buf); +} + +/* Append a basic block at the level of the parent flow. + */ +static LLVMBasicBlockRef append_basic_block(struct ac_llvm_context *ctx, + const char *name) +{ + assert(ctx->flow->depth >= 1); + + if (ctx->flow->depth >= 2) { + struct ac_llvm_flow *flow = &ctx->flow->stack[ctx->flow->depth - 2]; + + return LLVMInsertBasicBlockInContext(ctx->context, + flow->next_block, name); + } + + LLVMValueRef main_fn = + LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder)); + return LLVMAppendBasicBlockInContext(ctx->context, main_fn, name); +} + +/* Emit a branch to the given default target for the current block if + * applicable -- that is, if the current block does not already contain a + * branch from a break or continue. 
+ */ +static void emit_default_branch(LLVMBuilderRef builder, + LLVMBasicBlockRef target) +{ + if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(builder))) + LLVMBuildBr(builder, target); +} + +void ac_build_bgnloop(struct ac_llvm_context *ctx, int label_id) +{ + struct ac_llvm_flow *flow = push_flow(ctx); + flow->loop_entry_block = append_basic_block(ctx, "LOOP"); + flow->next_block = append_basic_block(ctx, "ENDLOOP"); + set_basicblock_name(flow->loop_entry_block, "loop", label_id); + LLVMBuildBr(ctx->builder, flow->loop_entry_block); + LLVMPositionBuilderAtEnd(ctx->builder, flow->loop_entry_block); +} + +void ac_build_break(struct ac_llvm_context *ctx) +{ + struct ac_llvm_flow *flow = get_innermost_loop(ctx); + LLVMBuildBr(ctx->builder, flow->next_block); +} + +void ac_build_continue(struct ac_llvm_context *ctx) +{ + struct ac_llvm_flow *flow = get_innermost_loop(ctx); + LLVMBuildBr(ctx->builder, flow->loop_entry_block); +} + +void ac_build_else(struct ac_llvm_context *ctx, int label_id) +{ + struct ac_llvm_flow *current_branch = get_current_flow(ctx); + LLVMBasicBlockRef endif_block; + + assert(!current_branch->loop_entry_block); + + endif_block = append_basic_block(ctx, "ENDIF"); + emit_default_branch(ctx->builder, endif_block); + + LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block); + set_basicblock_name(current_branch->next_block, "else", label_id); + + current_branch->next_block = endif_block; +} + +void ac_build_endif(struct ac_llvm_context *ctx, int label_id) +{ + struct ac_llvm_flow *current_branch = get_current_flow(ctx); + + assert(!current_branch->loop_entry_block); + + emit_default_branch(ctx->builder, current_branch->next_block); + LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block); + set_basicblock_name(current_branch->next_block, "endif", label_id); + + ctx->flow->depth--; +} + +void ac_build_endloop(struct ac_llvm_context *ctx, int label_id) +{ + struct ac_llvm_flow *current_loop = get_current_flow(ctx); + + assert(current_loop->loop_entry_block); + + emit_default_branch(ctx->builder, current_loop->loop_entry_block); + + LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block); + set_basicblock_name(current_loop->next_block, "endloop", label_id); + ctx->flow->depth--; +} + +void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id) +{ + struct ac_llvm_flow *flow = push_flow(ctx); + LLVMBasicBlockRef if_block; + + if_block = append_basic_block(ctx, "IF"); + flow->next_block = append_basic_block(ctx, "ELSE"); + set_basicblock_name(if_block, "if", label_id); + LLVMBuildCondBr(ctx->builder, cond, if_block, flow->next_block); + LLVMPositionBuilderAtEnd(ctx->builder, if_block); +} + +void ac_build_if(struct ac_llvm_context *ctx, LLVMValueRef value, + int label_id) +{ + LLVMValueRef cond = LLVMBuildFCmp(ctx->builder, LLVMRealUNE, + value, ctx->f32_0, ""); + ac_build_ifcc(ctx, cond, label_id); +} + +void ac_build_uif(struct ac_llvm_context *ctx, LLVMValueRef value, + int label_id) +{ + LLVMValueRef cond = LLVMBuildICmp(ctx->builder, LLVMIntNE, + ac_to_integer(ctx, value), + ctx->i32_0, ""); + ac_build_ifcc(ctx, cond, label_id); +} + +LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type, + const char *name) +{ + LLVMBuilderRef builder = ac->builder; + LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder); + LLVMValueRef function = LLVMGetBasicBlockParent(current_block); + LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function); + LLVMValueRef first_instr = 
LLVMGetFirstInstruction(first_block); + LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ac->context); + LLVMValueRef res; + + if (first_instr) { + LLVMPositionBuilderBefore(first_builder, first_instr); + } else { + LLVMPositionBuilderAtEnd(first_builder, first_block); + } + + res = LLVMBuildAlloca(first_builder, type, name); + LLVMDisposeBuilder(first_builder); + return res; +} + +LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac, + LLVMTypeRef type, const char *name) +{ + LLVMValueRef ptr = ac_build_alloca_undef(ac, type, name); + LLVMBuildStore(ac->builder, LLVMConstNull(type), ptr); + return ptr; +} + +LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr, + LLVMTypeRef type) +{ + int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr)); + return LLVMBuildBitCast(ctx->builder, ptr, + LLVMPointerType(type, addr_space), ""); +} + +LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value, + unsigned count) +{ + unsigned num_components = ac_get_llvm_num_components(value); + if (count == num_components) + return value; + + LLVMValueRef masks[MAX2(count, 2)]; + masks[0] = ctx->i32_0; + masks[1] = ctx->i32_1; + for (unsigned i = 2; i < count; i++) + masks[i] = LLVMConstInt(ctx->i32, i, false); + + if (count == 1) + return LLVMBuildExtractElement(ctx->builder, value, masks[0], + ""); + + LLVMValueRef swizzle = LLVMConstVector(masks, count); + return LLVMBuildShuffleVector(ctx->builder, value, value, swizzle, ""); +} + +LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param, + unsigned rshift, unsigned bitwidth) +{ + LLVMValueRef value = param; + if (rshift) + value = LLVMBuildLShr(ctx->builder, value, + LLVMConstInt(ctx->i32, rshift, false), ""); + + if (rshift + bitwidth < 32) { + unsigned mask = (1 << bitwidth) - 1; + value = LLVMBuildAnd(ctx->builder, value, + LLVMConstInt(ctx->i32, mask, false), ""); + } + return value; +} + +/* Adjust the sample index according to FMASK. + * + * For uncompressed MSAA surfaces, FMASK should return 0x76543210, + * which is the identity mapping. Each nibble says which physical sample + * should be fetched to get that sample. + * + * For example, 0x11111100 means there are only 2 samples stored and + * the second sample covers 3/4 of the pixel. When reading samples 0 + * and 1, return physical sample 0 (determined by the first two 0s + * in FMASK), otherwise return physical sample 1. + * + * The sample index should be adjusted as follows: + * addr[sample_index] = (fmask >> (addr[sample_index] * 4)) & 0xF; + */ +void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask, + LLVMValueRef *addr, bool is_array_tex) +{ + struct ac_image_args fmask_load = {}; + fmask_load.opcode = ac_image_load; + fmask_load.resource = fmask; + fmask_load.dmask = 0xf; + fmask_load.dim = is_array_tex ? ac_image_2darray : ac_image_2d; + fmask_load.attributes = AC_FUNC_ATTR_READNONE; + + fmask_load.coords[0] = addr[0]; + fmask_load.coords[1] = addr[1]; + if (is_array_tex) + fmask_load.coords[2] = addr[2]; + + LLVMValueRef fmask_value = ac_build_image_opcode(ac, &fmask_load); + fmask_value = LLVMBuildExtractElement(ac->builder, fmask_value, + ac->i32_0, ""); + + /* Apply the formula. */ + unsigned sample_chan = is_array_tex ? 
3 : 2; + LLVMValueRef final_sample; + final_sample = LLVMBuildMul(ac->builder, addr[sample_chan], + LLVMConstInt(ac->i32, 4, 0), ""); + final_sample = LLVMBuildLShr(ac->builder, fmask_value, final_sample, ""); + /* Mask the sample index by 0x7, because 0x8 means an unknown value + * with EQAA, so those will map to 0. */ + final_sample = LLVMBuildAnd(ac->builder, final_sample, + LLVMConstInt(ac->i32, 0x7, 0), ""); + + /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK + * resource descriptor is 0 (invalid). + */ + LLVMValueRef tmp; + tmp = LLVMBuildBitCast(ac->builder, fmask, ac->v8i32, ""); + tmp = LLVMBuildExtractElement(ac->builder, tmp, ac->i32_1, ""); + tmp = LLVMBuildICmp(ac->builder, LLVMIntNE, tmp, ac->i32_0, ""); + + /* Replace the MSAA sample index. */ + addr[sample_chan] = LLVMBuildSelect(ac->builder, tmp, final_sample, + addr[sample_chan], ""); +} + +static LLVMValueRef +_ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane) +{ + ac_build_optimization_barrier(ctx, &src); + return ac_build_intrinsic(ctx, + lane == NULL ? "llvm.amdgcn.readfirstlane" : "llvm.amdgcn.readlane", + LLVMTypeOf(src), (LLVMValueRef []) { + src, lane }, + lane == NULL ? 1 : 2, + AC_FUNC_ATTR_READNONE | + AC_FUNC_ATTR_CONVERGENT); +} + +/** + * Builds the "llvm.amdgcn.readlane" or "llvm.amdgcn.readfirstlane" intrinsic. + * @param ctx + * @param src + * @param lane - id of the lane or NULL for the first active lane + * @return value of the lane + */ +LLVMValueRef +ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane) +{ + LLVMTypeRef src_type = LLVMTypeOf(src); + src = ac_to_integer(ctx, src); + unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src)); + LLVMValueRef ret; + + if (bits == 32) { + ret = _ac_build_readlane(ctx, src, lane); + } else { + assert(bits % 32 == 0); + LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32); + LLVMValueRef src_vector = + LLVMBuildBitCast(ctx->builder, src, vec_type, ""); + ret = LLVMGetUndef(vec_type); + for (unsigned i = 0; i < bits / 32; i++) { + src = LLVMBuildExtractElement(ctx->builder, src_vector, + LLVMConstInt(ctx->i32, i, 0), ""); + LLVMValueRef ret_comp = _ac_build_readlane(ctx, src, lane); + ret = LLVMBuildInsertElement(ctx->builder, ret, ret_comp, + LLVMConstInt(ctx->i32, i, 0), ""); + } + } + if (LLVMGetTypeKind(src_type) == LLVMPointerTypeKind) + return LLVMBuildIntToPtr(ctx->builder, ret, src_type, ""); + return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); +} + +LLVMValueRef +ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value, LLVMValueRef lane) +{ + return ac_build_intrinsic(ctx, "llvm.amdgcn.writelane", ctx->i32, + (LLVMValueRef []) {value, lane, src}, 3, + AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); +} + +LLVMValueRef +ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask) +{ + if (ctx->wave_size == 32) { + return ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32, + (LLVMValueRef []) { mask, ctx->i32_0 }, + 2, AC_FUNC_ATTR_READNONE); + } + LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask, + LLVMVectorType(ctx->i32, 2), + ""); + LLVMValueRef mask_lo = LLVMBuildExtractElement(ctx->builder, mask_vec, + ctx->i32_0, ""); + LLVMValueRef mask_hi = LLVMBuildExtractElement(ctx->builder, mask_vec, + ctx->i32_1, ""); + LLVMValueRef val = + ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32, + (LLVMValueRef []) { mask_lo, ctx->i32_0 }, + 2, AC_FUNC_ATTR_READNONE); + val = ac_build_intrinsic(ctx, 
"llvm.amdgcn.mbcnt.hi", ctx->i32, + (LLVMValueRef []) { mask_hi, val }, + 2, AC_FUNC_ATTR_READNONE); + return val; +} + +enum dpp_ctrl { + _dpp_quad_perm = 0x000, + _dpp_row_sl = 0x100, + _dpp_row_sr = 0x110, + _dpp_row_rr = 0x120, + dpp_wf_sl1 = 0x130, + dpp_wf_rl1 = 0x134, + dpp_wf_sr1 = 0x138, + dpp_wf_rr1 = 0x13C, + dpp_row_mirror = 0x140, + dpp_row_half_mirror = 0x141, + dpp_row_bcast15 = 0x142, + dpp_row_bcast31 = 0x143 +}; + +static inline enum dpp_ctrl +dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3) +{ + assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4); + return _dpp_quad_perm | lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6); +} + +static inline enum dpp_ctrl +dpp_row_sl(unsigned amount) +{ + assert(amount > 0 && amount < 16); + return _dpp_row_sl | amount; +} + +static inline enum dpp_ctrl +dpp_row_sr(unsigned amount) +{ + assert(amount > 0 && amount < 16); + return _dpp_row_sr | amount; +} + +static LLVMValueRef +_ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src, + enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask, + bool bound_ctrl) +{ + return ac_build_intrinsic(ctx, "llvm.amdgcn.update.dpp.i32", + LLVMTypeOf(old), + (LLVMValueRef[]) { + old, src, + LLVMConstInt(ctx->i32, dpp_ctrl, 0), + LLVMConstInt(ctx->i32, row_mask, 0), + LLVMConstInt(ctx->i32, bank_mask, 0), + LLVMConstInt(ctx->i1, bound_ctrl, 0) }, + 6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); +} + +static LLVMValueRef +ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src, + enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask, + bool bound_ctrl) +{ + LLVMTypeRef src_type = LLVMTypeOf(src); + src = ac_to_integer(ctx, src); + old = ac_to_integer(ctx, old); + unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src)); + LLVMValueRef ret; + if (bits == 32) { + ret = _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask, + bank_mask, bound_ctrl); + } else { + assert(bits % 32 == 0); + LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32); + LLVMValueRef src_vector = + LLVMBuildBitCast(ctx->builder, src, vec_type, ""); + LLVMValueRef old_vector = + LLVMBuildBitCast(ctx->builder, old, vec_type, ""); + ret = LLVMGetUndef(vec_type); + for (unsigned i = 0; i < bits / 32; i++) { + src = LLVMBuildExtractElement(ctx->builder, src_vector, + LLVMConstInt(ctx->i32, i, + 0), ""); + old = LLVMBuildExtractElement(ctx->builder, old_vector, + LLVMConstInt(ctx->i32, i, + 0), ""); + LLVMValueRef ret_comp = _ac_build_dpp(ctx, old, src, + dpp_ctrl, + row_mask, + bank_mask, + bound_ctrl); + ret = LLVMBuildInsertElement(ctx->builder, ret, + ret_comp, + LLVMConstInt(ctx->i32, i, + 0), ""); + } + } + return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); +} + +static LLVMValueRef +_ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel, + bool exchange_rows, bool bound_ctrl) +{ + LLVMValueRef args[6] = { + src, + src, + LLVMConstInt(ctx->i32, sel, false), + LLVMConstInt(ctx->i32, sel >> 32, false), + ctx->i1true, /* fi */ + bound_ctrl ? ctx->i1true : ctx->i1false, + }; + return ac_build_intrinsic(ctx, exchange_rows ? 
"llvm.amdgcn.permlanex16" + : "llvm.amdgcn.permlane16", + ctx->i32, args, 6, + AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); +} + +static LLVMValueRef +ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel, + bool exchange_rows, bool bound_ctrl) +{ + LLVMTypeRef src_type = LLVMTypeOf(src); + src = ac_to_integer(ctx, src); + unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src)); + LLVMValueRef ret; + if (bits == 32) { + ret = _ac_build_permlane16(ctx, src, sel, exchange_rows, + bound_ctrl); + } else { + assert(bits % 32 == 0); + LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32); + LLVMValueRef src_vector = + LLVMBuildBitCast(ctx->builder, src, vec_type, ""); + ret = LLVMGetUndef(vec_type); + for (unsigned i = 0; i < bits / 32; i++) { + src = LLVMBuildExtractElement(ctx->builder, src_vector, + LLVMConstInt(ctx->i32, i, + 0), ""); + LLVMValueRef ret_comp = + _ac_build_permlane16(ctx, src, sel, + exchange_rows, + bound_ctrl); + ret = LLVMBuildInsertElement(ctx->builder, ret, + ret_comp, + LLVMConstInt(ctx->i32, i, + 0), ""); + } + } + return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); +} + +static inline unsigned +ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask) +{ + assert(and_mask < 32 && or_mask < 32 && xor_mask < 32); + return and_mask | (or_mask << 5) | (xor_mask << 10); +} + +static LLVMValueRef +_ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask) +{ + return ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle", + LLVMTypeOf(src), (LLVMValueRef []) { + src, LLVMConstInt(ctx->i32, mask, 0) }, + 2, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); +} + +LLVMValueRef +ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask) +{ + LLVMTypeRef src_type = LLVMTypeOf(src); + src = ac_to_integer(ctx, src); + unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src)); + LLVMValueRef ret; + if (bits == 32) { + ret = _ac_build_ds_swizzle(ctx, src, mask); + } else { + assert(bits % 32 == 0); + LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32); + LLVMValueRef src_vector = + LLVMBuildBitCast(ctx->builder, src, vec_type, ""); + ret = LLVMGetUndef(vec_type); + for (unsigned i = 0; i < bits / 32; i++) { + src = LLVMBuildExtractElement(ctx->builder, src_vector, + LLVMConstInt(ctx->i32, i, + 0), ""); + LLVMValueRef ret_comp = _ac_build_ds_swizzle(ctx, src, + mask); + ret = LLVMBuildInsertElement(ctx->builder, ret, + ret_comp, + LLVMConstInt(ctx->i32, i, + 0), ""); + } + } + return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); +} + +static LLVMValueRef +ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src) +{ + char name[32], type[8]; + ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type)); + snprintf(name, sizeof(name), "llvm.amdgcn.wwm.%s", type); + return ac_build_intrinsic(ctx, name, LLVMTypeOf(src), + (LLVMValueRef []) { src }, 1, + AC_FUNC_ATTR_READNONE); +} + +static LLVMValueRef +ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src, + LLVMValueRef inactive) +{ + char name[33], type[8]; + LLVMTypeRef src_type = LLVMTypeOf(src); + src = ac_to_integer(ctx, src); + inactive = ac_to_integer(ctx, inactive); + ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type)); + snprintf(name, sizeof(name), "llvm.amdgcn.set.inactive.%s", type); + LLVMValueRef ret = + ac_build_intrinsic(ctx, name, + LLVMTypeOf(src), (LLVMValueRef []) { + src, inactive }, 2, + AC_FUNC_ATTR_READNONE | + AC_FUNC_ATTR_CONVERGENT); + return LLVMBuildBitCast(ctx->builder, 
ret, src_type, ""); +} + +static LLVMValueRef +get_reduction_identity(struct ac_llvm_context *ctx, nir_op op, unsigned type_size) +{ + if (type_size == 4) { + switch (op) { + case nir_op_iadd: return ctx->i32_0; + case nir_op_fadd: return ctx->f32_0; + case nir_op_imul: return ctx->i32_1; + case nir_op_fmul: return ctx->f32_1; + case nir_op_imin: return LLVMConstInt(ctx->i32, INT32_MAX, 0); + case nir_op_umin: return LLVMConstInt(ctx->i32, UINT32_MAX, 0); + case nir_op_fmin: return LLVMConstReal(ctx->f32, INFINITY); + case nir_op_imax: return LLVMConstInt(ctx->i32, INT32_MIN, 0); + case nir_op_umax: return ctx->i32_0; + case nir_op_fmax: return LLVMConstReal(ctx->f32, -INFINITY); + case nir_op_iand: return LLVMConstInt(ctx->i32, -1, 0); + case nir_op_ior: return ctx->i32_0; + case nir_op_ixor: return ctx->i32_0; + default: + unreachable("bad reduction intrinsic"); + } + } else { /* type_size == 64bit */ + switch (op) { + case nir_op_iadd: return ctx->i64_0; + case nir_op_fadd: return ctx->f64_0; + case nir_op_imul: return ctx->i64_1; + case nir_op_fmul: return ctx->f64_1; + case nir_op_imin: return LLVMConstInt(ctx->i64, INT64_MAX, 0); + case nir_op_umin: return LLVMConstInt(ctx->i64, UINT64_MAX, 0); + case nir_op_fmin: return LLVMConstReal(ctx->f64, INFINITY); + case nir_op_imax: return LLVMConstInt(ctx->i64, INT64_MIN, 0); + case nir_op_umax: return ctx->i64_0; + case nir_op_fmax: return LLVMConstReal(ctx->f64, -INFINITY); + case nir_op_iand: return LLVMConstInt(ctx->i64, -1, 0); + case nir_op_ior: return ctx->i64_0; + case nir_op_ixor: return ctx->i64_0; + default: + unreachable("bad reduction intrinsic"); + } + } +} + +static LLVMValueRef +ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs, nir_op op) +{ + bool _64bit = ac_get_type_size(LLVMTypeOf(lhs)) == 8; + switch (op) { + case nir_op_iadd: return LLVMBuildAdd(ctx->builder, lhs, rhs, ""); + case nir_op_fadd: return LLVMBuildFAdd(ctx->builder, lhs, rhs, ""); + case nir_op_imul: return LLVMBuildMul(ctx->builder, lhs, rhs, ""); + case nir_op_fmul: return LLVMBuildFMul(ctx->builder, lhs, rhs, ""); + case nir_op_imin: return LLVMBuildSelect(ctx->builder, + LLVMBuildICmp(ctx->builder, LLVMIntSLT, lhs, rhs, ""), + lhs, rhs, ""); + case nir_op_umin: return LLVMBuildSelect(ctx->builder, + LLVMBuildICmp(ctx->builder, LLVMIntULT, lhs, rhs, ""), + lhs, rhs, ""); + case nir_op_fmin: return ac_build_intrinsic(ctx, + _64bit ? "llvm.minnum.f64" : "llvm.minnum.f32", + _64bit ? ctx->f64 : ctx->f32, + (LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE); + case nir_op_imax: return LLVMBuildSelect(ctx->builder, + LLVMBuildICmp(ctx->builder, LLVMIntSGT, lhs, rhs, ""), + lhs, rhs, ""); + case nir_op_umax: return LLVMBuildSelect(ctx->builder, + LLVMBuildICmp(ctx->builder, LLVMIntUGT, lhs, rhs, ""), + lhs, rhs, ""); + case nir_op_fmax: return ac_build_intrinsic(ctx, + _64bit ? "llvm.maxnum.f64" : "llvm.maxnum.f32", + _64bit ? ctx->f64 : ctx->f32, + (LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE); + case nir_op_iand: return LLVMBuildAnd(ctx->builder, lhs, rhs, ""); + case nir_op_ior: return LLVMBuildOr(ctx->builder, lhs, rhs, ""); + case nir_op_ixor: return LLVMBuildXor(ctx->builder, lhs, rhs, ""); + default: + unreachable("bad reduction intrinsic"); + } +} + +/** + * \param maxprefix specifies that the result only needs to be correct for a + * prefix of this many threads + * + * TODO: add inclusive and excluse scan functions for GFX6. 
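+ *
+ * Rough sketch of the DPP ladder below for an inclusive iadd scan on the
+ * pre-GFX10 path (GFX10 replaces the cross-row steps with permlane16 and
+ * readlane, since row_bcast is unavailable there):
+ *
+ *   result  = src
+ *   result += row_sr:1(src), row_sr:2(src), row_sr:3(src)   -> 4-lane prefix
+ *   result += row_sr:4(result)                              -> 8-lane prefix
+ *   result += row_sr:8(result)                              -> 16-lane (row) prefix
+ *   result += row_bcast:15 (row_mask 0xa)  adds each row total to the next row
+ *   result += row_bcast:31 (row_mask 0xc)  adds the low-half total to the high half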
+ */ +static LLVMValueRef +ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValueRef identity, + unsigned maxprefix, bool inclusive) +{ + LLVMValueRef result, tmp; + + if (ctx->chip_class >= GFX10) { + result = inclusive ? src : identity; + } else { + if (!inclusive) + src = ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false); + result = src; + } + if (maxprefix <= 1) + return result; + tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false); + result = ac_build_alu_op(ctx, result, tmp, op); + if (maxprefix <= 2) + return result; + tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false); + result = ac_build_alu_op(ctx, result, tmp, op); + if (maxprefix <= 3) + return result; + tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false); + result = ac_build_alu_op(ctx, result, tmp, op); + if (maxprefix <= 4) + return result; + tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false); + result = ac_build_alu_op(ctx, result, tmp, op); + if (maxprefix <= 8) + return result; + tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false); + result = ac_build_alu_op(ctx, result, tmp, op); + if (maxprefix <= 16) + return result; + + if (ctx->chip_class >= GFX10) { + /* dpp_row_bcast{15,31} are not supported on gfx10. */ + LLVMBuilderRef builder = ctx->builder; + LLVMValueRef tid = ac_get_thread_id(ctx); + LLVMValueRef cc; + /* TODO-GFX10: Can we get better code-gen by putting this into + * a branch so that LLVM generates EXEC mask manipulations? */ + if (inclusive) + tmp = result; + else + tmp = ac_build_alu_op(ctx, result, src, op); + tmp = ac_build_permlane16(ctx, tmp, ~(uint64_t)0, true, false); + tmp = ac_build_alu_op(ctx, result, tmp, op); + cc = LLVMBuildAnd(builder, tid, LLVMConstInt(ctx->i32, 16, false), ""); + cc = LLVMBuildICmp(builder, LLVMIntNE, cc, ctx->i32_0, ""); + result = LLVMBuildSelect(builder, cc, tmp, result, ""); + if (maxprefix <= 32) + return result; + + if (inclusive) + tmp = result; + else + tmp = ac_build_alu_op(ctx, result, src, op); + tmp = ac_build_readlane(ctx, tmp, LLVMConstInt(ctx->i32, 31, false)); + tmp = ac_build_alu_op(ctx, result, tmp, op); + cc = LLVMBuildICmp(builder, LLVMIntUGE, tid, + LLVMConstInt(ctx->i32, 32, false), ""); + result = LLVMBuildSelect(builder, cc, tmp, result, ""); + return result; + } + + tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false); + result = ac_build_alu_op(ctx, result, tmp, op); + if (maxprefix <= 32) + return result; + tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false); + result = ac_build_alu_op(ctx, result, tmp, op); + return result; +} + +LLVMValueRef +ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op) +{ + LLVMValueRef result; + + if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) { + LLVMBuilderRef builder = ctx->builder; + src = LLVMBuildZExt(builder, src, ctx->i32, ""); + result = ac_build_ballot(ctx, src); + result = ac_build_mbcnt(ctx, result); + result = LLVMBuildAdd(builder, result, src, ""); + return result; + } + + ac_build_optimization_barrier(ctx, &src); + + LLVMValueRef identity = + get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src))); + result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity), + LLVMTypeOf(identity), ""); + result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, true); + + return ac_build_wwm(ctx, result); +} + +LLVMValueRef +ac_build_exclusive_scan(struct 
ac_llvm_context *ctx, LLVMValueRef src, nir_op op) +{ + LLVMValueRef result; + + if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) { + LLVMBuilderRef builder = ctx->builder; + src = LLVMBuildZExt(builder, src, ctx->i32, ""); + result = ac_build_ballot(ctx, src); + result = ac_build_mbcnt(ctx, result); + return result; + } + + ac_build_optimization_barrier(ctx, &src); + + LLVMValueRef identity = + get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src))); + result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity), + LLVMTypeOf(identity), ""); + result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, false); + + return ac_build_wwm(ctx, result); +} + +LLVMValueRef +ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, unsigned cluster_size) +{ + if (cluster_size == 1) return src; + ac_build_optimization_barrier(ctx, &src); + LLVMValueRef result, swap; + LLVMValueRef identity = get_reduction_identity(ctx, op, + ac_get_type_size(LLVMTypeOf(src))); + result = LLVMBuildBitCast(ctx->builder, + ac_build_set_inactive(ctx, src, identity), + LLVMTypeOf(identity), ""); + swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2); + result = ac_build_alu_op(ctx, result, swap, op); + if (cluster_size == 2) return ac_build_wwm(ctx, result); + + swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1); + result = ac_build_alu_op(ctx, result, swap, op); + if (cluster_size == 4) return ac_build_wwm(ctx, result); + + if (ctx->chip_class >= GFX8) + swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror, 0xf, 0xf, false); + else + swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x04)); + result = ac_build_alu_op(ctx, result, swap, op); + if (cluster_size == 8) return ac_build_wwm(ctx, result); + + if (ctx->chip_class >= GFX8) + swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 0xf, 0xf, false); + else + swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x08)); + result = ac_build_alu_op(ctx, result, swap, op); + if (cluster_size == 16) return ac_build_wwm(ctx, result); + + if (ctx->chip_class >= GFX10) + swap = ac_build_permlane16(ctx, result, 0, true, false); + else if (ctx->chip_class >= GFX8 && cluster_size != 32) + swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false); + else + swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10)); + result = ac_build_alu_op(ctx, result, swap, op); + if (cluster_size == 32) return ac_build_wwm(ctx, result); + + if (ctx->chip_class >= GFX8) { + if (ctx->chip_class >= GFX10) + swap = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false)); + else + swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false); + result = ac_build_alu_op(ctx, result, swap, op); + result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0)); + return ac_build_wwm(ctx, result); + } else { + swap = ac_build_readlane(ctx, result, ctx->i32_0); + result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0)); + result = ac_build_alu_op(ctx, result, swap, op); + return ac_build_wwm(ctx, result); + } +} + +/** + * "Top half" of a scan that reduces per-wave values across an entire + * workgroup. + * + * The source value must be present in the highest lane of the wave, and the + * highest lane must be live. 
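+ *
+ * Concretely, this "top half" only stores the per-wave value from the last
+ * lane into ws->scratch[ws->waveidx]; the "bottom half" (after the caller's
+ * barrier) loads the per-wave values back from scratch and scans them within
+ * a single wave, with ws->maxwaves bounding how many lanes must be correct.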
+ */ +void +ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) +{ + if (ws->maxwaves <= 1) + return; + + const LLVMValueRef last_lane = LLVMConstInt(ctx->i32, ctx->wave_size - 1, false); + LLVMBuilderRef builder = ctx->builder; + LLVMValueRef tid = ac_get_thread_id(ctx); + LLVMValueRef tmp; + + tmp = LLVMBuildICmp(builder, LLVMIntEQ, tid, last_lane, ""); + ac_build_ifcc(ctx, tmp, 1000); + LLVMBuildStore(builder, ws->src, LLVMBuildGEP(builder, ws->scratch, &ws->waveidx, 1, "")); + ac_build_endif(ctx, 1000); +} + +/** + * "Bottom half" of a scan that reduces per-wave values across an entire + * workgroup. + * + * The caller must place a barrier between the top and bottom halves. + */ +void +ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) +{ + const LLVMTypeRef type = LLVMTypeOf(ws->src); + const LLVMValueRef identity = + get_reduction_identity(ctx, ws->op, ac_get_type_size(type)); + + if (ws->maxwaves <= 1) { + ws->result_reduce = ws->src; + ws->result_inclusive = ws->src; + ws->result_exclusive = identity; + return; + } + assert(ws->maxwaves <= 32); + + LLVMBuilderRef builder = ctx->builder; + LLVMValueRef tid = ac_get_thread_id(ctx); + LLVMBasicBlockRef bbs[2]; + LLVMValueRef phivalues_scan[2]; + LLVMValueRef tmp, tmp2; + + bbs[0] = LLVMGetInsertBlock(builder); + phivalues_scan[0] = LLVMGetUndef(type); + + if (ws->enable_reduce) + tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->numwaves, ""); + else if (ws->enable_inclusive) + tmp = LLVMBuildICmp(builder, LLVMIntULE, tid, ws->waveidx, ""); + else + tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->waveidx, ""); + ac_build_ifcc(ctx, tmp, 1001); + { + tmp = LLVMBuildLoad(builder, LLVMBuildGEP(builder, ws->scratch, &tid, 1, ""), ""); + + ac_build_optimization_barrier(ctx, &tmp); + + bbs[1] = LLVMGetInsertBlock(builder); + phivalues_scan[1] = ac_build_scan(ctx, ws->op, tmp, identity, ws->maxwaves, true); + } + ac_build_endif(ctx, 1001); + + const LLVMValueRef scan = ac_build_phi(ctx, type, 2, phivalues_scan, bbs); + + if (ws->enable_reduce) { + tmp = LLVMBuildSub(builder, ws->numwaves, ctx->i32_1, ""); + ws->result_reduce = ac_build_readlane(ctx, scan, tmp); + } + if (ws->enable_inclusive) + ws->result_inclusive = ac_build_readlane(ctx, scan, ws->waveidx); + if (ws->enable_exclusive) { + tmp = LLVMBuildSub(builder, ws->waveidx, ctx->i32_1, ""); + tmp = ac_build_readlane(ctx, scan, tmp); + tmp2 = LLVMBuildICmp(builder, LLVMIntEQ, ws->waveidx, ctx->i32_0, ""); + ws->result_exclusive = LLVMBuildSelect(builder, tmp2, identity, tmp, ""); + } +} + +/** + * Inclusive scan of a per-wave value across an entire workgroup. + * + * This implies an s_barrier instruction. + * + * Unlike ac_build_inclusive_scan, the caller \em must ensure that all threads + * of the workgroup are live. (This requirement cannot easily be relaxed in a + * useful manner because of the barrier in the algorithm.) + */ +void +ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) +{ + ac_build_wg_wavescan_top(ctx, ws); + ac_build_s_barrier(ctx); + ac_build_wg_wavescan_bottom(ctx, ws); +} + +/** + * "Top half" of a scan that reduces per-thread values across an entire + * workgroup. + * + * All lanes must be active when this code runs. 
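+ *
+ * Roughly: this runs the in-wave scan of ws->src (ac_build_inclusive_scan,
+ * or ac_build_exclusive_scan plus one ac_build_alu_op when ws->enable_exclusive
+ * is set, keeping the per-lane exclusive value in ws->extra), and then
+ * publishes the wave total through ac_build_wg_wavescan_top.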
+ */ +void +ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) +{ + if (ws->enable_exclusive) { + ws->extra = ac_build_exclusive_scan(ctx, ws->src, ws->op); + if (LLVMTypeOf(ws->src) == ctx->i1 && ws->op == nir_op_iadd) + ws->src = LLVMBuildZExt(ctx->builder, ws->src, ctx->i32, ""); + ws->src = ac_build_alu_op(ctx, ws->extra, ws->src, ws->op); + } else { + ws->src = ac_build_inclusive_scan(ctx, ws->src, ws->op); + } + + bool enable_inclusive = ws->enable_inclusive; + bool enable_exclusive = ws->enable_exclusive; + ws->enable_inclusive = false; + ws->enable_exclusive = ws->enable_exclusive || enable_inclusive; + ac_build_wg_wavescan_top(ctx, ws); + ws->enable_inclusive = enable_inclusive; + ws->enable_exclusive = enable_exclusive; +} + +/** + * "Bottom half" of a scan that reduces per-thread values across an entire + * workgroup. + * + * The caller must place a barrier between the top and bottom halves. + */ +void +ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) +{ + bool enable_inclusive = ws->enable_inclusive; + bool enable_exclusive = ws->enable_exclusive; + ws->enable_inclusive = false; + ws->enable_exclusive = ws->enable_exclusive || enable_inclusive; + ac_build_wg_wavescan_bottom(ctx, ws); + ws->enable_inclusive = enable_inclusive; + ws->enable_exclusive = enable_exclusive; + + /* ws->result_reduce is already the correct value */ + if (ws->enable_inclusive) + ws->result_inclusive = ac_build_alu_op(ctx, ws->result_inclusive, ws->src, ws->op); + if (ws->enable_exclusive) + ws->result_exclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->extra, ws->op); +} + +/** + * A scan that reduces per-thread values across an entire workgroup. + * + * The caller must ensure that all lanes are active when this code runs + * (WWM is insufficient!), because there is an implied barrier. 
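+ *
+ * Minimal usage sketch; value, wave_id, num_waves and lds_scratch stand for
+ * whatever the caller already has, and the field names are those of
+ * struct ac_wg_scan:
+ *
+ *   struct ac_wg_scan ws = {0};
+ *   ws.op = nir_op_iadd;
+ *   ws.src = value;             // per-thread input, clobbered
+ *   ws.enable_inclusive = true;
+ *   ws.waveidx = wave_id;       // index of this wave within the workgroup
+ *   ws.numwaves = num_waves;    // only needed when enable_reduce is set
+ *   ws.maxwaves = 16;           // compile-time bound, at most 32
+ *   ws.scratch = lds_scratch;   // LDS array with at least maxwaves entries
+ *   ac_build_wg_scan(ctx, &ws);
+ *   // ws.result_inclusive now holds the workgroup-wide inclusive scan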
+ */ +void +ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) +{ + ac_build_wg_scan_top(ctx, ws); + ac_build_s_barrier(ctx); + ac_build_wg_scan_bottom(ctx, ws); +} + +LLVMValueRef +ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, + unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3) +{ + unsigned mask = dpp_quad_perm(lane0, lane1, lane2, lane3); + if (ctx->chip_class >= GFX8) { + return ac_build_dpp(ctx, src, src, mask, 0xf, 0xf, false); + } else { + return ac_build_ds_swizzle(ctx, src, (1 << 15) | mask); + } +} + +LLVMValueRef +ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index) +{ + index = LLVMBuildMul(ctx->builder, index, LLVMConstInt(ctx->i32, 4, 0), ""); + return ac_build_intrinsic(ctx, + "llvm.amdgcn.ds.bpermute", ctx->i32, + (LLVMValueRef []) {index, src}, 2, + AC_FUNC_ATTR_READNONE | + AC_FUNC_ATTR_CONVERGENT); +} + +LLVMValueRef +ac_build_frexp_exp(struct ac_llvm_context *ctx, LLVMValueRef src0, + unsigned bitsize) +{ + LLVMTypeRef type; + char *intr; + + if (bitsize == 16) { + intr = "llvm.amdgcn.frexp.exp.i16.f16"; + type = ctx->i16; + } else if (bitsize == 32) { + intr = "llvm.amdgcn.frexp.exp.i32.f32"; + type = ctx->i32; + } else { + intr = "llvm.amdgcn.frexp.exp.i32.f64"; + type = ctx->i32; + } + + LLVMValueRef params[] = { + src0, + }; + return ac_build_intrinsic(ctx, intr, type, params, 1, + AC_FUNC_ATTR_READNONE); +} +LLVMValueRef +ac_build_frexp_mant(struct ac_llvm_context *ctx, LLVMValueRef src0, + unsigned bitsize) +{ + LLVMTypeRef type; + char *intr; + + if (bitsize == 16) { + intr = "llvm.amdgcn.frexp.mant.f16"; + type = ctx->f16; + } else if (bitsize == 32) { + intr = "llvm.amdgcn.frexp.mant.f32"; + type = ctx->f32; + } else { + intr = "llvm.amdgcn.frexp.mant.f64"; + type = ctx->f64; + } + + LLVMValueRef params[] = { + src0, + }; + return ac_build_intrinsic(ctx, intr, type, params, 1, + AC_FUNC_ATTR_READNONE); +} + +/* + * this takes an I,J coordinate pair, + * and works out the X and Y derivatives. + * it returns DDX(I), DDX(J), DDY(I), DDY(J). 
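+ *
+ * With the usual 2x2 quad layout (lane 0 = top-left, 1 = top-right,
+ * 2 = bottom-left), DDX is the top-right minus top-left difference (idx 1)
+ * and DDY is the bottom-left minus top-left difference (idx 2), so the
+ * returned vector is { ddx(i), ddx(j), ddy(i), ddy(j) }.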
+ */ +LLVMValueRef +ac_build_ddxy_interp(struct ac_llvm_context *ctx, LLVMValueRef interp_ij) +{ + LLVMValueRef result[4], a; + unsigned i; + + for (i = 0; i < 2; i++) { + a = LLVMBuildExtractElement(ctx->builder, interp_ij, + LLVMConstInt(ctx->i32, i, false), ""); + result[i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 1, a); + result[2+i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 2, a); + } + return ac_build_gather_values(ctx, result, 4); +} + +LLVMValueRef +ac_build_load_helper_invocation(struct ac_llvm_context *ctx) +{ + LLVMValueRef result = ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live", + ctx->i1, NULL, 0, + AC_FUNC_ATTR_READNONE); + result = LLVMBuildNot(ctx->builder, result, ""); + return LLVMBuildSExt(ctx->builder, result, ctx->i32, ""); +} + +LLVMValueRef ac_build_call(struct ac_llvm_context *ctx, LLVMValueRef func, + LLVMValueRef *args, unsigned num_args) +{ + LLVMValueRef ret = LLVMBuildCall(ctx->builder, func, args, num_args, ""); + LLVMSetInstructionCallConv(ret, LLVMGetFunctionCallConv(func)); + return ret; +} + +void +ac_export_mrt_z(struct ac_llvm_context *ctx, LLVMValueRef depth, + LLVMValueRef stencil, LLVMValueRef samplemask, + struct ac_export_args *args) +{ + unsigned mask = 0; + unsigned format = ac_get_spi_shader_z_format(depth != NULL, + stencil != NULL, + samplemask != NULL); + + assert(depth || stencil || samplemask); + + memset(args, 0, sizeof(*args)); + + args->valid_mask = 1; /* whether the EXEC mask is valid */ + args->done = 1; /* DONE bit */ + + /* Specify the target we are exporting */ + args->target = V_008DFC_SQ_EXP_MRTZ; + + args->compr = 0; /* COMP flag */ + args->out[0] = LLVMGetUndef(ctx->f32); /* R, depth */ + args->out[1] = LLVMGetUndef(ctx->f32); /* G, stencil test val[0:7], stencil op val[8:15] */ + args->out[2] = LLVMGetUndef(ctx->f32); /* B, sample mask */ + args->out[3] = LLVMGetUndef(ctx->f32); /* A, alpha to mask */ + + if (format == V_028710_SPI_SHADER_UINT16_ABGR) { + assert(!depth); + args->compr = 1; /* COMPR flag */ + + if (stencil) { + /* Stencil should be in X[23:16]. */ + stencil = ac_to_integer(ctx, stencil); + stencil = LLVMBuildShl(ctx->builder, stencil, + LLVMConstInt(ctx->i32, 16, 0), ""); + args->out[0] = ac_to_float(ctx, stencil); + mask |= 0x3; + } + if (samplemask) { + /* SampleMask should be in Y[15:0]. */ + args->out[1] = samplemask; + mask |= 0xc; + } + } else { + if (depth) { + args->out[0] = depth; + mask |= 0x1; + } + if (stencil) { + args->out[1] = stencil; + mask |= 0x2; + } + if (samplemask) { + args->out[2] = samplemask; + mask |= 0x4; + } + } + + /* GFX6 (except OLAND and HAINAN) has a bug that it only looks + * at the X writemask component. 
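+ *
+ * As an illustration of the final writemask: stencil-only in the compressed
+ * UINT16_ABGR layout yields mask = 0x3 (stencil packed into X), while
+ * depth + samplemask in the uncompressed layout yields mask = 0x1 | 0x4 = 0x5;
+ * on the GFX6 chips this comment describes, bit 0 is additionally forced on
+ * below so that the hardware always reads a valid X component.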
*/ + if (ctx->chip_class == GFX6 && + ctx->family != CHIP_OLAND && + ctx->family != CHIP_HAINAN) + mask |= 0x1; + + /* Specify which components to enable */ + args->enabled_channels = mask; +} + diff --git a/src/amd/llvm/ac_llvm_build.h b/src/amd/llvm/ac_llvm_build.h new file mode 100644 index 00000000000..013bf00041a --- /dev/null +++ b/src/amd/llvm/ac_llvm_build.h @@ -0,0 +1,744 @@ +/* + * Copyright 2016 Bas Nieuwenhuizen + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + */ +#ifndef AC_LLVM_BUILD_H +#define AC_LLVM_BUILD_H + +#include <stdbool.h> +#include <llvm-c/Core.h> +#include "compiler/nir/nir.h" +#include "amd_family.h" +#include "ac_shader_util.h" + +#ifdef __cplusplus +extern "C" { +#endif + +enum { + AC_ADDR_SPACE_FLAT = 0, /* Slower than global. */ + AC_ADDR_SPACE_GLOBAL = 1, + AC_ADDR_SPACE_GDS = 2, + AC_ADDR_SPACE_LDS = 3, + AC_ADDR_SPACE_CONST = 4, /* Global allowing SMEM. */ + AC_ADDR_SPACE_CONST_32BIT = 6, /* same as CONST, but the pointer type has 32 bits */ +}; + +#define AC_WAIT_LGKM (1 << 0) /* LDS, GDS, constant, message */ +#define AC_WAIT_VLOAD (1 << 1) /* VMEM load/sample instructions */ +#define AC_WAIT_VSTORE (1 << 2) /* VMEM store instructions */ + +struct ac_llvm_flow; +struct ac_llvm_compiler; +enum ac_float_mode; + +struct ac_llvm_flow_state { + struct ac_llvm_flow *stack; + unsigned depth_max; + unsigned depth; +}; + +struct ac_llvm_context { + LLVMContextRef context; + LLVMModuleRef module; + LLVMBuilderRef builder; + + LLVMTypeRef voidt; + LLVMTypeRef i1; + LLVMTypeRef i8; + LLVMTypeRef i16; + LLVMTypeRef i32; + LLVMTypeRef i64; + LLVMTypeRef intptr; + LLVMTypeRef f16; + LLVMTypeRef f32; + LLVMTypeRef f64; + LLVMTypeRef v2i16; + LLVMTypeRef v2i32; + LLVMTypeRef v3i32; + LLVMTypeRef v4i32; + LLVMTypeRef v2f32; + LLVMTypeRef v3f32; + LLVMTypeRef v4f32; + LLVMTypeRef v8i32; + LLVMTypeRef iN_wavemask; + LLVMTypeRef iN_ballotmask; + + LLVMValueRef i8_0; + LLVMValueRef i8_1; + LLVMValueRef i16_0; + LLVMValueRef i16_1; + LLVMValueRef i32_0; + LLVMValueRef i32_1; + LLVMValueRef i64_0; + LLVMValueRef i64_1; + LLVMValueRef f16_0; + LLVMValueRef f16_1; + LLVMValueRef f32_0; + LLVMValueRef f32_1; + LLVMValueRef f64_0; + LLVMValueRef f64_1; + LLVMValueRef i1true; + LLVMValueRef i1false; + + /* Since ac_nir_translate makes a local copy of ac_llvm_context, there + * are two ac_llvm_contexts. 
Declare a pointer here, so that the control + * flow stack is shared by both ac_llvm_contexts. + */ + struct ac_llvm_flow_state *flow; + + unsigned range_md_kind; + unsigned invariant_load_md_kind; + unsigned uniform_md_kind; + unsigned fpmath_md_kind; + LLVMValueRef fpmath_md_2p5_ulp; + LLVMValueRef empty_md; + + enum chip_class chip_class; + enum radeon_family family; + + unsigned wave_size; + unsigned ballot_mask_bits; + + LLVMValueRef lds; +}; + +void +ac_llvm_context_init(struct ac_llvm_context *ctx, + struct ac_llvm_compiler *compiler, + enum chip_class chip_class, enum radeon_family family, + enum ac_float_mode float_mode, unsigned wave_size, + unsigned ballot_mask_bits); + +void +ac_llvm_context_dispose(struct ac_llvm_context *ctx); + +int +ac_get_llvm_num_components(LLVMValueRef value); + +int +ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type); + +LLVMValueRef +ac_llvm_extract_elem(struct ac_llvm_context *ac, + LLVMValueRef value, + int index); + +unsigned ac_get_type_size(LLVMTypeRef type); + +LLVMTypeRef ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t); +LLVMValueRef ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v); +LLVMValueRef ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v); +LLVMTypeRef ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t); +LLVMValueRef ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v); + +LLVMValueRef +ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name, + LLVMTypeRef return_type, LLVMValueRef *params, + unsigned param_count, unsigned attrib_mask); + +void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize); + +LLVMValueRef +ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type, + unsigned count_incoming, LLVMValueRef *values, + LLVMBasicBlockRef *blocks); + +void ac_build_s_barrier(struct ac_llvm_context *ctx); +void ac_build_optimization_barrier(struct ac_llvm_context *ctx, + LLVMValueRef *pvgpr); + +LLVMValueRef ac_build_shader_clock(struct ac_llvm_context *ctx); + +LLVMValueRef ac_build_ballot(struct ac_llvm_context *ctx, LLVMValueRef value); +LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx, + LLVMValueRef value); + +LLVMValueRef ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value); + +LLVMValueRef ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value); + +LLVMValueRef ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value); + +LLVMValueRef +ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values, + unsigned value_count, unsigned component); + +LLVMValueRef +ac_build_gather_values_extended(struct ac_llvm_context *ctx, + LLVMValueRef *values, + unsigned value_count, + unsigned value_stride, + bool load, + bool always_vector); +LLVMValueRef +ac_build_gather_values(struct ac_llvm_context *ctx, + LLVMValueRef *values, + unsigned value_count); + +LLVMValueRef +ac_extract_components(struct ac_llvm_context *ctx, + LLVMValueRef value, + unsigned start, + unsigned channels); + +LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx, + LLVMValueRef value, + unsigned num_channels); +LLVMValueRef ac_build_round(struct ac_llvm_context *ctx, LLVMValueRef value); + +LLVMValueRef +ac_build_fdiv(struct ac_llvm_context *ctx, + LLVMValueRef num, + LLVMValueRef den); + +LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx, + LLVMValueRef num, + LLVMValueRef multiplier, + LLVMValueRef pre_shift, + LLVMValueRef post_shift, + LLVMValueRef increment); +LLVMValueRef 
ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx, + LLVMValueRef num, + LLVMValueRef multiplier, + LLVMValueRef pre_shift, + LLVMValueRef post_shift, + LLVMValueRef increment); +LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx, + LLVMValueRef num, + LLVMValueRef multiplier, + LLVMValueRef post_shift); + +void +ac_prepare_cube_coords(struct ac_llvm_context *ctx, + bool is_deriv, bool is_array, bool is_lod, + LLVMValueRef *coords_arg, + LLVMValueRef *derivs_arg); + + +LLVMValueRef +ac_build_fs_interp(struct ac_llvm_context *ctx, + LLVMValueRef llvm_chan, + LLVMValueRef attr_number, + LLVMValueRef params, + LLVMValueRef i, + LLVMValueRef j); + +LLVMValueRef +ac_build_fs_interp_f16(struct ac_llvm_context *ctx, + LLVMValueRef llvm_chan, + LLVMValueRef attr_number, + LLVMValueRef params, + LLVMValueRef i, + LLVMValueRef j); + +LLVMValueRef +ac_build_fs_interp_mov(struct ac_llvm_context *ctx, + LLVMValueRef parameter, + LLVMValueRef llvm_chan, + LLVMValueRef attr_number, + LLVMValueRef params); + +LLVMValueRef +ac_build_gep_ptr(struct ac_llvm_context *ctx, + LLVMValueRef base_ptr, + LLVMValueRef index); + +LLVMValueRef +ac_build_gep0(struct ac_llvm_context *ctx, + LLVMValueRef base_ptr, + LLVMValueRef index); +LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMValueRef ptr, + LLVMValueRef index); + +void +ac_build_indexed_store(struct ac_llvm_context *ctx, + LLVMValueRef base_ptr, LLVMValueRef index, + LLVMValueRef value); + +LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, + LLVMValueRef index); +LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx, + LLVMValueRef base_ptr, LLVMValueRef index); +LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx, + LLVMValueRef base_ptr, LLVMValueRef index); +LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx, + LLVMValueRef base_ptr, LLVMValueRef index); + +void +ac_build_buffer_store_dword(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef vdata, + unsigned num_channels, + LLVMValueRef voffset, + LLVMValueRef soffset, + unsigned inst_offset, + unsigned cache_policy, + bool swizzle_enable_hint); + +void +ac_build_buffer_store_format(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef data, + LLVMValueRef vindex, + LLVMValueRef voffset, + unsigned num_channels, + unsigned cache_policy); + +LLVMValueRef +ac_build_buffer_load(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + int num_channels, + LLVMValueRef vindex, + LLVMValueRef voffset, + LLVMValueRef soffset, + unsigned inst_offset, + unsigned cache_policy, + bool can_speculate, + bool allow_smem); + +LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef vindex, + LLVMValueRef voffset, + unsigned num_channels, + unsigned cache_policy, + bool can_speculate); + +LLVMValueRef +ac_build_tbuffer_load_short(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef voffset, + LLVMValueRef soffset, + LLVMValueRef immoffset, + unsigned cache_policy); + +LLVMValueRef +ac_build_tbuffer_load_byte(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef voffset, + LLVMValueRef soffset, + LLVMValueRef immoffset, + unsigned cache_policy); + +LLVMValueRef +ac_build_struct_tbuffer_load(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef vindex, + LLVMValueRef voffset, + LLVMValueRef soffset, + LLVMValueRef immoffset, + unsigned num_channels, + unsigned dfmt, + unsigned nfmt, + unsigned 
cache_policy, + bool can_speculate); + +LLVMValueRef +ac_build_raw_tbuffer_load(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef voffset, + LLVMValueRef soffset, + LLVMValueRef immoffset, + unsigned num_channels, + unsigned dfmt, + unsigned nfmt, + unsigned cache_policy, + bool can_speculate); + +/* For ac_build_fetch_format. + * + * Note: FLOAT must be 0 (used for convenience of encoding in radeonsi). + */ +enum { + AC_FETCH_FORMAT_FLOAT = 0, + AC_FETCH_FORMAT_FIXED, + AC_FETCH_FORMAT_UNORM, + AC_FETCH_FORMAT_SNORM, + AC_FETCH_FORMAT_USCALED, + AC_FETCH_FORMAT_SSCALED, + AC_FETCH_FORMAT_UINT, + AC_FETCH_FORMAT_SINT, +}; + +LLVMValueRef +ac_build_opencoded_load_format(struct ac_llvm_context *ctx, + unsigned log_size, + unsigned num_channels, + unsigned format, + bool reverse, + bool known_aligned, + LLVMValueRef rsrc, + LLVMValueRef vindex, + LLVMValueRef voffset, + LLVMValueRef soffset, + unsigned cache_policy, + bool can_speculate); + +void +ac_build_tbuffer_store_short(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef vdata, + LLVMValueRef voffset, + LLVMValueRef soffset, + unsigned cache_policy); + +void +ac_build_tbuffer_store_byte(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef vdata, + LLVMValueRef voffset, + LLVMValueRef soffset, + unsigned cache_policy); + +void +ac_build_struct_tbuffer_store(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef vdata, + LLVMValueRef vindex, + LLVMValueRef voffset, + LLVMValueRef soffset, + LLVMValueRef immoffset, + unsigned num_channels, + unsigned dfmt, + unsigned nfmt, + unsigned cache_policy); + +void +ac_build_raw_tbuffer_store(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef vdata, + LLVMValueRef voffset, + LLVMValueRef soffset, + LLVMValueRef immoffset, + unsigned num_channels, + unsigned dfmt, + unsigned nfmt, + unsigned cache_policy); + +LLVMValueRef +ac_get_thread_id(struct ac_llvm_context *ctx); + +#define AC_TID_MASK_TOP_LEFT 0xfffffffc +#define AC_TID_MASK_TOP 0xfffffffd +#define AC_TID_MASK_LEFT 0xfffffffe + +LLVMValueRef +ac_build_ddxy(struct ac_llvm_context *ctx, + uint32_t mask, + int idx, + LLVMValueRef val); + +#define AC_SENDMSG_GS 2 +#define AC_SENDMSG_GS_DONE 3 +#define AC_SENDMSG_GS_ALLOC_REQ 9 + +#define AC_SENDMSG_GS_OP_NOP (0 << 4) +#define AC_SENDMSG_GS_OP_CUT (1 << 4) +#define AC_SENDMSG_GS_OP_EMIT (2 << 4) +#define AC_SENDMSG_GS_OP_EMIT_CUT (3 << 4) + +void ac_build_sendmsg(struct ac_llvm_context *ctx, + uint32_t msg, + LLVMValueRef wave_id); + +LLVMValueRef ac_build_imsb(struct ac_llvm_context *ctx, + LLVMValueRef arg, + LLVMTypeRef dst_type); + +LLVMValueRef ac_build_umsb(struct ac_llvm_context *ctx, + LLVMValueRef arg, + LLVMTypeRef dst_type); +LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a, + LLVMValueRef b); +LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a, + LLVMValueRef b); +LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a, + LLVMValueRef b); +LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a, + LLVMValueRef b); +LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b); +LLVMValueRef ac_build_umax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b); +LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value); + +struct ac_export_args { + LLVMValueRef out[4]; + unsigned target; + unsigned enabled_channels; + bool compr; + bool done; + bool valid_mask; +}; + +void ac_build_export(struct 
ac_llvm_context *ctx, struct ac_export_args *a); + +void ac_build_export_null(struct ac_llvm_context *ctx); + +enum ac_image_opcode { + ac_image_sample, + ac_image_gather4, + ac_image_load, + ac_image_load_mip, + ac_image_store, + ac_image_store_mip, + ac_image_get_lod, + ac_image_get_resinfo, + ac_image_atomic, + ac_image_atomic_cmpswap, +}; + +enum ac_atomic_op { + ac_atomic_swap, + ac_atomic_add, + ac_atomic_sub, + ac_atomic_smin, + ac_atomic_umin, + ac_atomic_smax, + ac_atomic_umax, + ac_atomic_and, + ac_atomic_or, + ac_atomic_xor, + ac_atomic_inc_wrap, + ac_atomic_dec_wrap, +}; + +/* These cache policy bits match the definitions used by the LLVM intrinsics. */ +enum ac_image_cache_policy { + ac_glc = 1 << 0, /* per-CU cache control */ + ac_slc = 1 << 1, /* global L2 cache control */ + ac_dlc = 1 << 2, /* per-shader-array cache control */ +}; + +struct ac_image_args { + enum ac_image_opcode opcode : 4; + enum ac_atomic_op atomic : 4; /* for the ac_image_atomic opcode */ + enum ac_image_dim dim : 3; + unsigned dmask : 4; + unsigned cache_policy : 3; + bool unorm : 1; + bool level_zero : 1; + unsigned attributes; /* additional call-site specific AC_FUNC_ATTRs */ + + LLVMValueRef resource; + LLVMValueRef sampler; + LLVMValueRef data[2]; /* data[0] is source data (vector); data[1] is cmp for cmpswap */ + LLVMValueRef offset; + LLVMValueRef bias; + LLVMValueRef compare; + LLVMValueRef derivs[6]; + LLVMValueRef coords[4]; + LLVMValueRef lod; // also used by ac_image_get_resinfo +}; + +LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, + struct ac_image_args *a); +LLVMValueRef ac_build_image_get_sample_count(struct ac_llvm_context *ctx, + LLVMValueRef rsrc); +LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx, + LLVMValueRef args[2]); +LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx, + LLVMValueRef args[2]); +LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx, + LLVMValueRef args[2]); +LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx, + LLVMValueRef args[2], unsigned bits, bool hi); +LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx, + LLVMValueRef args[2], unsigned bits, bool hi); +LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1); +void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1); +LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input, + LLVMValueRef offset, LLVMValueRef width, + bool is_signed); +LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0, + LLVMValueRef s1, LLVMValueRef s2); +LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0, + LLVMValueRef s1, LLVMValueRef s2); + +void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags); + +LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0, + unsigned bitsize); + +LLVMValueRef ac_build_fmed3(struct ac_llvm_context *ctx, LLVMValueRef src0, + LLVMValueRef src1, LLVMValueRef src2, + unsigned bitsize); + +LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0, + unsigned bitsize); + +LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src0, + unsigned bitsize); + +LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0); + +LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx, + LLVMValueRef src0); + +void ac_optimize_vs_outputs(struct ac_llvm_context *ac, + LLVMValueRef main_fn, + uint8_t *vs_output_param_offset, + uint32_t num_outputs, + 
uint8_t *num_param_exports); +void ac_init_exec_full_mask(struct ac_llvm_context *ctx); + +void ac_declare_lds_as_pointer(struct ac_llvm_context *ac); +LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx, + LLVMValueRef dw_addr); +void ac_lds_store(struct ac_llvm_context *ctx, + LLVMValueRef dw_addr, LLVMValueRef value); + +LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx, + LLVMTypeRef dst_type, + LLVMValueRef src0); + +LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type); +LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type); + +void ac_build_bgnloop(struct ac_llvm_context *ctx, int lable_id); +void ac_build_break(struct ac_llvm_context *ctx); +void ac_build_continue(struct ac_llvm_context *ctx); +void ac_build_else(struct ac_llvm_context *ctx, int lable_id); +void ac_build_endif(struct ac_llvm_context *ctx, int lable_id); +void ac_build_endloop(struct ac_llvm_context *ctx, int lable_id); +void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id); +void ac_build_if(struct ac_llvm_context *ctx, LLVMValueRef value, + int lable_id); +void ac_build_uif(struct ac_llvm_context *ctx, LLVMValueRef value, + int lable_id); + +LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac, LLVMTypeRef type, + const char *name); +LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type, + const char *name); + +LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr, + LLVMTypeRef type); + +LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value, + unsigned count); + +LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param, + unsigned rshift, unsigned bitwidth); + +void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask, + LLVMValueRef *addr, bool is_array_tex); + +LLVMValueRef +ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask); + +LLVMValueRef +ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane); + +LLVMValueRef +ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value, LLVMValueRef lane); + +LLVMValueRef +ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask); + +LLVMValueRef +ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op); + +LLVMValueRef +ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op); + +LLVMValueRef +ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, unsigned cluster_size); + +/** + * Common arguments for a scan/reduce operation that accumulates per-wave + * values across an entire workgroup, while respecting the order of waves. + */ +struct ac_wg_scan { + bool enable_reduce; + bool enable_exclusive; + bool enable_inclusive; + nir_op op; + LLVMValueRef src; /* clobbered! 
*/ + LLVMValueRef result_reduce; + LLVMValueRef result_exclusive; + LLVMValueRef result_inclusive; + LLVMValueRef extra; + LLVMValueRef waveidx; + LLVMValueRef numwaves; /* only needed for "reduce" operations */ + + /* T addrspace(LDS) pointer to the same type as value, at least maxwaves entries */ + LLVMValueRef scratch; + unsigned maxwaves; +}; + +void +ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws); +void +ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws); +void +ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws); + +void +ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws); +void +ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws); +void +ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws); + +LLVMValueRef +ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, + unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3); + +LLVMValueRef +ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index); + +LLVMValueRef +ac_build_frexp_exp(struct ac_llvm_context *ctx, LLVMValueRef src0, + unsigned bitsize); + +LLVMValueRef +ac_build_frexp_mant(struct ac_llvm_context *ctx, LLVMValueRef src0, + unsigned bitsize); + +LLVMValueRef +ac_build_ddxy_interp(struct ac_llvm_context *ctx, LLVMValueRef interp_ij); + +LLVMValueRef +ac_build_load_helper_invocation(struct ac_llvm_context *ctx); + +LLVMValueRef ac_build_call(struct ac_llvm_context *ctx, LLVMValueRef func, + LLVMValueRef *args, unsigned num_args); + +LLVMValueRef ac_build_atomic_rmw(struct ac_llvm_context *ctx, LLVMAtomicRMWBinOp op, + LLVMValueRef ptr, LLVMValueRef val, + const char *sync_scope); + +LLVMValueRef ac_build_atomic_cmp_xchg(struct ac_llvm_context *ctx, LLVMValueRef ptr, + LLVMValueRef cmp, LLVMValueRef val, + const char *sync_scope); + +void +ac_export_mrt_z(struct ac_llvm_context *ctx, LLVMValueRef depth, + LLVMValueRef stencil, LLVMValueRef samplemask, + struct ac_export_args *args); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/amd/llvm/ac_llvm_cull.c b/src/amd/llvm/ac_llvm_cull.c new file mode 100644 index 00000000000..1c2da3e0418 --- /dev/null +++ b/src/amd/llvm/ac_llvm_cull.c @@ -0,0 +1,275 @@ +/* + * Copyright 2019 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + */ + +#include "ac_llvm_cull.h" +#include <llvm-c/Core.h> + +struct ac_position_w_info { + /* If a primitive intersects the W=0 plane, it causes a reflection + * of the determinant used for face culling. Every vertex behind + * the W=0 plane negates the determinant, so having 2 vertices behind + * the plane has no effect. This is i1 true if the determinant should be + * negated. + */ + LLVMValueRef w_reflection; + + /* If we simplify the "-w <= p <= w" view culling equation, we get + * "-w <= w", which can't be satisfied when w is negative. + * In perspective projection, a negative W means that the primitive + * is behind the viewer, but the equation is independent of the type + * of projection. + * + * w_accepted is false when all W are negative and therefore + * the primitive is invisible. + */ + LLVMValueRef w_accepted; + + LLVMValueRef all_w_positive; + LLVMValueRef any_w_negative; +}; + +static void ac_analyze_position_w(struct ac_llvm_context *ctx, + LLVMValueRef pos[3][4], + struct ac_position_w_info *w) +{ + LLVMBuilderRef builder = ctx->builder; + LLVMValueRef all_w_negative = ctx->i1true; + + w->w_reflection = ctx->i1false; + w->any_w_negative = ctx->i1false; + + for (unsigned i = 0; i < 3; i++) { + LLVMValueRef neg_w; + + neg_w = LLVMBuildFCmp(builder, LLVMRealOLT, pos[i][3], ctx->f32_0, ""); + /* If neg_w is true, negate w_reflection. */ + w->w_reflection = LLVMBuildXor(builder, w->w_reflection, neg_w, ""); + w->any_w_negative = LLVMBuildOr(builder, w->any_w_negative, neg_w, ""); + all_w_negative = LLVMBuildAnd(builder, all_w_negative, neg_w, ""); + } + w->all_w_positive = LLVMBuildNot(builder, w->any_w_negative, ""); + w->w_accepted = LLVMBuildNot(builder, all_w_negative, ""); +} + +/* Perform front/back face culling and return true if the primitive is accepted. */ +static LLVMValueRef ac_cull_face(struct ac_llvm_context *ctx, + LLVMValueRef pos[3][4], + struct ac_position_w_info *w, + bool cull_front, + bool cull_back, + bool cull_zero_area) +{ + LLVMBuilderRef builder = ctx->builder; + + if (cull_front && cull_back) + return ctx->i1false; + + if (!cull_front && !cull_back && !cull_zero_area) + return ctx->i1true; + + /* Front/back face culling. Also if the determinant == 0, the triangle + * area is 0. + */ + LLVMValueRef det_t0 = LLVMBuildFSub(builder, pos[2][0], pos[0][0], ""); + LLVMValueRef det_t1 = LLVMBuildFSub(builder, pos[1][1], pos[0][1], ""); + LLVMValueRef det_t2 = LLVMBuildFSub(builder, pos[0][0], pos[1][0], ""); + LLVMValueRef det_t3 = LLVMBuildFSub(builder, pos[0][1], pos[2][1], ""); + LLVMValueRef det_p0 = LLVMBuildFMul(builder, det_t0, det_t1, ""); + LLVMValueRef det_p1 = LLVMBuildFMul(builder, det_t2, det_t3, ""); + LLVMValueRef det = LLVMBuildFSub(builder, det_p0, det_p1, ""); + + /* Negative W negates the determinant. */ + det = LLVMBuildSelect(builder, w->w_reflection, + LLVMBuildFNeg(builder, det, ""), + det, ""); + + LLVMValueRef accepted = NULL; + if (cull_front) { + LLVMRealPredicate cond = cull_zero_area ? LLVMRealOGT : LLVMRealOGE; + accepted = LLVMBuildFCmp(builder, cond, det, ctx->f32_0, ""); + } else if (cull_back) { + LLVMRealPredicate cond = cull_zero_area ? LLVMRealOLT : LLVMRealOLE; + accepted = LLVMBuildFCmp(builder, cond, det, ctx->f32_0, ""); + } else if (cull_zero_area) { + accepted = LLVMBuildFCmp(builder, LLVMRealONE, det, ctx->f32_0, ""); + } + return accepted; +} + +/* Perform view culling and small primitive elimination and return true + * if the primitive is accepted and initially_accepted == true. 
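+ *
+ * The view test below is a clip-space bounding-box check: for X and Y the
+ * primitive survives when bbox_max >= -1 and bbox_min <= 1; for Z the lower
+ * bound becomes 0 instead of -1 when use_halfz_clip_space is set, and the
+ * far-Z test keeps bbox_min <= 1. All of it is skipped when any W is
+ * negative, since the bounding box is not meaningful behind the viewer.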
*/ +static LLVMValueRef cull_bbox(struct ac_llvm_context *ctx, + LLVMValueRef pos[3][4], + LLVMValueRef initially_accepted, + struct ac_position_w_info *w, + LLVMValueRef vp_scale[2], + LLVMValueRef vp_translate[2], + LLVMValueRef small_prim_precision, + bool cull_view_xy, + bool cull_view_near_z, + bool cull_view_far_z, + bool cull_small_prims, + bool use_halfz_clip_space) +{ + LLVMBuilderRef builder = ctx->builder; + + if (!cull_view_xy && !cull_view_near_z && !cull_view_far_z && !cull_small_prims) + return ctx->i1true; + + /* Skip the culling if the primitive has already been rejected or + * if any W is negative. The bounding box culling doesn't work when + * W is negative. + */ + LLVMValueRef cond = LLVMBuildAnd(builder, initially_accepted, + w->all_w_positive, ""); + LLVMValueRef accepted_var = ac_build_alloca_undef(ctx, ctx->i1, ""); + LLVMBuildStore(builder, initially_accepted, accepted_var); + + ac_build_ifcc(ctx, cond, 10000000 /* does this matter? */); + { + LLVMValueRef bbox_min[3], bbox_max[3]; + LLVMValueRef accepted = initially_accepted; + + /* Compute the primitive bounding box for easy culling. */ + for (unsigned chan = 0; chan < 3; chan++) { + bbox_min[chan] = ac_build_fmin(ctx, pos[0][chan], pos[1][chan]); + bbox_min[chan] = ac_build_fmin(ctx, bbox_min[chan], pos[2][chan]); + + bbox_max[chan] = ac_build_fmax(ctx, pos[0][chan], pos[1][chan]); + bbox_max[chan] = ac_build_fmax(ctx, bbox_max[chan], pos[2][chan]); + } + + /* View culling. */ + if (cull_view_xy || cull_view_near_z || cull_view_far_z) { + for (unsigned chan = 0; chan < 3; chan++) { + LLVMValueRef visible; + + if ((cull_view_xy && chan <= 1) || + (cull_view_near_z && chan == 2)) { + float t = chan == 2 && use_halfz_clip_space ? 0 : -1; + visible = LLVMBuildFCmp(builder, LLVMRealOGE, bbox_max[chan], + LLVMConstReal(ctx->f32, t), ""); + accepted = LLVMBuildAnd(builder, accepted, visible, ""); + } + + if ((cull_view_xy && chan <= 1) || + (cull_view_far_z && chan == 2)) { + visible = LLVMBuildFCmp(builder, LLVMRealOLE, bbox_min[chan], + ctx->f32_1, ""); + accepted = LLVMBuildAnd(builder, accepted, visible, ""); + } + } + } + + /* Small primitive elimination. */ + if (cull_small_prims) { + /* Assuming a sample position at (0.5, 0.5), if we round + * the bounding box min/max extents and the results of + * the rounding are equal in either the X or Y direction, + * the bounding box does not intersect the sample. + * + * See these GDC slides for pictures: + * https://frostbite-wp-prd.s3.amazonaws.com/wp-content/uploads/2016/03/29204330/GDC_2016_Compute.pdf + */ + LLVMValueRef min, max, not_equal[2], visible; + + for (unsigned chan = 0; chan < 2; chan++) { + /* Convert the position to screen-space coordinates. */ + min = ac_build_fmad(ctx, bbox_min[chan], + vp_scale[chan], vp_translate[chan]); + max = ac_build_fmad(ctx, bbox_max[chan], + vp_scale[chan], vp_translate[chan]); + /* Scale the bounding box according to the precision of + * the rasterizer and the number of MSAA samples. */ + min = LLVMBuildFSub(builder, min, small_prim_precision, ""); + max = LLVMBuildFAdd(builder, max, small_prim_precision, ""); + + /* Determine if the bbox intersects the sample point. + * It also works for MSAA, but vp_scale, vp_translate, + * and small_prim_precision are computed differently. 
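+ *
+ * Illustrative numbers: with samples at pixel centers, a padded X extent of
+ * [10.30, 10.45] rounds to 10 == 10, so no sample column is covered and the
+ * primitive can be rejected; an extent of [10.30, 10.55] rounds to 10 != 11,
+ * so the X axis alone does not reject it.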
+ */ + min = ac_build_round(ctx, min); + max = ac_build_round(ctx, max); + not_equal[chan] = LLVMBuildFCmp(builder, LLVMRealONE, min, max, ""); + } + visible = LLVMBuildAnd(builder, not_equal[0], not_equal[1], ""); + accepted = LLVMBuildAnd(builder, accepted, visible, ""); + } + + LLVMBuildStore(builder, accepted, accepted_var); + } + ac_build_endif(ctx, 10000000); + + return LLVMBuildLoad(builder, accepted_var, ""); +} + +/** + * Return i1 true if the primitive is accepted (not culled). + * + * \param pos Vertex positions 3x vec4 + * \param initially_accepted AND'ed with the result. Some computations can be + * skipped if this is false. + * \param vp_scale Viewport scale XY. + * For MSAA, multiply them by the number of samples. + * \param vp_translate Viewport translation XY. + * For MSAA, multiply them by the number of samples. + * \param small_prim_precision Precision of small primitive culling. This should + * be the same as or greater than the precision of + * the rasterizer. Set to num_samples / 2^subpixel_bits. + * subpixel_bits are defined by the quantization mode. + * \param options See ac_cull_options. + */ +LLVMValueRef ac_cull_triangle(struct ac_llvm_context *ctx, + LLVMValueRef pos[3][4], + LLVMValueRef initially_accepted, + LLVMValueRef vp_scale[2], + LLVMValueRef vp_translate[2], + LLVMValueRef small_prim_precision, + struct ac_cull_options *options) +{ + struct ac_position_w_info w; + ac_analyze_position_w(ctx, pos, &w); + + /* W culling. */ + LLVMValueRef accepted = options->cull_w ? w.w_accepted : ctx->i1true; + accepted = LLVMBuildAnd(ctx->builder, accepted, initially_accepted, ""); + + /* Face culling. */ + accepted = LLVMBuildAnd(ctx->builder, accepted, + ac_cull_face(ctx, pos, &w, + options->cull_front, + options->cull_back, + options->cull_zero_area), ""); + + /* View culling and small primitive elimination. */ + accepted = cull_bbox(ctx, pos, accepted, &w, vp_scale, vp_translate, + small_prim_precision, + options->cull_view_xy, + options->cull_view_near_z, + options->cull_view_far_z, + options->cull_small_prims, + options->use_halfz_clip_space); + return accepted; +} diff --git a/src/amd/llvm/ac_llvm_cull.h b/src/amd/llvm/ac_llvm_cull.h new file mode 100644 index 00000000000..0aa6c902a68 --- /dev/null +++ b/src/amd/llvm/ac_llvm_cull.h @@ -0,0 +1,59 @@ +/* + * Copyright 2019 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + */ + +#ifndef AC_LLVM_CULL_H +#define AC_LLVM_CULL_H + +#include "ac_llvm_build.h" + +struct ac_cull_options { + /* In general, I recommend setting all to true except view Z culling, + * which isn't so effective because W culling is cheaper and partially + * replaces near Z culling, and you don't need to set Position.z + * if Z culling is disabled. + * + * If something doesn't work, turn some of these off to find out what. + */ + bool cull_front; + bool cull_back; + bool cull_view_xy; + bool cull_view_near_z; + bool cull_view_far_z; + bool cull_small_prims; + bool cull_zero_area; + bool cull_w; /* cull primitives with all W < 0 */ + + bool use_halfz_clip_space; +}; + +LLVMValueRef ac_cull_triangle(struct ac_llvm_context *ctx, + LLVMValueRef pos[3][4], + LLVMValueRef initially_accepted, + LLVMValueRef vp_scale[2], + LLVMValueRef vp_translate[2], + LLVMValueRef small_prim_precision, + struct ac_cull_options *options); + +#endif diff --git a/src/amd/llvm/ac_llvm_helper.cpp b/src/amd/llvm/ac_llvm_helper.cpp new file mode 100644 index 00000000000..b7a72ee3fdd --- /dev/null +++ b/src/amd/llvm/ac_llvm_helper.cpp @@ -0,0 +1,282 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + */ + +#include <cstring> + +#include "ac_binary.h" +#include "ac_llvm_util.h" +#include "ac_llvm_build.h" + +#include "util/macros.h" + +#include <llvm-c/Core.h> +#include <llvm/Target/TargetMachine.h> +#include <llvm/IR/IRBuilder.h> +#include <llvm/Analysis/TargetLibraryInfo.h> +#include <llvm/Transforms/IPO.h> + +#include <llvm/IR/LegacyPassManager.h> + +void ac_add_attr_dereferenceable(LLVMValueRef val, uint64_t bytes) +{ + llvm::Argument *A = llvm::unwrap<llvm::Argument>(val); + A->addAttr(llvm::Attribute::getWithDereferenceableBytes(A->getContext(), bytes)); +} + +bool ac_is_sgpr_param(LLVMValueRef arg) +{ + llvm::Argument *A = llvm::unwrap<llvm::Argument>(arg); + llvm::AttributeList AS = A->getParent()->getAttributes(); + unsigned ArgNo = A->getArgNo(); + return AS.hasAttribute(ArgNo + 1, llvm::Attribute::InReg); +} + +LLVMValueRef ac_llvm_get_called_value(LLVMValueRef call) +{ + return LLVMGetCalledValue(call); +} + +bool ac_llvm_is_function(LLVMValueRef v) +{ + return LLVMGetValueKind(v) == LLVMFunctionValueKind; +} + +LLVMModuleRef ac_create_module(LLVMTargetMachineRef tm, LLVMContextRef ctx) +{ + llvm::TargetMachine *TM = reinterpret_cast<llvm::TargetMachine*>(tm); + LLVMModuleRef module = LLVMModuleCreateWithNameInContext("mesa-shader", ctx); + + llvm::unwrap(module)->setTargetTriple(TM->getTargetTriple().getTriple()); + llvm::unwrap(module)->setDataLayout(TM->createDataLayout()); + return module; +} + +LLVMBuilderRef ac_create_builder(LLVMContextRef ctx, + enum ac_float_mode float_mode) +{ + LLVMBuilderRef builder = LLVMCreateBuilderInContext(ctx); + + llvm::FastMathFlags flags; + + switch (float_mode) { + case AC_FLOAT_MODE_DEFAULT: + break; + case AC_FLOAT_MODE_NO_SIGNED_ZEROS_FP_MATH: + flags.setNoSignedZeros(); + llvm::unwrap(builder)->setFastMathFlags(flags); + break; + case AC_FLOAT_MODE_UNSAFE_FP_MATH: + flags.setFast(); + llvm::unwrap(builder)->setFastMathFlags(flags); + break; + } + + return builder; +} + +LLVMTargetLibraryInfoRef +ac_create_target_library_info(const char *triple) +{ + return reinterpret_cast<LLVMTargetLibraryInfoRef>(new llvm::TargetLibraryInfoImpl(llvm::Triple(triple))); +} + +void +ac_dispose_target_library_info(LLVMTargetLibraryInfoRef library_info) +{ + delete reinterpret_cast<llvm::TargetLibraryInfoImpl *>(library_info); +} + +/* Implementation of raw_pwrite_stream that works on malloc()ed memory for + * better compatibility with C code. 
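 *
 * Editor's sketch of the intended use (illustration only, not part of the
 * patch): the codegen passes stream object code into the buffer through
 * write_impl(), and take() hands the malloc()ed memory to the C caller,
 * which must eventually free() it:
 *
 *   raw_memory_ostream os;
 *   // ... run the codegen passes, writing into os ...
 *   char *elf; size_t size;
 *   os.take(elf, size);
 *   // ... use the ELF image ...
 *   free(elf);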
*/ +struct raw_memory_ostream : public llvm::raw_pwrite_stream { + char *buffer; + size_t written; + size_t bufsize; + + raw_memory_ostream() + { + buffer = NULL; + written = 0; + bufsize = 0; + SetUnbuffered(); + } + + ~raw_memory_ostream() + { + free(buffer); + } + + void clear() + { + written = 0; + } + + void take(char *&out_buffer, size_t &out_size) + { + out_buffer = buffer; + out_size = written; + buffer = NULL; + written = 0; + bufsize = 0; + } + + void flush() = delete; + + void write_impl(const char *ptr, size_t size) override + { + if (unlikely(written + size < written)) + abort(); + if (written + size > bufsize) { + bufsize = MAX3(1024, written + size, bufsize / 3 * 4); + buffer = (char *)realloc(buffer, bufsize); + if (!buffer) { + fprintf(stderr, "amd: out of memory allocating ELF buffer\n"); + abort(); + } + } + memcpy(buffer + written, ptr, size); + written += size; + } + + void pwrite_impl(const char *ptr, size_t size, uint64_t offset) override + { + assert(offset == (size_t)offset && + offset + size >= offset && offset + size <= written); + memcpy(buffer + offset, ptr, size); + } + + uint64_t current_pos() const override + { + return written; + } +}; + +/* The LLVM compiler is represented as a pass manager containing passes for + * optimizations, instruction selection, and code generation. + */ +struct ac_compiler_passes { + raw_memory_ostream ostream; /* ELF shader binary stream */ + llvm::legacy::PassManager passmgr; /* list of passes */ +}; + +struct ac_compiler_passes *ac_create_llvm_passes(LLVMTargetMachineRef tm) +{ + struct ac_compiler_passes *p = new ac_compiler_passes(); + if (!p) + return NULL; + + llvm::TargetMachine *TM = reinterpret_cast<llvm::TargetMachine*>(tm); + + if (TM->addPassesToEmitFile(p->passmgr, p->ostream, + nullptr, + llvm::TargetMachine::CGFT_ObjectFile)) { + fprintf(stderr, "amd: TargetMachine can't emit a file of this type!\n"); + delete p; + return NULL; + } + return p; +} + +void ac_destroy_llvm_passes(struct ac_compiler_passes *p) +{ + delete p; +} + +/* This returns false on failure. 
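 *
 * (Editor's note: as written, the body below always returns true; any real
 *  failure would have to be reported by the passes themselves.)
 *
 * A usage sketch, assuming the caller owns and frees the returned buffer:
 *
 *   char *elf_buffer = NULL;
 *   size_t elf_size = 0;
 *   if (ac_compile_module_to_elf(passes, module, &elf_buffer, &elf_size)) {
 *      // ... parse the ELF, upload the code ...
 *      free(elf_buffer);
 *   }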
*/ +bool ac_compile_module_to_elf(struct ac_compiler_passes *p, LLVMModuleRef module, + char **pelf_buffer, size_t *pelf_size) +{ + p->passmgr.run(*llvm::unwrap(module)); + p->ostream.take(*pelf_buffer, *pelf_size); + return true; +} + +void ac_llvm_add_barrier_noop_pass(LLVMPassManagerRef passmgr) +{ + llvm::unwrap(passmgr)->add(llvm::createBarrierNoopPass()); +} + +void ac_enable_global_isel(LLVMTargetMachineRef tm) +{ + reinterpret_cast<llvm::TargetMachine*>(tm)->setGlobalISel(true); +} + +LLVMValueRef ac_build_atomic_rmw(struct ac_llvm_context *ctx, LLVMAtomicRMWBinOp op, + LLVMValueRef ptr, LLVMValueRef val, + const char *sync_scope) { + llvm::AtomicRMWInst::BinOp binop; + switch (op) { + case LLVMAtomicRMWBinOpXchg: + binop = llvm::AtomicRMWInst::Xchg; + break; + case LLVMAtomicRMWBinOpAdd: + binop = llvm::AtomicRMWInst::Add; + break; + case LLVMAtomicRMWBinOpSub: + binop = llvm::AtomicRMWInst::Sub; + break; + case LLVMAtomicRMWBinOpAnd: + binop = llvm::AtomicRMWInst::And; + break; + case LLVMAtomicRMWBinOpNand: + binop = llvm::AtomicRMWInst::Nand; + break; + case LLVMAtomicRMWBinOpOr: + binop = llvm::AtomicRMWInst::Or; + break; + case LLVMAtomicRMWBinOpXor: + binop = llvm::AtomicRMWInst::Xor; + break; + case LLVMAtomicRMWBinOpMax: + binop = llvm::AtomicRMWInst::Max; + break; + case LLVMAtomicRMWBinOpMin: + binop = llvm::AtomicRMWInst::Min; + break; + case LLVMAtomicRMWBinOpUMax: + binop = llvm::AtomicRMWInst::UMax; + break; + case LLVMAtomicRMWBinOpUMin: + binop = llvm::AtomicRMWInst::UMin; + break; + default: + unreachable(!"invalid LLVMAtomicRMWBinOp"); + break; + } + unsigned SSID = llvm::unwrap(ctx->context)->getOrInsertSyncScopeID(sync_scope); + return llvm::wrap(llvm::unwrap(ctx->builder)->CreateAtomicRMW( + binop, llvm::unwrap(ptr), llvm::unwrap(val), + llvm::AtomicOrdering::SequentiallyConsistent, SSID)); +} + +LLVMValueRef ac_build_atomic_cmp_xchg(struct ac_llvm_context *ctx, LLVMValueRef ptr, + LLVMValueRef cmp, LLVMValueRef val, + const char *sync_scope) { + unsigned SSID = llvm::unwrap(ctx->context)->getOrInsertSyncScopeID(sync_scope); + return llvm::wrap(llvm::unwrap(ctx->builder)->CreateAtomicCmpXchg( + llvm::unwrap(ptr), llvm::unwrap(cmp), llvm::unwrap(val), + llvm::AtomicOrdering::SequentiallyConsistent, + llvm::AtomicOrdering::SequentiallyConsistent, SSID)); +} diff --git a/src/amd/llvm/ac_llvm_util.c b/src/amd/llvm/ac_llvm_util.c new file mode 100644 index 00000000000..ddc8fee839b --- /dev/null +++ b/src/amd/llvm/ac_llvm_util.c @@ -0,0 +1,397 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + */ +/* based on pieces from si_pipe.c and radeon_llvm_emit.c */ +#include "ac_llvm_util.h" +#include "ac_llvm_build.h" +#include "util/bitscan.h" +#include <llvm-c/Core.h> +#include <llvm-c/Support.h> +#include <llvm-c/Transforms/IPO.h> +#include <llvm-c/Transforms/Scalar.h> +#include <llvm-c/Transforms/Utils.h> +#include "c11/threads.h" +#include "gallivm/lp_bld_misc.h" +#include "util/u_math.h" + +#include <assert.h> +#include <stdio.h> +#include <string.h> + +static void ac_init_llvm_target() +{ + LLVMInitializeAMDGPUTargetInfo(); + LLVMInitializeAMDGPUTarget(); + LLVMInitializeAMDGPUTargetMC(); + LLVMInitializeAMDGPUAsmPrinter(); + + /* For inline assembly. */ + LLVMInitializeAMDGPUAsmParser(); + + /* For ACO disassembly. */ + LLVMInitializeAMDGPUDisassembler(); + + /* Workaround for bug in llvm 4.0 that causes image intrinsics + * to disappear. + * https://reviews.llvm.org/D26348 + * + * "mesa" is the prefix for error messages. + * + * -global-isel-abort=2 is a no-op unless global isel has been enabled. + * This option tells the backend to fall-back to SelectionDAG and print + * a diagnostic message if global isel fails. + */ + const char *argv[] = { + "mesa", + "-simplifycfg-sink-common=false", + "-global-isel-abort=2", +#if LLVM_VERSION_MAJOR >= 10 + /* Atomic optimizations require LLVM 10.0 for gfx10 support. */ + "-amdgpu-atomic-optimizations=true", +#endif + }; + LLVMParseCommandLineOptions(ARRAY_SIZE(argv), argv, NULL); +} + +static once_flag ac_init_llvm_target_once_flag = ONCE_FLAG_INIT; + +void ac_init_llvm_once(void) +{ + call_once(&ac_init_llvm_target_once_flag, ac_init_llvm_target); +} + +static LLVMTargetRef ac_get_llvm_target(const char *triple) +{ + LLVMTargetRef target = NULL; + char *err_message = NULL; + + if (LLVMGetTargetFromTriple(triple, &target, &err_message)) { + fprintf(stderr, "Cannot find target for triple %s ", triple); + if (err_message) { + fprintf(stderr, "%s\n", err_message); + } + LLVMDisposeMessage(err_message); + return NULL; + } + return target; +} + +const char *ac_get_llvm_processor_name(enum radeon_family family) +{ + switch (family) { + case CHIP_TAHITI: + return "tahiti"; + case CHIP_PITCAIRN: + return "pitcairn"; + case CHIP_VERDE: + return "verde"; + case CHIP_OLAND: + return "oland"; + case CHIP_HAINAN: + return "hainan"; + case CHIP_BONAIRE: + return "bonaire"; + case CHIP_KABINI: + return "kabini"; + case CHIP_KAVERI: + return "kaveri"; + case CHIP_HAWAII: + return "hawaii"; + case CHIP_TONGA: + return "tonga"; + case CHIP_ICELAND: + return "iceland"; + case CHIP_CARRIZO: + return "carrizo"; + case CHIP_FIJI: + return "fiji"; + case CHIP_STONEY: + return "stoney"; + case CHIP_POLARIS10: + return "polaris10"; + case CHIP_POLARIS11: + case CHIP_POLARIS12: + case CHIP_VEGAM: + return "polaris11"; + case CHIP_VEGA10: + return "gfx900"; + case CHIP_RAVEN: + return "gfx902"; + case CHIP_VEGA12: + return "gfx904"; + case CHIP_VEGA20: + return "gfx906"; + case CHIP_RAVEN2: + case CHIP_RENOIR: + return "gfx909"; + case CHIP_ARCTURUS: + return "gfx908"; + case CHIP_NAVI10: + return "gfx1010"; + case CHIP_NAVI12: + return "gfx1011"; + case CHIP_NAVI14: + return "gfx1012"; + default: + return ""; + } +} + +static LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family, + enum ac_target_machine_options tm_options, + LLVMCodeGenOptLevel level, + const char 
**out_triple) +{ + assert(family >= CHIP_TAHITI); + char features[256]; + const char *triple = (tm_options & AC_TM_SUPPORTS_SPILL) ? "amdgcn-mesa-mesa3d" : "amdgcn--"; + LLVMTargetRef target = ac_get_llvm_target(triple); + + snprintf(features, sizeof(features), + "+DumpCode,-fp32-denormals,+fp64-denormals%s%s%s%s%s%s", + family >= CHIP_NAVI10 && !(tm_options & AC_TM_WAVE32) ? + ",+wavefrontsize64,-wavefrontsize32" : "", + tm_options & AC_TM_SISCHED ? ",+si-scheduler" : "", + tm_options & AC_TM_FORCE_ENABLE_XNACK ? ",+xnack" : "", + tm_options & AC_TM_FORCE_DISABLE_XNACK ? ",-xnack" : "", + tm_options & AC_TM_PROMOTE_ALLOCA_TO_SCRATCH ? ",-promote-alloca" : "", + tm_options & AC_TM_NO_LOAD_STORE_OPT ? ",-load-store-opt" : ""); + + LLVMTargetMachineRef tm = LLVMCreateTargetMachine( + target, + triple, + ac_get_llvm_processor_name(family), + features, + level, + LLVMRelocDefault, + LLVMCodeModelDefault); + + if (out_triple) + *out_triple = triple; + if (tm_options & AC_TM_ENABLE_GLOBAL_ISEL) + ac_enable_global_isel(tm); + return tm; +} + +static LLVMPassManagerRef ac_create_passmgr(LLVMTargetLibraryInfoRef target_library_info, + bool check_ir) +{ + LLVMPassManagerRef passmgr = LLVMCreatePassManager(); + if (!passmgr) + return NULL; + + if (target_library_info) + LLVMAddTargetLibraryInfo(target_library_info, + passmgr); + + if (check_ir) + LLVMAddVerifierPass(passmgr); + LLVMAddAlwaysInlinerPass(passmgr); + /* Normally, the pass manager runs all passes on one function before + * moving onto another. Adding a barrier no-op pass forces the pass + * manager to run the inliner on all functions first, which makes sure + * that the following passes are only run on the remaining non-inline + * function, so it removes useless work done on dead inline functions. + */ + ac_llvm_add_barrier_noop_pass(passmgr); + /* This pass should eliminate all the load and store instructions. */ + LLVMAddPromoteMemoryToRegisterPass(passmgr); + LLVMAddScalarReplAggregatesPass(passmgr); + LLVMAddLICMPass(passmgr); + LLVMAddAggressiveDCEPass(passmgr); + LLVMAddCFGSimplificationPass(passmgr); + /* This is recommended by the instruction combining pass. 
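 *
 * (Editor's summary of the pipeline assembled above, for orientation:
 *  verifier (if check_ir) -> always-inline -> barrier no-op -> mem2reg ->
 *  SROA -> LICM -> ADCE -> simplify-cfg -> early-CSE (MemorySSA) ->
 *  instruction combining.)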
*/ + LLVMAddEarlyCSEMemSSAPass(passmgr); + LLVMAddInstructionCombiningPass(passmgr); + return passmgr; +} + +static const char *attr_to_str(enum ac_func_attr attr) +{ + switch (attr) { + case AC_FUNC_ATTR_ALWAYSINLINE: return "alwaysinline"; + case AC_FUNC_ATTR_INREG: return "inreg"; + case AC_FUNC_ATTR_NOALIAS: return "noalias"; + case AC_FUNC_ATTR_NOUNWIND: return "nounwind"; + case AC_FUNC_ATTR_READNONE: return "readnone"; + case AC_FUNC_ATTR_READONLY: return "readonly"; + case AC_FUNC_ATTR_WRITEONLY: return "writeonly"; + case AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY: return "inaccessiblememonly"; + case AC_FUNC_ATTR_CONVERGENT: return "convergent"; + default: + fprintf(stderr, "Unhandled function attribute: %x\n", attr); + return 0; + } +} + +void +ac_add_function_attr(LLVMContextRef ctx, LLVMValueRef function, + int attr_idx, enum ac_func_attr attr) +{ + const char *attr_name = attr_to_str(attr); + unsigned kind_id = LLVMGetEnumAttributeKindForName(attr_name, + strlen(attr_name)); + LLVMAttributeRef llvm_attr = LLVMCreateEnumAttribute(ctx, kind_id, 0); + + if (LLVMIsAFunction(function)) + LLVMAddAttributeAtIndex(function, attr_idx, llvm_attr); + else + LLVMAddCallSiteAttribute(function, attr_idx, llvm_attr); +} + +void ac_add_func_attributes(LLVMContextRef ctx, LLVMValueRef function, + unsigned attrib_mask) +{ + attrib_mask |= AC_FUNC_ATTR_NOUNWIND; + attrib_mask &= ~AC_FUNC_ATTR_LEGACY; + + while (attrib_mask) { + enum ac_func_attr attr = 1u << u_bit_scan(&attrib_mask); + ac_add_function_attr(ctx, function, -1, attr); + } +} + +void +ac_dump_module(LLVMModuleRef module) +{ + char *str = LLVMPrintModuleToString(module); + fprintf(stderr, "%s", str); + LLVMDisposeMessage(str); +} + +void +ac_llvm_add_target_dep_function_attr(LLVMValueRef F, + const char *name, unsigned value) +{ + char str[16]; + + snprintf(str, sizeof(str), "0x%x", value); + LLVMAddTargetDependentFunctionAttr(F, name, str); +} + +void ac_llvm_set_workgroup_size(LLVMValueRef F, unsigned size) +{ + if (!size) + return; + + char str[32]; + snprintf(str, sizeof(str), "%u,%u", size, size); + LLVMAddTargetDependentFunctionAttr(F, "amdgpu-flat-work-group-size", str); +} + +unsigned +ac_count_scratch_private_memory(LLVMValueRef function) +{ + unsigned private_mem_vgprs = 0; + + /* Process all LLVM instructions. */ + LLVMBasicBlockRef bb = LLVMGetFirstBasicBlock(function); + while (bb) { + LLVMValueRef next = LLVMGetFirstInstruction(bb); + + while (next) { + LLVMValueRef inst = next; + next = LLVMGetNextInstruction(next); + + if (LLVMGetInstructionOpcode(inst) != LLVMAlloca) + continue; + + LLVMTypeRef type = LLVMGetElementType(LLVMTypeOf(inst)); + /* No idea why LLVM aligns allocas to 4 elements. 
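 *
 * Worked example (editor's illustration): a float[7] alloca is 28 bytes,
 * i.e. 7 dwords; with the observed alignment of 4 the code below counts
 * align(7, 4) = 8 dwords of scratch for it.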
*/ + unsigned alignment = LLVMGetAlignment(inst); + unsigned dw_size = align(ac_get_type_size(type) / 4, alignment); + private_mem_vgprs += dw_size; + } + bb = LLVMGetNextBasicBlock(bb); + } + + return private_mem_vgprs; +} + +bool +ac_init_llvm_compiler(struct ac_llvm_compiler *compiler, + enum radeon_family family, + enum ac_target_machine_options tm_options) +{ + const char *triple; + memset(compiler, 0, sizeof(*compiler)); + + compiler->tm = ac_create_target_machine(family, tm_options, + LLVMCodeGenLevelDefault, + &triple); + if (!compiler->tm) + return false; + + if (tm_options & AC_TM_CREATE_LOW_OPT) { + compiler->low_opt_tm = + ac_create_target_machine(family, tm_options, + LLVMCodeGenLevelLess, NULL); + if (!compiler->low_opt_tm) + goto fail; + } + + if (family >= CHIP_NAVI10) { + assert(!(tm_options & AC_TM_CREATE_LOW_OPT)); + compiler->tm_wave32 = ac_create_target_machine(family, + tm_options | AC_TM_WAVE32, + LLVMCodeGenLevelDefault, + NULL); + if (!compiler->tm_wave32) + goto fail; + } + + compiler->target_library_info = + ac_create_target_library_info(triple); + if (!compiler->target_library_info) + goto fail; + + compiler->passmgr = ac_create_passmgr(compiler->target_library_info, + tm_options & AC_TM_CHECK_IR); + if (!compiler->passmgr) + goto fail; + + return true; +fail: + ac_destroy_llvm_compiler(compiler); + return false; +} + +void +ac_destroy_llvm_compiler(struct ac_llvm_compiler *compiler) +{ + ac_destroy_llvm_passes(compiler->passes); + ac_destroy_llvm_passes(compiler->passes_wave32); + ac_destroy_llvm_passes(compiler->low_opt_passes); + + if (compiler->passmgr) + LLVMDisposePassManager(compiler->passmgr); + if (compiler->target_library_info) + ac_dispose_target_library_info(compiler->target_library_info); + if (compiler->low_opt_tm) + LLVMDisposeTargetMachine(compiler->low_opt_tm); + if (compiler->tm) + LLVMDisposeTargetMachine(compiler->tm); + if (compiler->tm_wave32) + LLVMDisposeTargetMachine(compiler->tm_wave32); +} diff --git a/src/amd/llvm/ac_llvm_util.h b/src/amd/llvm/ac_llvm_util.h new file mode 100644 index 00000000000..60c9a17e447 --- /dev/null +++ b/src/amd/llvm/ac_llvm_util.h @@ -0,0 +1,163 @@ +/* + * Copyright 2016 Bas Nieuwenhuizen + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + */ + +#ifndef AC_LLVM_UTIL_H +#define AC_LLVM_UTIL_H + +#include <stdbool.h> +#include <llvm-c/TargetMachine.h> +#include <llvm/Config/llvm-config.h> + +#include "amd_family.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct ac_compiler_passes; + +enum ac_func_attr { + AC_FUNC_ATTR_ALWAYSINLINE = (1 << 0), + AC_FUNC_ATTR_INREG = (1 << 2), + AC_FUNC_ATTR_NOALIAS = (1 << 3), + AC_FUNC_ATTR_NOUNWIND = (1 << 4), + AC_FUNC_ATTR_READNONE = (1 << 5), + AC_FUNC_ATTR_READONLY = (1 << 6), + AC_FUNC_ATTR_WRITEONLY = (1 << 7), + AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY = (1 << 8), + AC_FUNC_ATTR_CONVERGENT = (1 << 9), + + /* Legacy intrinsic that needs attributes on function declarations + * and they must match the internal LLVM definition exactly, otherwise + * intrinsic selection fails. + */ + AC_FUNC_ATTR_LEGACY = (1u << 31), +}; + +enum ac_target_machine_options { + AC_TM_SUPPORTS_SPILL = (1 << 0), + AC_TM_SISCHED = (1 << 1), + AC_TM_FORCE_ENABLE_XNACK = (1 << 2), + AC_TM_FORCE_DISABLE_XNACK = (1 << 3), + AC_TM_PROMOTE_ALLOCA_TO_SCRATCH = (1 << 4), + AC_TM_CHECK_IR = (1 << 5), + AC_TM_ENABLE_GLOBAL_ISEL = (1 << 6), + AC_TM_CREATE_LOW_OPT = (1 << 7), + AC_TM_NO_LOAD_STORE_OPT = (1 << 8), + AC_TM_WAVE32 = (1 << 9), +}; + +enum ac_float_mode { + AC_FLOAT_MODE_DEFAULT, + AC_FLOAT_MODE_NO_SIGNED_ZEROS_FP_MATH, + AC_FLOAT_MODE_UNSAFE_FP_MATH, +}; + +/* Per-thread persistent LLVM objects. */ +struct ac_llvm_compiler { + LLVMTargetLibraryInfoRef target_library_info; + LLVMPassManagerRef passmgr; + + /* Default compiler. */ + LLVMTargetMachineRef tm; + struct ac_compiler_passes *passes; + + /* Wave32 compiler for GFX10. */ + LLVMTargetMachineRef tm_wave32; + struct ac_compiler_passes *passes_wave32; + + /* Optional compiler for faster compilation with fewer optimizations. + * LLVM modules can be created with "tm" too. There is no difference. + */ + LLVMTargetMachineRef low_opt_tm; /* uses -O1 instead of -O2 */ + struct ac_compiler_passes *low_opt_passes; +}; + +const char *ac_get_llvm_processor_name(enum radeon_family family); +void ac_add_attr_dereferenceable(LLVMValueRef val, uint64_t bytes); +bool ac_is_sgpr_param(LLVMValueRef param); +void ac_add_function_attr(LLVMContextRef ctx, LLVMValueRef function, + int attr_idx, enum ac_func_attr attr); +void ac_add_func_attributes(LLVMContextRef ctx, LLVMValueRef function, + unsigned attrib_mask); +void ac_dump_module(LLVMModuleRef module); + +LLVMValueRef ac_llvm_get_called_value(LLVMValueRef call); +bool ac_llvm_is_function(LLVMValueRef v); +LLVMModuleRef ac_create_module(LLVMTargetMachineRef tm, LLVMContextRef ctx); + +LLVMBuilderRef ac_create_builder(LLVMContextRef ctx, + enum ac_float_mode float_mode); + +void +ac_llvm_add_target_dep_function_attr(LLVMValueRef F, + const char *name, unsigned value); +void ac_llvm_set_workgroup_size(LLVMValueRef F, unsigned size); + +static inline unsigned +ac_get_load_intr_attribs(bool can_speculate) +{ + /* READNONE means writes can't affect it, while READONLY means that + * writes can affect it. */ + return can_speculate ? 
AC_FUNC_ATTR_READNONE : + AC_FUNC_ATTR_READONLY; +} + +unsigned +ac_count_scratch_private_memory(LLVMValueRef function); + +LLVMTargetLibraryInfoRef ac_create_target_library_info(const char *triple); +void ac_dispose_target_library_info(LLVMTargetLibraryInfoRef library_info); +void ac_init_llvm_once(void); + + +bool ac_init_llvm_compiler(struct ac_llvm_compiler *compiler, + enum radeon_family family, + enum ac_target_machine_options tm_options); +void ac_destroy_llvm_compiler(struct ac_llvm_compiler *compiler); + +struct ac_compiler_passes *ac_create_llvm_passes(LLVMTargetMachineRef tm); +void ac_destroy_llvm_passes(struct ac_compiler_passes *p); +bool ac_compile_module_to_elf(struct ac_compiler_passes *p, LLVMModuleRef module, + char **pelf_buffer, size_t *pelf_size); +void ac_llvm_add_barrier_noop_pass(LLVMPassManagerRef passmgr); +void ac_enable_global_isel(LLVMTargetMachineRef tm); + +static inline bool +ac_has_vec3_support(enum chip_class chip, bool use_format) +{ + if (chip == GFX6 && !use_format) { + /* GFX6 only supports vec3 with load/store format. */ + return false; + } + + return LLVM_VERSION_MAJOR >= 9; +} + +#ifdef __cplusplus +} +#endif + +#endif /* AC_LLVM_UTIL_H */ diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c new file mode 100644 index 00000000000..bb99c736f78 --- /dev/null +++ b/src/amd/llvm/ac_nir_to_llvm.c @@ -0,0 +1,4944 @@ +/* + * Copyright © 2016 Bas Nieuwenhuizen + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include <llvm/Config/llvm-config.h> + +#include "ac_nir_to_llvm.h" +#include "ac_llvm_build.h" +#include "ac_llvm_util.h" +#include "ac_binary.h" +#include "sid.h" +#include "nir/nir.h" +#include "nir/nir_deref.h" +#include "util/bitscan.h" +#include "util/u_math.h" +#include "ac_shader_abi.h" +#include "ac_shader_util.h" + +struct ac_nir_context { + struct ac_llvm_context ac; + struct ac_shader_abi *abi; + + gl_shader_stage stage; + shader_info *info; + + LLVMValueRef *ssa_defs; + + LLVMValueRef scratch; + LLVMValueRef constant_data; + + struct hash_table *defs; + struct hash_table *phis; + struct hash_table *vars; + + LLVMValueRef main_function; + LLVMBasicBlockRef continue_block; + LLVMBasicBlockRef break_block; + + int num_locals; + LLVMValueRef *locals; +}; + +static LLVMValueRef get_sampler_desc(struct ac_nir_context *ctx, + nir_deref_instr *deref_instr, + enum ac_descriptor_type desc_type, + const nir_instr *instr, + bool image, bool write); + +static void +build_store_values_extended(struct ac_llvm_context *ac, + LLVMValueRef *values, + unsigned value_count, + unsigned value_stride, + LLVMValueRef vec) +{ + LLVMBuilderRef builder = ac->builder; + unsigned i; + + for (i = 0; i < value_count; i++) { + LLVMValueRef ptr = values[i * value_stride]; + LLVMValueRef index = LLVMConstInt(ac->i32, i, false); + LLVMValueRef value = LLVMBuildExtractElement(builder, vec, index, ""); + LLVMBuildStore(builder, value, ptr); + } +} + +static LLVMTypeRef get_def_type(struct ac_nir_context *ctx, + const nir_ssa_def *def) +{ + LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, def->bit_size); + if (def->num_components > 1) { + type = LLVMVectorType(type, def->num_components); + } + return type; +} + +static LLVMValueRef get_src(struct ac_nir_context *nir, nir_src src) +{ + assert(src.is_ssa); + return nir->ssa_defs[src.ssa->index]; +} + +static LLVMValueRef +get_memory_ptr(struct ac_nir_context *ctx, nir_src src) +{ + LLVMValueRef ptr = get_src(ctx, src); + ptr = LLVMBuildGEP(ctx->ac.builder, ctx->ac.lds, &ptr, 1, ""); + int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr)); + + return LLVMBuildBitCast(ctx->ac.builder, ptr, + LLVMPointerType(ctx->ac.i32, addr_space), ""); +} + +static LLVMBasicBlockRef get_block(struct ac_nir_context *nir, + const struct nir_block *b) +{ + struct hash_entry *entry = _mesa_hash_table_search(nir->defs, b); + return (LLVMBasicBlockRef)entry->data; +} + +static LLVMValueRef get_alu_src(struct ac_nir_context *ctx, + nir_alu_src src, + unsigned num_components) +{ + LLVMValueRef value = get_src(ctx, src.src); + bool need_swizzle = false; + + assert(value); + unsigned src_components = ac_get_llvm_num_components(value); + for (unsigned i = 0; i < num_components; ++i) { + assert(src.swizzle[i] < src_components); + if (src.swizzle[i] != i) + need_swizzle = true; + } + + if (need_swizzle || num_components != src_components) { + LLVMValueRef masks[] = { + LLVMConstInt(ctx->ac.i32, src.swizzle[0], false), + LLVMConstInt(ctx->ac.i32, src.swizzle[1], false), + LLVMConstInt(ctx->ac.i32, src.swizzle[2], false), + LLVMConstInt(ctx->ac.i32, src.swizzle[3], false)}; + + if (src_components > 1 && num_components == 1) { + value = LLVMBuildExtractElement(ctx->ac.builder, value, + masks[0], ""); + } else if (src_components == 1 && num_components > 1) { + LLVMValueRef values[] = {value, value, value, value}; + value = ac_build_gather_values(&ctx->ac, values, num_components); + } else { + LLVMValueRef swizzle = LLVMConstVector(masks, num_components); + value = 
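				/* Editor's note: general case, the source has several
				 * components and the swizzle is non-trivial, so a
				 * shufflevector picks src.swizzle[0..num_components-1]
				 * out of the source vector. */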
LLVMBuildShuffleVector(ctx->ac.builder, value, value, + swizzle, ""); + } + } + assert(!src.negate); + assert(!src.abs); + return value; +} + +static LLVMValueRef emit_int_cmp(struct ac_llvm_context *ctx, + LLVMIntPredicate pred, LLVMValueRef src0, + LLVMValueRef src1) +{ + LLVMValueRef result = LLVMBuildICmp(ctx->builder, pred, src0, src1, ""); + return LLVMBuildSelect(ctx->builder, result, + LLVMConstInt(ctx->i32, 0xFFFFFFFF, false), + ctx->i32_0, ""); +} + +static LLVMValueRef emit_float_cmp(struct ac_llvm_context *ctx, + LLVMRealPredicate pred, LLVMValueRef src0, + LLVMValueRef src1) +{ + LLVMValueRef result; + src0 = ac_to_float(ctx, src0); + src1 = ac_to_float(ctx, src1); + result = LLVMBuildFCmp(ctx->builder, pred, src0, src1, ""); + return LLVMBuildSelect(ctx->builder, result, + LLVMConstInt(ctx->i32, 0xFFFFFFFF, false), + ctx->i32_0, ""); +} + +static LLVMValueRef emit_intrin_1f_param(struct ac_llvm_context *ctx, + const char *intrin, + LLVMTypeRef result_type, + LLVMValueRef src0) +{ + char name[64]; + LLVMValueRef params[] = { + ac_to_float(ctx, src0), + }; + + ASSERTED const int length = snprintf(name, sizeof(name), "%s.f%d", intrin, + ac_get_elem_bits(ctx, result_type)); + assert(length < sizeof(name)); + return ac_build_intrinsic(ctx, name, result_type, params, 1, AC_FUNC_ATTR_READNONE); +} + +static LLVMValueRef emit_intrin_2f_param(struct ac_llvm_context *ctx, + const char *intrin, + LLVMTypeRef result_type, + LLVMValueRef src0, LLVMValueRef src1) +{ + char name[64]; + LLVMValueRef params[] = { + ac_to_float(ctx, src0), + ac_to_float(ctx, src1), + }; + + ASSERTED const int length = snprintf(name, sizeof(name), "%s.f%d", intrin, + ac_get_elem_bits(ctx, result_type)); + assert(length < sizeof(name)); + return ac_build_intrinsic(ctx, name, result_type, params, 2, AC_FUNC_ATTR_READNONE); +} + +static LLVMValueRef emit_intrin_3f_param(struct ac_llvm_context *ctx, + const char *intrin, + LLVMTypeRef result_type, + LLVMValueRef src0, LLVMValueRef src1, LLVMValueRef src2) +{ + char name[64]; + LLVMValueRef params[] = { + ac_to_float(ctx, src0), + ac_to_float(ctx, src1), + ac_to_float(ctx, src2), + }; + + ASSERTED const int length = snprintf(name, sizeof(name), "%s.f%d", intrin, + ac_get_elem_bits(ctx, result_type)); + assert(length < sizeof(name)); + return ac_build_intrinsic(ctx, name, result_type, params, 3, AC_FUNC_ATTR_READNONE); +} + +static LLVMValueRef emit_bcsel(struct ac_llvm_context *ctx, + LLVMValueRef src0, LLVMValueRef src1, LLVMValueRef src2) +{ + assert(LLVMGetTypeKind(LLVMTypeOf(src0)) != LLVMVectorTypeKind); + + LLVMValueRef v = LLVMBuildICmp(ctx->builder, LLVMIntNE, src0, + ctx->i32_0, ""); + return LLVMBuildSelect(ctx->builder, v, + ac_to_integer_or_pointer(ctx, src1), + ac_to_integer_or_pointer(ctx, src2), ""); +} + +static LLVMValueRef emit_iabs(struct ac_llvm_context *ctx, + LLVMValueRef src0) +{ + return ac_build_imax(ctx, src0, LLVMBuildNeg(ctx->builder, src0, "")); +} + +static LLVMValueRef emit_uint_carry(struct ac_llvm_context *ctx, + const char *intrin, + LLVMValueRef src0, LLVMValueRef src1) +{ + LLVMTypeRef ret_type; + LLVMTypeRef types[] = { ctx->i32, ctx->i1 }; + LLVMValueRef res; + LLVMValueRef params[] = { src0, src1 }; + ret_type = LLVMStructTypeInContext(ctx->context, types, + 2, true); + + res = ac_build_intrinsic(ctx, intrin, ret_type, + params, 2, AC_FUNC_ATTR_READNONE); + + res = LLVMBuildExtractValue(ctx->builder, res, 1, ""); + res = LLVMBuildZExt(ctx->builder, res, ctx->i32, ""); + return res; +} + +static LLVMValueRef emit_b2f(struct 
ac_llvm_context *ctx, + LLVMValueRef src0, + unsigned bitsize) +{ + LLVMValueRef result = LLVMBuildAnd(ctx->builder, src0, + LLVMBuildBitCast(ctx->builder, LLVMConstReal(ctx->f32, 1.0), ctx->i32, ""), + ""); + result = LLVMBuildBitCast(ctx->builder, result, ctx->f32, ""); + + switch (bitsize) { + case 16: + return LLVMBuildFPTrunc(ctx->builder, result, ctx->f16, ""); + case 32: + return result; + case 64: + return LLVMBuildFPExt(ctx->builder, result, ctx->f64, ""); + default: + unreachable("Unsupported bit size."); + } +} + +static LLVMValueRef emit_f2b(struct ac_llvm_context *ctx, + LLVMValueRef src0) +{ + src0 = ac_to_float(ctx, src0); + LLVMValueRef zero = LLVMConstNull(LLVMTypeOf(src0)); + return LLVMBuildSExt(ctx->builder, + LLVMBuildFCmp(ctx->builder, LLVMRealUNE, src0, zero, ""), + ctx->i32, ""); +} + +static LLVMValueRef emit_b2i(struct ac_llvm_context *ctx, + LLVMValueRef src0, + unsigned bitsize) +{ + LLVMValueRef result = LLVMBuildAnd(ctx->builder, src0, ctx->i32_1, ""); + + switch (bitsize) { + case 8: + return LLVMBuildTrunc(ctx->builder, result, ctx->i8, ""); + case 16: + return LLVMBuildTrunc(ctx->builder, result, ctx->i16, ""); + case 32: + return result; + case 64: + return LLVMBuildZExt(ctx->builder, result, ctx->i64, ""); + default: + unreachable("Unsupported bit size."); + } +} + +static LLVMValueRef emit_i2b(struct ac_llvm_context *ctx, + LLVMValueRef src0) +{ + LLVMValueRef zero = LLVMConstNull(LLVMTypeOf(src0)); + return LLVMBuildSExt(ctx->builder, + LLVMBuildICmp(ctx->builder, LLVMIntNE, src0, zero, ""), + ctx->i32, ""); +} + +static LLVMValueRef emit_f2f16(struct ac_llvm_context *ctx, + LLVMValueRef src0) +{ + LLVMValueRef result; + LLVMValueRef cond = NULL; + + src0 = ac_to_float(ctx, src0); + result = LLVMBuildFPTrunc(ctx->builder, src0, ctx->f16, ""); + + if (ctx->chip_class >= GFX8) { + LLVMValueRef args[2]; + /* Check if the result is a denormal - and flush to 0 if so. */ + args[0] = result; + args[1] = LLVMConstInt(ctx->i32, N_SUBNORMAL | P_SUBNORMAL, false); + cond = ac_build_intrinsic(ctx, "llvm.amdgcn.class.f16", ctx->i1, args, 2, AC_FUNC_ATTR_READNONE); + } + + /* need to convert back up to f32 */ + result = LLVMBuildFPExt(ctx->builder, result, ctx->f32, ""); + + if (ctx->chip_class >= GFX8) + result = LLVMBuildSelect(ctx->builder, cond, ctx->f32_0, result, ""); + else { + /* for GFX6-GFX7 */ + /* 0x38800000 is smallest half float value (2^-14) in 32-bit float, + * so compare the result and flush to 0 if it's smaller. 
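 *
 * (Editor's example: 2^-14 is roughly 6.1e-5, so an input such as 1e-5 is
 *  a denormal in half precision and must be flushed to zero here, while an
 *  exact 0.0 is left alone thanks to the extra != 0 check below.)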
+ */ + LLVMValueRef temp, cond2; + temp = emit_intrin_1f_param(ctx, "llvm.fabs", ctx->f32, result); + cond = LLVMBuildFCmp(ctx->builder, LLVMRealUGT, + LLVMBuildBitCast(ctx->builder, LLVMConstInt(ctx->i32, 0x38800000, false), ctx->f32, ""), + temp, ""); + cond2 = LLVMBuildFCmp(ctx->builder, LLVMRealUNE, + temp, ctx->f32_0, ""); + cond = LLVMBuildAnd(ctx->builder, cond, cond2, ""); + result = LLVMBuildSelect(ctx->builder, cond, ctx->f32_0, result, ""); + } + return result; +} + +static LLVMValueRef emit_umul_high(struct ac_llvm_context *ctx, + LLVMValueRef src0, LLVMValueRef src1) +{ + LLVMValueRef dst64, result; + src0 = LLVMBuildZExt(ctx->builder, src0, ctx->i64, ""); + src1 = LLVMBuildZExt(ctx->builder, src1, ctx->i64, ""); + + dst64 = LLVMBuildMul(ctx->builder, src0, src1, ""); + dst64 = LLVMBuildLShr(ctx->builder, dst64, LLVMConstInt(ctx->i64, 32, false), ""); + result = LLVMBuildTrunc(ctx->builder, dst64, ctx->i32, ""); + return result; +} + +static LLVMValueRef emit_imul_high(struct ac_llvm_context *ctx, + LLVMValueRef src0, LLVMValueRef src1) +{ + LLVMValueRef dst64, result; + src0 = LLVMBuildSExt(ctx->builder, src0, ctx->i64, ""); + src1 = LLVMBuildSExt(ctx->builder, src1, ctx->i64, ""); + + dst64 = LLVMBuildMul(ctx->builder, src0, src1, ""); + dst64 = LLVMBuildAShr(ctx->builder, dst64, LLVMConstInt(ctx->i64, 32, false), ""); + result = LLVMBuildTrunc(ctx->builder, dst64, ctx->i32, ""); + return result; +} + +static LLVMValueRef emit_bfm(struct ac_llvm_context *ctx, + LLVMValueRef bits, LLVMValueRef offset) +{ + /* mask = ((1 << bits) - 1) << offset */ + return LLVMBuildShl(ctx->builder, + LLVMBuildSub(ctx->builder, + LLVMBuildShl(ctx->builder, + ctx->i32_1, + bits, ""), + ctx->i32_1, ""), + offset, ""); +} + +static LLVMValueRef emit_bitfield_select(struct ac_llvm_context *ctx, + LLVMValueRef mask, LLVMValueRef insert, + LLVMValueRef base) +{ + /* Calculate: + * (mask & insert) | (~mask & base) = base ^ (mask & (insert ^ base)) + * Use the right-hand side, which the LLVM backend can convert to V_BFI. + */ + return LLVMBuildXor(ctx->builder, base, + LLVMBuildAnd(ctx->builder, mask, + LLVMBuildXor(ctx->builder, insert, base, ""), ""), ""); +} + +static LLVMValueRef emit_pack_2x16(struct ac_llvm_context *ctx, + LLVMValueRef src0, + LLVMValueRef (*pack)(struct ac_llvm_context *ctx, + LLVMValueRef args[2])) +{ + LLVMValueRef comp[2]; + + src0 = ac_to_float(ctx, src0); + comp[0] = LLVMBuildExtractElement(ctx->builder, src0, ctx->i32_0, ""); + comp[1] = LLVMBuildExtractElement(ctx->builder, src0, ctx->i32_1, ""); + + return LLVMBuildBitCast(ctx->builder, pack(ctx, comp), ctx->i32, ""); +} + +static LLVMValueRef emit_unpack_half_2x16(struct ac_llvm_context *ctx, + LLVMValueRef src0) +{ + LLVMValueRef const16 = LLVMConstInt(ctx->i32, 16, false); + LLVMValueRef temps[2], val; + int i; + + for (i = 0; i < 2; i++) { + val = i == 1 ? LLVMBuildLShr(ctx->builder, src0, const16, "") : src0; + val = LLVMBuildTrunc(ctx->builder, val, ctx->i16, ""); + val = LLVMBuildBitCast(ctx->builder, val, ctx->f16, ""); + temps[i] = LLVMBuildFPExt(ctx->builder, val, ctx->f32, ""); + } + return ac_build_gather_values(ctx, temps, 2); +} + +static LLVMValueRef emit_ddxy(struct ac_nir_context *ctx, + nir_op op, + LLVMValueRef src0) +{ + unsigned mask; + int idx; + LLVMValueRef result; + + if (op == nir_op_fddx_fine) + mask = AC_TID_MASK_LEFT; + else if (op == nir_op_fddy_fine) + mask = AC_TID_MASK_TOP; + else + mask = AC_TID_MASK_TOP_LEFT; + + /* for DDX we want to next X pixel, DDY next Y pixel. 
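 *
 * (Editor's note: within a 2x2 pixel quad the lanes are laid out as
 *      0 1
 *      2 3
 *  so stepping by 1 lane moves one pixel in X and stepping by 2 lanes moves
 *  one pixel in Y, which is what idx = 1 / idx = 2 select below.)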
*/ + if (op == nir_op_fddx_fine || + op == nir_op_fddx_coarse || + op == nir_op_fddx) + idx = 1; + else + idx = 2; + + result = ac_build_ddxy(&ctx->ac, mask, idx, src0); + return result; +} + +static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) +{ + LLVMValueRef src[4], result = NULL; + unsigned num_components = instr->dest.dest.ssa.num_components; + unsigned src_components; + LLVMTypeRef def_type = get_def_type(ctx, &instr->dest.dest.ssa); + + assert(nir_op_infos[instr->op].num_inputs <= ARRAY_SIZE(src)); + switch (instr->op) { + case nir_op_vec2: + case nir_op_vec3: + case nir_op_vec4: + src_components = 1; + break; + case nir_op_pack_half_2x16: + case nir_op_pack_snorm_2x16: + case nir_op_pack_unorm_2x16: + src_components = 2; + break; + case nir_op_unpack_half_2x16: + src_components = 1; + break; + case nir_op_cube_face_coord: + case nir_op_cube_face_index: + src_components = 3; + break; + default: + src_components = num_components; + break; + } + for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) + src[i] = get_alu_src(ctx, instr->src[i], src_components); + + switch (instr->op) { + case nir_op_mov: + result = src[0]; + break; + case nir_op_fneg: + src[0] = ac_to_float(&ctx->ac, src[0]); + result = LLVMBuildFNeg(ctx->ac.builder, src[0], ""); + break; + case nir_op_ineg: + result = LLVMBuildNeg(ctx->ac.builder, src[0], ""); + break; + case nir_op_inot: + result = LLVMBuildNot(ctx->ac.builder, src[0], ""); + break; + case nir_op_iadd: + result = LLVMBuildAdd(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_fadd: + src[0] = ac_to_float(&ctx->ac, src[0]); + src[1] = ac_to_float(&ctx->ac, src[1]); + result = LLVMBuildFAdd(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_fsub: + src[0] = ac_to_float(&ctx->ac, src[0]); + src[1] = ac_to_float(&ctx->ac, src[1]); + result = LLVMBuildFSub(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_isub: + result = LLVMBuildSub(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_imul: + result = LLVMBuildMul(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_imod: + result = LLVMBuildSRem(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_umod: + result = LLVMBuildURem(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_irem: + result = LLVMBuildSRem(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_idiv: + result = LLVMBuildSDiv(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_udiv: + result = LLVMBuildUDiv(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_fmul: + src[0] = ac_to_float(&ctx->ac, src[0]); + src[1] = ac_to_float(&ctx->ac, src[1]); + result = LLVMBuildFMul(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_frcp: + src[0] = ac_to_float(&ctx->ac, src[0]); + result = ac_build_fdiv(&ctx->ac, LLVMConstReal(LLVMTypeOf(src[0]), 1.0), src[0]); + break; + case nir_op_iand: + result = LLVMBuildAnd(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_ior: + result = LLVMBuildOr(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_ixor: + result = LLVMBuildXor(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_ishl: + if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) < ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0]))) + src[1] = LLVMBuildZExt(ctx->ac.builder, src[1], + LLVMTypeOf(src[0]), ""); + else if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) > ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0]))) + src[1] = LLVMBuildTrunc(ctx->ac.builder, src[1], + LLVMTypeOf(src[0]), ""); 
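		/* Editor's note: NIR lets the shift amount have a different bit size
		 * than the shifted value, but LLVM requires both operands of a shift
		 * to have the same type, hence the zext/trunc of src[1] above.
		 */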
+ result = LLVMBuildShl(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_ishr: + if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) < ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0]))) + src[1] = LLVMBuildZExt(ctx->ac.builder, src[1], + LLVMTypeOf(src[0]), ""); + else if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) > ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0]))) + src[1] = LLVMBuildTrunc(ctx->ac.builder, src[1], + LLVMTypeOf(src[0]), ""); + result = LLVMBuildAShr(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_ushr: + if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) < ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0]))) + src[1] = LLVMBuildZExt(ctx->ac.builder, src[1], + LLVMTypeOf(src[0]), ""); + else if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) > ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0]))) + src[1] = LLVMBuildTrunc(ctx->ac.builder, src[1], + LLVMTypeOf(src[0]), ""); + result = LLVMBuildLShr(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_ilt32: + result = emit_int_cmp(&ctx->ac, LLVMIntSLT, src[0], src[1]); + break; + case nir_op_ine32: + result = emit_int_cmp(&ctx->ac, LLVMIntNE, src[0], src[1]); + break; + case nir_op_ieq32: + result = emit_int_cmp(&ctx->ac, LLVMIntEQ, src[0], src[1]); + break; + case nir_op_ige32: + result = emit_int_cmp(&ctx->ac, LLVMIntSGE, src[0], src[1]); + break; + case nir_op_ult32: + result = emit_int_cmp(&ctx->ac, LLVMIntULT, src[0], src[1]); + break; + case nir_op_uge32: + result = emit_int_cmp(&ctx->ac, LLVMIntUGE, src[0], src[1]); + break; + case nir_op_feq32: + result = emit_float_cmp(&ctx->ac, LLVMRealOEQ, src[0], src[1]); + break; + case nir_op_fne32: + result = emit_float_cmp(&ctx->ac, LLVMRealUNE, src[0], src[1]); + break; + case nir_op_flt32: + result = emit_float_cmp(&ctx->ac, LLVMRealOLT, src[0], src[1]); + break; + case nir_op_fge32: + result = emit_float_cmp(&ctx->ac, LLVMRealOGE, src[0], src[1]); + break; + case nir_op_fabs: + result = emit_intrin_1f_param(&ctx->ac, "llvm.fabs", + ac_to_float_type(&ctx->ac, def_type), src[0]); + break; + case nir_op_iabs: + result = emit_iabs(&ctx->ac, src[0]); + break; + case nir_op_imax: + result = ac_build_imax(&ctx->ac, src[0], src[1]); + break; + case nir_op_imin: + result = ac_build_imin(&ctx->ac, src[0], src[1]); + break; + case nir_op_umax: + result = ac_build_umax(&ctx->ac, src[0], src[1]); + break; + case nir_op_umin: + result = ac_build_umin(&ctx->ac, src[0], src[1]); + break; + case nir_op_isign: + result = ac_build_isign(&ctx->ac, src[0], + instr->dest.dest.ssa.bit_size); + break; + case nir_op_fsign: + src[0] = ac_to_float(&ctx->ac, src[0]); + result = ac_build_fsign(&ctx->ac, src[0], + instr->dest.dest.ssa.bit_size); + break; + case nir_op_ffloor: + result = emit_intrin_1f_param(&ctx->ac, "llvm.floor", + ac_to_float_type(&ctx->ac, def_type), src[0]); + break; + case nir_op_ftrunc: + result = emit_intrin_1f_param(&ctx->ac, "llvm.trunc", + ac_to_float_type(&ctx->ac, def_type), src[0]); + break; + case nir_op_fceil: + result = emit_intrin_1f_param(&ctx->ac, "llvm.ceil", + ac_to_float_type(&ctx->ac, def_type), src[0]); + break; + case nir_op_fround_even: + result = emit_intrin_1f_param(&ctx->ac, "llvm.rint", + ac_to_float_type(&ctx->ac, def_type),src[0]); + break; + case nir_op_ffract: + src[0] = ac_to_float(&ctx->ac, src[0]); + result = ac_build_fract(&ctx->ac, src[0], + instr->dest.dest.ssa.bit_size); + break; + case nir_op_fsin: + result = emit_intrin_1f_param(&ctx->ac, "llvm.sin", + ac_to_float_type(&ctx->ac, def_type), src[0]); + break; + case 
nir_op_fcos: + result = emit_intrin_1f_param(&ctx->ac, "llvm.cos", + ac_to_float_type(&ctx->ac, def_type), src[0]); + break; + case nir_op_fsqrt: + result = emit_intrin_1f_param(&ctx->ac, "llvm.sqrt", + ac_to_float_type(&ctx->ac, def_type), src[0]); + break; + case nir_op_fexp2: + result = emit_intrin_1f_param(&ctx->ac, "llvm.exp2", + ac_to_float_type(&ctx->ac, def_type), src[0]); + break; + case nir_op_flog2: + result = emit_intrin_1f_param(&ctx->ac, "llvm.log2", + ac_to_float_type(&ctx->ac, def_type), src[0]); + break; + case nir_op_frsq: + result = emit_intrin_1f_param(&ctx->ac, "llvm.sqrt", + ac_to_float_type(&ctx->ac, def_type), src[0]); + result = ac_build_fdiv(&ctx->ac, LLVMConstReal(LLVMTypeOf(result), 1.0), result); + break; + case nir_op_frexp_exp: + src[0] = ac_to_float(&ctx->ac, src[0]); + result = ac_build_frexp_exp(&ctx->ac, src[0], + ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0]))); + if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) == 16) + result = LLVMBuildSExt(ctx->ac.builder, result, + ctx->ac.i32, ""); + break; + case nir_op_frexp_sig: + src[0] = ac_to_float(&ctx->ac, src[0]); + result = ac_build_frexp_mant(&ctx->ac, src[0], + instr->dest.dest.ssa.bit_size); + break; + case nir_op_fpow: + result = emit_intrin_2f_param(&ctx->ac, "llvm.pow", + ac_to_float_type(&ctx->ac, def_type), src[0], src[1]); + break; + case nir_op_fmax: + result = emit_intrin_2f_param(&ctx->ac, "llvm.maxnum", + ac_to_float_type(&ctx->ac, def_type), src[0], src[1]); + if (ctx->ac.chip_class < GFX9 && + instr->dest.dest.ssa.bit_size == 32) { + /* Only pre-GFX9 chips do not flush denorms. */ + result = emit_intrin_1f_param(&ctx->ac, "llvm.canonicalize", + ac_to_float_type(&ctx->ac, def_type), + result); + } + break; + case nir_op_fmin: + result = emit_intrin_2f_param(&ctx->ac, "llvm.minnum", + ac_to_float_type(&ctx->ac, def_type), src[0], src[1]); + if (ctx->ac.chip_class < GFX9 && + instr->dest.dest.ssa.bit_size == 32) { + /* Only pre-GFX9 chips do not flush denorms. */ + result = emit_intrin_1f_param(&ctx->ac, "llvm.canonicalize", + ac_to_float_type(&ctx->ac, def_type), + result); + } + break; + case nir_op_ffma: + /* FMA is better on GFX10, because it has FMA units instead of MUL-ADD units. */ + result = emit_intrin_3f_param(&ctx->ac, ctx->ac.chip_class >= GFX10 ? 
"llvm.fma" : "llvm.fmuladd", + ac_to_float_type(&ctx->ac, def_type), src[0], src[1], src[2]); + break; + case nir_op_ldexp: + src[0] = ac_to_float(&ctx->ac, src[0]); + if (ac_get_elem_bits(&ctx->ac, def_type) == 32) + result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ldexp.f32", ctx->ac.f32, src, 2, AC_FUNC_ATTR_READNONE); + else if (ac_get_elem_bits(&ctx->ac, def_type) == 16) + result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ldexp.f16", ctx->ac.f16, src, 2, AC_FUNC_ATTR_READNONE); + else + result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ldexp.f64", ctx->ac.f64, src, 2, AC_FUNC_ATTR_READNONE); + break; + case nir_op_bfm: + result = emit_bfm(&ctx->ac, src[0], src[1]); + break; + case nir_op_bitfield_select: + result = emit_bitfield_select(&ctx->ac, src[0], src[1], src[2]); + break; + case nir_op_ubfe: + result = ac_build_bfe(&ctx->ac, src[0], src[1], src[2], false); + break; + case nir_op_ibfe: + result = ac_build_bfe(&ctx->ac, src[0], src[1], src[2], true); + break; + case nir_op_bitfield_reverse: + result = ac_build_bitfield_reverse(&ctx->ac, src[0]); + break; + case nir_op_bit_count: + result = ac_build_bit_count(&ctx->ac, src[0]); + break; + case nir_op_vec2: + case nir_op_vec3: + case nir_op_vec4: + for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) + src[i] = ac_to_integer(&ctx->ac, src[i]); + result = ac_build_gather_values(&ctx->ac, src, num_components); + break; + case nir_op_f2i8: + case nir_op_f2i16: + case nir_op_f2i32: + case nir_op_f2i64: + src[0] = ac_to_float(&ctx->ac, src[0]); + result = LLVMBuildFPToSI(ctx->ac.builder, src[0], def_type, ""); + break; + case nir_op_f2u8: + case nir_op_f2u16: + case nir_op_f2u32: + case nir_op_f2u64: + src[0] = ac_to_float(&ctx->ac, src[0]); + result = LLVMBuildFPToUI(ctx->ac.builder, src[0], def_type, ""); + break; + case nir_op_i2f16: + case nir_op_i2f32: + case nir_op_i2f64: + result = LLVMBuildSIToFP(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), ""); + break; + case nir_op_u2f16: + case nir_op_u2f32: + case nir_op_u2f64: + result = LLVMBuildUIToFP(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), ""); + break; + case nir_op_f2f16_rtz: + src[0] = ac_to_float(&ctx->ac, src[0]); + if (LLVMTypeOf(src[0]) == ctx->ac.f64) + src[0] = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ctx->ac.f32, ""); + LLVMValueRef param[2] = { src[0], ctx->ac.f32_0 }; + result = ac_build_cvt_pkrtz_f16(&ctx->ac, param); + result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, ""); + break; + case nir_op_f2f16_rtne: + case nir_op_f2f16: + case nir_op_f2f32: + case nir_op_f2f64: + src[0] = ac_to_float(&ctx->ac, src[0]); + if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type)) + result = LLVMBuildFPExt(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), ""); + else + result = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), ""); + break; + case nir_op_u2u8: + case nir_op_u2u16: + case nir_op_u2u32: + case nir_op_u2u64: + if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type)) + result = LLVMBuildZExt(ctx->ac.builder, src[0], def_type, ""); + else + result = LLVMBuildTrunc(ctx->ac.builder, src[0], def_type, ""); + break; + case nir_op_i2i8: + case nir_op_i2i16: + case nir_op_i2i32: + case nir_op_i2i64: + if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type)) + result = LLVMBuildSExt(ctx->ac.builder, src[0], def_type, ""); + else + result = 
LLVMBuildTrunc(ctx->ac.builder, src[0], def_type, ""); + break; + case nir_op_b32csel: + result = emit_bcsel(&ctx->ac, src[0], src[1], src[2]); + break; + case nir_op_find_lsb: + result = ac_find_lsb(&ctx->ac, ctx->ac.i32, src[0]); + break; + case nir_op_ufind_msb: + result = ac_build_umsb(&ctx->ac, src[0], ctx->ac.i32); + break; + case nir_op_ifind_msb: + result = ac_build_imsb(&ctx->ac, src[0], ctx->ac.i32); + break; + case nir_op_uadd_carry: + result = emit_uint_carry(&ctx->ac, "llvm.uadd.with.overflow.i32", src[0], src[1]); + break; + case nir_op_usub_borrow: + result = emit_uint_carry(&ctx->ac, "llvm.usub.with.overflow.i32", src[0], src[1]); + break; + case nir_op_b2f16: + case nir_op_b2f32: + case nir_op_b2f64: + result = emit_b2f(&ctx->ac, src[0], instr->dest.dest.ssa.bit_size); + break; + case nir_op_f2b32: + result = emit_f2b(&ctx->ac, src[0]); + break; + case nir_op_b2i8: + case nir_op_b2i16: + case nir_op_b2i32: + case nir_op_b2i64: + result = emit_b2i(&ctx->ac, src[0], instr->dest.dest.ssa.bit_size); + break; + case nir_op_i2b32: + result = emit_i2b(&ctx->ac, src[0]); + break; + case nir_op_fquantize2f16: + result = emit_f2f16(&ctx->ac, src[0]); + break; + case nir_op_umul_high: + result = emit_umul_high(&ctx->ac, src[0], src[1]); + break; + case nir_op_imul_high: + result = emit_imul_high(&ctx->ac, src[0], src[1]); + break; + case nir_op_pack_half_2x16: + result = emit_pack_2x16(&ctx->ac, src[0], ac_build_cvt_pkrtz_f16); + break; + case nir_op_pack_snorm_2x16: + result = emit_pack_2x16(&ctx->ac, src[0], ac_build_cvt_pknorm_i16); + break; + case nir_op_pack_unorm_2x16: + result = emit_pack_2x16(&ctx->ac, src[0], ac_build_cvt_pknorm_u16); + break; + case nir_op_unpack_half_2x16: + result = emit_unpack_half_2x16(&ctx->ac, src[0]); + break; + case nir_op_fddx: + case nir_op_fddy: + case nir_op_fddx_fine: + case nir_op_fddy_fine: + case nir_op_fddx_coarse: + case nir_op_fddy_coarse: + result = emit_ddxy(ctx, instr->op, src[0]); + break; + + case nir_op_unpack_64_2x32_split_x: { + assert(ac_get_llvm_num_components(src[0]) == 1); + LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0], + ctx->ac.v2i32, + ""); + result = LLVMBuildExtractElement(ctx->ac.builder, tmp, + ctx->ac.i32_0, ""); + break; + } + + case nir_op_unpack_64_2x32_split_y: { + assert(ac_get_llvm_num_components(src[0]) == 1); + LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0], + ctx->ac.v2i32, + ""); + result = LLVMBuildExtractElement(ctx->ac.builder, tmp, + ctx->ac.i32_1, ""); + break; + } + + case nir_op_pack_64_2x32_split: { + LLVMValueRef tmp = ac_build_gather_values(&ctx->ac, src, 2); + result = LLVMBuildBitCast(ctx->ac.builder, tmp, ctx->ac.i64, ""); + break; + } + + case nir_op_pack_32_2x16_split: { + LLVMValueRef tmp = ac_build_gather_values(&ctx->ac, src, 2); + result = LLVMBuildBitCast(ctx->ac.builder, tmp, ctx->ac.i32, ""); + break; + } + + case nir_op_unpack_32_2x16_split_x: { + LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0], + ctx->ac.v2i16, + ""); + result = LLVMBuildExtractElement(ctx->ac.builder, tmp, + ctx->ac.i32_0, ""); + break; + } + + case nir_op_unpack_32_2x16_split_y: { + LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0], + ctx->ac.v2i16, + ""); + result = LLVMBuildExtractElement(ctx->ac.builder, tmp, + ctx->ac.i32_1, ""); + break; + } + + case nir_op_cube_face_coord: { + src[0] = ac_to_float(&ctx->ac, src[0]); + LLVMValueRef results[2]; + LLVMValueRef in[3]; + for (unsigned chan = 0; chan < 3; chan++) + in[chan] = ac_llvm_extract_elem(&ctx->ac, src[0], 
chan); + results[0] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubesc", + ctx->ac.f32, in, 3, AC_FUNC_ATTR_READNONE); + results[1] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubetc", + ctx->ac.f32, in, 3, AC_FUNC_ATTR_READNONE); + LLVMValueRef ma = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubema", + ctx->ac.f32, in, 3, AC_FUNC_ATTR_READNONE); + results[0] = ac_build_fdiv(&ctx->ac, results[0], ma); + results[1] = ac_build_fdiv(&ctx->ac, results[1], ma); + LLVMValueRef offset = LLVMConstReal(ctx->ac.f32, 0.5); + results[0] = LLVMBuildFAdd(ctx->ac.builder, results[0], offset, ""); + results[1] = LLVMBuildFAdd(ctx->ac.builder, results[1], offset, ""); + result = ac_build_gather_values(&ctx->ac, results, 2); + break; + } + + case nir_op_cube_face_index: { + src[0] = ac_to_float(&ctx->ac, src[0]); + LLVMValueRef in[3]; + for (unsigned chan = 0; chan < 3; chan++) + in[chan] = ac_llvm_extract_elem(&ctx->ac, src[0], chan); + result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubeid", + ctx->ac.f32, in, 3, AC_FUNC_ATTR_READNONE); + break; + } + + case nir_op_fmin3: + result = emit_intrin_2f_param(&ctx->ac, "llvm.minnum", + ac_to_float_type(&ctx->ac, def_type), src[0], src[1]); + result = emit_intrin_2f_param(&ctx->ac, "llvm.minnum", + ac_to_float_type(&ctx->ac, def_type), result, src[2]); + break; + case nir_op_umin3: + result = ac_build_umin(&ctx->ac, src[0], src[1]); + result = ac_build_umin(&ctx->ac, result, src[2]); + break; + case nir_op_imin3: + result = ac_build_imin(&ctx->ac, src[0], src[1]); + result = ac_build_imin(&ctx->ac, result, src[2]); + break; + case nir_op_fmax3: + result = emit_intrin_2f_param(&ctx->ac, "llvm.maxnum", + ac_to_float_type(&ctx->ac, def_type), src[0], src[1]); + result = emit_intrin_2f_param(&ctx->ac, "llvm.maxnum", + ac_to_float_type(&ctx->ac, def_type), result, src[2]); + break; + case nir_op_umax3: + result = ac_build_umax(&ctx->ac, src[0], src[1]); + result = ac_build_umax(&ctx->ac, result, src[2]); + break; + case nir_op_imax3: + result = ac_build_imax(&ctx->ac, src[0], src[1]); + result = ac_build_imax(&ctx->ac, result, src[2]); + break; + case nir_op_fmed3: { + src[0] = ac_to_float(&ctx->ac, src[0]); + src[1] = ac_to_float(&ctx->ac, src[1]); + src[2] = ac_to_float(&ctx->ac, src[2]); + result = ac_build_fmed3(&ctx->ac, src[0], src[1], src[2], + instr->dest.dest.ssa.bit_size); + break; + } + case nir_op_imed3: { + LLVMValueRef tmp1 = ac_build_imin(&ctx->ac, src[0], src[1]); + LLVMValueRef tmp2 = ac_build_imax(&ctx->ac, src[0], src[1]); + tmp2 = ac_build_imin(&ctx->ac, tmp2, src[2]); + result = ac_build_imax(&ctx->ac, tmp1, tmp2); + break; + } + case nir_op_umed3: { + LLVMValueRef tmp1 = ac_build_umin(&ctx->ac, src[0], src[1]); + LLVMValueRef tmp2 = ac_build_umax(&ctx->ac, src[0], src[1]); + tmp2 = ac_build_umin(&ctx->ac, tmp2, src[2]); + result = ac_build_umax(&ctx->ac, tmp1, tmp2); + break; + } + + default: + fprintf(stderr, "Unknown NIR alu instr: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + abort(); + } + + if (result) { + assert(instr->dest.dest.is_ssa); + result = ac_to_integer_or_pointer(&ctx->ac, result); + ctx->ssa_defs[instr->dest.dest.ssa.index] = result; + } +} + +static void visit_load_const(struct ac_nir_context *ctx, + const nir_load_const_instr *instr) +{ + LLVMValueRef values[4], value = NULL; + LLVMTypeRef element_type = + LLVMIntTypeInContext(ctx->ac.context, instr->def.bit_size); + + for (unsigned i = 0; i < instr->def.num_components; ++i) { + switch (instr->def.bit_size) { + case 8: + values[i] = 
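			/* Editor's note: nir_load_const_instr keeps its payload in a
			 * per-bit-size union, so the member matching def.bit_size
			 * (u8/u16/u32/u64) is used to build each LLVM constant. */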
LLVMConstInt(element_type, + instr->value[i].u8, false); + break; + case 16: + values[i] = LLVMConstInt(element_type, + instr->value[i].u16, false); + break; + case 32: + values[i] = LLVMConstInt(element_type, + instr->value[i].u32, false); + break; + case 64: + values[i] = LLVMConstInt(element_type, + instr->value[i].u64, false); + break; + default: + fprintf(stderr, + "unsupported nir load_const bit_size: %d\n", + instr->def.bit_size); + abort(); + } + } + if (instr->def.num_components > 1) { + value = LLVMConstVector(values, instr->def.num_components); + } else + value = values[0]; + + ctx->ssa_defs[instr->def.index] = value; +} + +static LLVMValueRef +get_buffer_size(struct ac_nir_context *ctx, LLVMValueRef descriptor, bool in_elements) +{ + LLVMValueRef size = + LLVMBuildExtractElement(ctx->ac.builder, descriptor, + LLVMConstInt(ctx->ac.i32, 2, false), ""); + + /* GFX8 only */ + if (ctx->ac.chip_class == GFX8 && in_elements) { + /* On GFX8, the descriptor contains the size in bytes, + * but TXQ must return the size in elements. + * The stride is always non-zero for resources using TXQ. + */ + LLVMValueRef stride = + LLVMBuildExtractElement(ctx->ac.builder, descriptor, + ctx->ac.i32_1, ""); + stride = LLVMBuildLShr(ctx->ac.builder, stride, + LLVMConstInt(ctx->ac.i32, 16, false), ""); + stride = LLVMBuildAnd(ctx->ac.builder, stride, + LLVMConstInt(ctx->ac.i32, 0x3fff, false), ""); + + size = LLVMBuildUDiv(ctx->ac.builder, size, stride, ""); + } + return size; +} + +/* Gather4 should follow the same rules as bilinear filtering, but the hardware + * incorrectly forces nearest filtering if the texture format is integer. + * The only effect it has on Gather4, which always returns 4 texels for + * bilinear filtering, is that the final coordinates are off by 0.5 of + * the texel size. + * + * The workaround is to subtract 0.5 from the unnormalized coordinates, + * or (0.5 / size) from the normalized coordinates. + * + * However, cube textures with 8_8_8_8 data formats require a different + * workaround of overriding the num format to USCALED/SSCALED. This would lose + * precision in 32-bit data formats, so it needs to be applied dynamically at + * runtime. In this case, return an i1 value that indicates whether the + * descriptor was overridden (and hence a fixup of the sampler result is needed). + */ +static LLVMValueRef lower_gather4_integer(struct ac_llvm_context *ctx, + nir_variable *var, + struct ac_image_args *args, + const nir_tex_instr *instr) +{ + const struct glsl_type *type = glsl_without_array(var->type); + enum glsl_base_type stype = glsl_get_sampler_result_type(type); + LLVMValueRef wa_8888 = NULL; + LLVMValueRef half_texel[2]; + LLVMValueRef result; + + assert(stype == GLSL_TYPE_INT || stype == GLSL_TYPE_UINT); + + if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { + LLVMValueRef formats; + LLVMValueRef data_format; + LLVMValueRef wa_formats; + + formats = LLVMBuildExtractElement(ctx->builder, args->resource, ctx->i32_1, ""); + + data_format = LLVMBuildLShr(ctx->builder, formats, + LLVMConstInt(ctx->i32, 20, false), ""); + data_format = LLVMBuildAnd(ctx->builder, data_format, + LLVMConstInt(ctx->i32, (1u << 6) - 1, false), ""); + wa_8888 = LLVMBuildICmp( + ctx->builder, LLVMIntEQ, data_format, + LLVMConstInt(ctx->i32, V_008F14_IMG_DATA_FORMAT_8_8_8_8, false), + ""); + + uint32_t wa_num_format = + stype == GLSL_TYPE_UINT ? 
+ S_008F14_NUM_FORMAT(V_008F14_IMG_NUM_FORMAT_USCALED) : + S_008F14_NUM_FORMAT(V_008F14_IMG_NUM_FORMAT_SSCALED); + wa_formats = LLVMBuildAnd(ctx->builder, formats, + LLVMConstInt(ctx->i32, C_008F14_NUM_FORMAT, false), + ""); + wa_formats = LLVMBuildOr(ctx->builder, wa_formats, + LLVMConstInt(ctx->i32, wa_num_format, false), ""); + + formats = LLVMBuildSelect(ctx->builder, wa_8888, wa_formats, formats, ""); + args->resource = LLVMBuildInsertElement( + ctx->builder, args->resource, formats, ctx->i32_1, ""); + } + + if (instr->sampler_dim == GLSL_SAMPLER_DIM_RECT) { + assert(!wa_8888); + half_texel[0] = half_texel[1] = LLVMConstReal(ctx->f32, -0.5); + } else { + struct ac_image_args resinfo = {}; + LLVMBasicBlockRef bbs[2]; + + LLVMValueRef unnorm = NULL; + LLVMValueRef default_offset = ctx->f32_0; + if (instr->sampler_dim == GLSL_SAMPLER_DIM_2D && + !instr->is_array) { + /* In vulkan, whether the sampler uses unnormalized + * coordinates or not is a dynamic property of the + * sampler. Hence, to figure out whether or not we + * need to divide by the texture size, we need to test + * the sampler at runtime. This tests the bit set by + * radv_init_sampler(). + */ + LLVMValueRef sampler0 = + LLVMBuildExtractElement(ctx->builder, args->sampler, ctx->i32_0, ""); + sampler0 = LLVMBuildLShr(ctx->builder, sampler0, + LLVMConstInt(ctx->i32, 15, false), ""); + sampler0 = LLVMBuildAnd(ctx->builder, sampler0, ctx->i32_1, ""); + unnorm = LLVMBuildICmp(ctx->builder, LLVMIntEQ, sampler0, ctx->i32_1, ""); + default_offset = LLVMConstReal(ctx->f32, -0.5); + } + + bbs[0] = LLVMGetInsertBlock(ctx->builder); + if (wa_8888 || unnorm) { + assert(!(wa_8888 && unnorm)); + LLVMValueRef not_needed = wa_8888 ? wa_8888 : unnorm; + /* Skip the texture size query entirely if we don't need it. */ + ac_build_ifcc(ctx, LLVMBuildNot(ctx->builder, not_needed, ""), 2000); + bbs[1] = LLVMGetInsertBlock(ctx->builder); + } + + /* Query the texture size. */ + resinfo.dim = ac_get_sampler_dim(ctx->chip_class, instr->sampler_dim, instr->is_array); + resinfo.opcode = ac_image_get_resinfo; + resinfo.dmask = 0xf; + resinfo.lod = ctx->i32_0; + resinfo.resource = args->resource; + resinfo.attributes = AC_FUNC_ATTR_READNONE; + LLVMValueRef size = ac_build_image_opcode(ctx, &resinfo); + + /* Compute -0.5 / size. */ + for (unsigned c = 0; c < 2; c++) { + half_texel[c] = + LLVMBuildExtractElement(ctx->builder, size, + LLVMConstInt(ctx->i32, c, 0), ""); + half_texel[c] = LLVMBuildUIToFP(ctx->builder, half_texel[c], ctx->f32, ""); + half_texel[c] = ac_build_fdiv(ctx, ctx->f32_1, half_texel[c]); + half_texel[c] = LLVMBuildFMul(ctx->builder, half_texel[c], + LLVMConstReal(ctx->f32, -0.5), ""); + } + + if (wa_8888 || unnorm) { + ac_build_endif(ctx, 2000); + + for (unsigned c = 0; c < 2; c++) { + LLVMValueRef values[2] = { default_offset, half_texel[c] }; + half_texel[c] = ac_build_phi(ctx, ctx->f32, 2, + values, bbs); + } + } + } + + for (unsigned c = 0; c < 2; c++) { + LLVMValueRef tmp; + tmp = LLVMBuildBitCast(ctx->builder, args->coords[c], ctx->f32, ""); + args->coords[c] = LLVMBuildFAdd(ctx->builder, tmp, half_texel[c], ""); + } + + args->attributes = AC_FUNC_ATTR_READNONE; + result = ac_build_image_opcode(ctx, args); + + if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { + LLVMValueRef tmp, tmp2; + + /* if the cube workaround is in place, f2i the result. 
*/ + for (unsigned c = 0; c < 4; c++) { + tmp = LLVMBuildExtractElement(ctx->builder, result, LLVMConstInt(ctx->i32, c, false), ""); + if (stype == GLSL_TYPE_UINT) + tmp2 = LLVMBuildFPToUI(ctx->builder, tmp, ctx->i32, ""); + else + tmp2 = LLVMBuildFPToSI(ctx->builder, tmp, ctx->i32, ""); + tmp = LLVMBuildBitCast(ctx->builder, tmp, ctx->i32, ""); + tmp2 = LLVMBuildBitCast(ctx->builder, tmp2, ctx->i32, ""); + tmp = LLVMBuildSelect(ctx->builder, wa_8888, tmp2, tmp, ""); + tmp = LLVMBuildBitCast(ctx->builder, tmp, ctx->f32, ""); + result = LLVMBuildInsertElement(ctx->builder, result, tmp, LLVMConstInt(ctx->i32, c, false), ""); + } + } + return result; +} + +static nir_deref_instr *get_tex_texture_deref(const nir_tex_instr *instr) +{ + nir_deref_instr *texture_deref_instr = NULL; + + for (unsigned i = 0; i < instr->num_srcs; i++) { + switch (instr->src[i].src_type) { + case nir_tex_src_texture_deref: + texture_deref_instr = nir_src_as_deref(instr->src[i].src); + break; + default: + break; + } + } + return texture_deref_instr; +} + +static LLVMValueRef build_tex_intrinsic(struct ac_nir_context *ctx, + const nir_tex_instr *instr, + struct ac_image_args *args) +{ + if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) { + unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa); + + return ac_build_buffer_load_format(&ctx->ac, + args->resource, + args->coords[0], + ctx->ac.i32_0, + util_last_bit(mask), + 0, true); + } + + args->opcode = ac_image_sample; + + switch (instr->op) { + case nir_texop_txf: + case nir_texop_txf_ms: + case nir_texop_samples_identical: + args->opcode = args->level_zero || + instr->sampler_dim == GLSL_SAMPLER_DIM_MS ? + ac_image_load : ac_image_load_mip; + args->level_zero = false; + break; + case nir_texop_txs: + case nir_texop_query_levels: + args->opcode = ac_image_get_resinfo; + if (!args->lod) + args->lod = ctx->ac.i32_0; + args->level_zero = false; + break; + case nir_texop_tex: + if (ctx->stage != MESA_SHADER_FRAGMENT) { + assert(!args->lod); + args->level_zero = true; + } + break; + case nir_texop_tg4: + args->opcode = ac_image_gather4; + args->level_zero = true; + break; + case nir_texop_lod: + args->opcode = ac_image_get_lod; + break; + default: + break; + } + + if (instr->op == nir_texop_tg4 && ctx->ac.chip_class <= GFX8) { + nir_deref_instr *texture_deref_instr = get_tex_texture_deref(instr); + nir_variable *var = nir_deref_instr_get_variable(texture_deref_instr); + const struct glsl_type *type = glsl_without_array(var->type); + enum glsl_base_type stype = glsl_get_sampler_result_type(type); + if (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT) { + return lower_gather4_integer(&ctx->ac, var, args, instr); + } + } + + /* Fixup for GFX9 which allocates 1D textures as 2D. */ + if (instr->op == nir_texop_lod && ctx->ac.chip_class == GFX9) { + if ((args->dim == ac_image_2darray || + args->dim == ac_image_2d) && !args->coords[1]) { + args->coords[1] = ctx->ac.i32_0; + } + } + + args->attributes = AC_FUNC_ATTR_READNONE; + bool cs_derivs = ctx->stage == MESA_SHADER_COMPUTE && + ctx->info->cs.derivative_group != DERIVATIVE_GROUP_NONE; + if (ctx->stage == MESA_SHADER_FRAGMENT || cs_derivs) { + /* Prevent texture instructions with implicit derivatives from being + * sinked into branches. 
*/ + switch (instr->op) { + case nir_texop_tex: + case nir_texop_txb: + case nir_texop_lod: + args->attributes |= AC_FUNC_ATTR_CONVERGENT; + break; + default: + break; + } + } + + return ac_build_image_opcode(&ctx->ac, args); +} + +static LLVMValueRef visit_vulkan_resource_reindex(struct ac_nir_context *ctx, + nir_intrinsic_instr *instr) +{ + LLVMValueRef ptr = get_src(ctx, instr->src[0]); + LLVMValueRef index = get_src(ctx, instr->src[1]); + + LLVMValueRef result = LLVMBuildGEP(ctx->ac.builder, ptr, &index, 1, ""); + LLVMSetMetadata(result, ctx->ac.uniform_md_kind, ctx->ac.empty_md); + return result; +} + +static LLVMValueRef visit_load_push_constant(struct ac_nir_context *ctx, + nir_intrinsic_instr *instr) +{ + LLVMValueRef ptr, addr; + LLVMValueRef src0 = get_src(ctx, instr->src[0]); + unsigned index = nir_intrinsic_base(instr); + + addr = LLVMConstInt(ctx->ac.i32, index, 0); + addr = LLVMBuildAdd(ctx->ac.builder, addr, src0, ""); + + /* Load constant values from user SGPRS when possible, otherwise + * fallback to the default path that loads directly from memory. + */ + if (LLVMIsConstant(src0) && + instr->dest.ssa.bit_size == 32) { + unsigned count = instr->dest.ssa.num_components; + unsigned offset = index; + + offset += LLVMConstIntGetZExtValue(src0); + offset /= 4; + + offset -= ctx->abi->base_inline_push_consts; + + if (offset + count <= ctx->abi->num_inline_push_consts) { + return ac_build_gather_values(&ctx->ac, + ctx->abi->inline_push_consts + offset, + count); + } + } + + ptr = LLVMBuildGEP(ctx->ac.builder, ctx->abi->push_constants, &addr, 1, ""); + + if (instr->dest.ssa.bit_size == 8) { + unsigned load_dwords = instr->dest.ssa.num_components > 1 ? 2 : 1; + LLVMTypeRef vec_type = LLVMVectorType(LLVMInt8TypeInContext(ctx->ac.context), 4 * load_dwords); + ptr = ac_cast_ptr(&ctx->ac, ptr, vec_type); + LLVMValueRef res = LLVMBuildLoad(ctx->ac.builder, ptr, ""); + + LLVMValueRef params[3]; + if (load_dwords > 1) { + LLVMValueRef res_vec = LLVMBuildBitCast(ctx->ac.builder, res, LLVMVectorType(ctx->ac.i32, 2), ""); + params[0] = LLVMBuildExtractElement(ctx->ac.builder, res_vec, LLVMConstInt(ctx->ac.i32, 1, false), ""); + params[1] = LLVMBuildExtractElement(ctx->ac.builder, res_vec, LLVMConstInt(ctx->ac.i32, 0, false), ""); + } else { + res = LLVMBuildBitCast(ctx->ac.builder, res, ctx->ac.i32, ""); + params[0] = ctx->ac.i32_0; + params[1] = res; + } + params[2] = addr; + res = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.alignbyte", ctx->ac.i32, params, 3, 0); + + res = LLVMBuildTrunc(ctx->ac.builder, res, LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.num_components * 8), ""); + if (instr->dest.ssa.num_components > 1) + res = LLVMBuildBitCast(ctx->ac.builder, res, LLVMVectorType(LLVMInt8TypeInContext(ctx->ac.context), instr->dest.ssa.num_components), ""); + return res; + } else if (instr->dest.ssa.bit_size == 16) { + unsigned load_dwords = instr->dest.ssa.num_components / 2 + 1; + LLVMTypeRef vec_type = LLVMVectorType(LLVMInt16TypeInContext(ctx->ac.context), 2 * load_dwords); + ptr = ac_cast_ptr(&ctx->ac, ptr, vec_type); + LLVMValueRef res = LLVMBuildLoad(ctx->ac.builder, ptr, ""); + res = LLVMBuildBitCast(ctx->ac.builder, res, vec_type, ""); + LLVMValueRef cond = LLVMBuildLShr(ctx->ac.builder, addr, ctx->ac.i32_1, ""); + cond = LLVMBuildTrunc(ctx->ac.builder, cond, ctx->ac.i1, ""); + LLVMValueRef mask[] = { LLVMConstInt(ctx->ac.i32, 0, false), LLVMConstInt(ctx->ac.i32, 1, false), + LLVMConstInt(ctx->ac.i32, 2, false), LLVMConstInt(ctx->ac.i32, 3, false), + 
LLVMConstInt(ctx->ac.i32, 4, false)}; + LLVMValueRef swizzle_aligned = LLVMConstVector(&mask[0], instr->dest.ssa.num_components); + LLVMValueRef swizzle_unaligned = LLVMConstVector(&mask[1], instr->dest.ssa.num_components); + LLVMValueRef shuffle_aligned = LLVMBuildShuffleVector(ctx->ac.builder, res, res, swizzle_aligned, ""); + LLVMValueRef shuffle_unaligned = LLVMBuildShuffleVector(ctx->ac.builder, res, res, swizzle_unaligned, ""); + res = LLVMBuildSelect(ctx->ac.builder, cond, shuffle_unaligned, shuffle_aligned, ""); + return LLVMBuildBitCast(ctx->ac.builder, res, get_def_type(ctx, &instr->dest.ssa), ""); + } + + ptr = ac_cast_ptr(&ctx->ac, ptr, get_def_type(ctx, &instr->dest.ssa)); + + return LLVMBuildLoad(ctx->ac.builder, ptr, ""); +} + +static LLVMValueRef visit_get_buffer_size(struct ac_nir_context *ctx, + const nir_intrinsic_instr *instr) +{ + LLVMValueRef index = get_src(ctx, instr->src[0]); + + return get_buffer_size(ctx, ctx->abi->load_ssbo(ctx->abi, index, false), false); +} + +static uint32_t widen_mask(uint32_t mask, unsigned multiplier) +{ + uint32_t new_mask = 0; + for(unsigned i = 0; i < 32 && (1u << i) <= mask; ++i) + if (mask & (1u << i)) + new_mask |= ((1u << multiplier) - 1u) << (i * multiplier); + return new_mask; +} + +static LLVMValueRef extract_vector_range(struct ac_llvm_context *ctx, LLVMValueRef src, + unsigned start, unsigned count) +{ + LLVMValueRef mask[] = { + ctx->i32_0, ctx->i32_1, + LLVMConstInt(ctx->i32, 2, false), LLVMConstInt(ctx->i32, 3, false) }; + + unsigned src_elements = ac_get_llvm_num_components(src); + + if (count == src_elements) { + assert(start == 0); + return src; + } else if (count == 1) { + assert(start < src_elements); + return LLVMBuildExtractElement(ctx->builder, src, mask[start], ""); + } else { + assert(start + count <= src_elements); + assert(count <= 4); + LLVMValueRef swizzle = LLVMConstVector(&mask[start], count); + return LLVMBuildShuffleVector(ctx->builder, src, src, swizzle, ""); + } +} + +static unsigned get_cache_policy(struct ac_nir_context *ctx, + enum gl_access_qualifier access, + bool may_store_unaligned, + bool writeonly_memory) +{ + unsigned cache_policy = 0; + + /* GFX6 has a TC L1 bug causing corruption of 8bit/16bit stores. All + * store opcodes not aligned to a dword are affected. The only way to + * get unaligned stores is through shader images. + */ + if (((may_store_unaligned && ctx->ac.chip_class == GFX6) || + /* If this is write-only, don't keep data in L1 to prevent + * evicting L1 cache lines that may be needed by other + * instructions. 
+ */ + writeonly_memory || + access & (ACCESS_COHERENT | ACCESS_VOLATILE))) { + cache_policy |= ac_glc; + } + + if (access & ACCESS_STREAM_CACHE_POLICY) + cache_policy |= ac_slc; + + return cache_policy; +} + +static void visit_store_ssbo(struct ac_nir_context *ctx, + nir_intrinsic_instr *instr) +{ + LLVMValueRef src_data = get_src(ctx, instr->src[0]); + int elem_size_bytes = ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src_data)) / 8; + unsigned writemask = nir_intrinsic_write_mask(instr); + enum gl_access_qualifier access = nir_intrinsic_access(instr); + bool writeonly_memory = access & ACCESS_NON_READABLE; + unsigned cache_policy = get_cache_policy(ctx, access, false, writeonly_memory); + + LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi, + get_src(ctx, instr->src[1]), true); + LLVMValueRef base_data = src_data; + base_data = ac_trim_vector(&ctx->ac, base_data, instr->num_components); + LLVMValueRef base_offset = get_src(ctx, instr->src[2]); + + while (writemask) { + int start, count; + LLVMValueRef data, offset; + LLVMTypeRef data_type; + + u_bit_scan_consecutive_range(&writemask, &start, &count); + + /* Due to an LLVM limitation with LLVM < 9, split 3-element + * writes into a 2-element and a 1-element write. */ + if (count == 3 && + (elem_size_bytes != 4 || !ac_has_vec3_support(ctx->ac.chip_class, false))) { + writemask |= 1 << (start + 2); + count = 2; + } + int num_bytes = count * elem_size_bytes; /* count in bytes */ + + /* we can only store 4 DWords at the same time. + * can only happen for 64 Bit vectors. */ + if (num_bytes > 16) { + writemask |= ((1u << (count - 2)) - 1u) << (start + 2); + count = 2; + num_bytes = 16; + } + + /* check alignment of 16 Bit stores */ + if (elem_size_bytes == 2 && num_bytes > 2 && (start % 2) == 1) { + writemask |= ((1u << (count - 1)) - 1u) << (start + 1); + count = 1; + num_bytes = 2; + } + data = extract_vector_range(&ctx->ac, base_data, start, count); + + offset = LLVMBuildAdd(ctx->ac.builder, base_offset, + LLVMConstInt(ctx->ac.i32, start * elem_size_bytes, false), ""); + + if (num_bytes == 1) { + ac_build_tbuffer_store_byte(&ctx->ac, rsrc, data, + offset, ctx->ac.i32_0, + cache_policy); + } else if (num_bytes == 2) { + ac_build_tbuffer_store_short(&ctx->ac, rsrc, data, + offset, ctx->ac.i32_0, + cache_policy); + } else { + int num_channels = num_bytes / 4; + + switch (num_bytes) { + case 16: /* v4f32 */ + data_type = ctx->ac.v4f32; + break; + case 12: /* v3f32 */ + data_type = ctx->ac.v3f32; + break; + case 8: /* v2f32 */ + data_type = ctx->ac.v2f32; + break; + case 4: /* f32 */ + data_type = ctx->ac.f32; + break; + default: + unreachable("Malformed vector store."); + } + data = LLVMBuildBitCast(ctx->ac.builder, data, data_type, ""); + + ac_build_buffer_store_dword(&ctx->ac, rsrc, data, + num_channels, offset, + ctx->ac.i32_0, 0, + cache_policy, false); + } + } +} + +static LLVMValueRef emit_ssbo_comp_swap_64(struct ac_nir_context *ctx, + LLVMValueRef descriptor, + LLVMValueRef offset, + LLVMValueRef compare, + LLVMValueRef exchange) +{ + LLVMBasicBlockRef start_block = NULL, then_block = NULL; + if (ctx->abi->robust_buffer_access) { + LLVMValueRef size = ac_llvm_extract_elem(&ctx->ac, descriptor, 2); + + LLVMValueRef cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, offset, size, ""); + start_block = LLVMGetInsertBlock(ctx->ac.builder); + + ac_build_ifcc(&ctx->ac, cond, -1); + + then_block = LLVMGetInsertBlock(ctx->ac.builder); + } + + LLVMValueRef ptr_parts[2] = { + ac_llvm_extract_elem(&ctx->ac, descriptor, 0), + LLVMBuildAnd(ctx->ac.builder, + 
ac_llvm_extract_elem(&ctx->ac, descriptor, 1), + LLVMConstInt(ctx->ac.i32, 65535, 0), "") + }; + + ptr_parts[1] = LLVMBuildTrunc(ctx->ac.builder, ptr_parts[1], ctx->ac.i16, ""); + ptr_parts[1] = LLVMBuildSExt(ctx->ac.builder, ptr_parts[1], ctx->ac.i32, ""); + + offset = LLVMBuildZExt(ctx->ac.builder, offset, ctx->ac.i64, ""); + + LLVMValueRef ptr = ac_build_gather_values(&ctx->ac, ptr_parts, 2); + ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ctx->ac.i64, ""); + ptr = LLVMBuildAdd(ctx->ac.builder, ptr, offset, ""); + ptr = LLVMBuildIntToPtr(ctx->ac.builder, ptr, LLVMPointerType(ctx->ac.i64, AC_ADDR_SPACE_GLOBAL), ""); + + LLVMValueRef result = ac_build_atomic_cmp_xchg(&ctx->ac, ptr, compare, exchange, "singlethread-one-as"); + result = LLVMBuildExtractValue(ctx->ac.builder, result, 0, ""); + + if (ctx->abi->robust_buffer_access) { + ac_build_endif(&ctx->ac, -1); + + LLVMBasicBlockRef incoming_blocks[2] = { + start_block, + then_block, + }; + + LLVMValueRef incoming_values[2] = { + LLVMConstInt(ctx->ac.i64, 0, 0), + result, + }; + LLVMValueRef ret = LLVMBuildPhi(ctx->ac.builder, ctx->ac.i64, ""); + LLVMAddIncoming(ret, incoming_values, incoming_blocks, 2); + return ret; + } else { + return result; + } +} + +static LLVMValueRef visit_atomic_ssbo(struct ac_nir_context *ctx, + const nir_intrinsic_instr *instr) +{ + LLVMTypeRef return_type = LLVMTypeOf(get_src(ctx, instr->src[2])); + const char *op; + char name[64], type[8]; + LLVMValueRef params[6], descriptor; + int arg_count = 0; + + switch (instr->intrinsic) { + case nir_intrinsic_ssbo_atomic_add: + op = "add"; + break; + case nir_intrinsic_ssbo_atomic_imin: + op = "smin"; + break; + case nir_intrinsic_ssbo_atomic_umin: + op = "umin"; + break; + case nir_intrinsic_ssbo_atomic_imax: + op = "smax"; + break; + case nir_intrinsic_ssbo_atomic_umax: + op = "umax"; + break; + case nir_intrinsic_ssbo_atomic_and: + op = "and"; + break; + case nir_intrinsic_ssbo_atomic_or: + op = "or"; + break; + case nir_intrinsic_ssbo_atomic_xor: + op = "xor"; + break; + case nir_intrinsic_ssbo_atomic_exchange: + op = "swap"; + break; + case nir_intrinsic_ssbo_atomic_comp_swap: + op = "cmpswap"; + break; + default: + abort(); + } + + descriptor = ctx->abi->load_ssbo(ctx->abi, + get_src(ctx, instr->src[0]), + true); + + if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap && + return_type == ctx->ac.i64) { + return emit_ssbo_comp_swap_64(ctx, descriptor, + get_src(ctx, instr->src[1]), + get_src(ctx, instr->src[2]), + get_src(ctx, instr->src[3])); + } + if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap) { + params[arg_count++] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[3]), 0); + } + params[arg_count++] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[2]), 0); + params[arg_count++] = descriptor; + + if (LLVM_VERSION_MAJOR >= 9) { + /* XXX: The new raw/struct atomic intrinsics are buggy with + * LLVM 8, see r358579. 
+ */ + params[arg_count++] = get_src(ctx, instr->src[1]); /* voffset */ + params[arg_count++] = ctx->ac.i32_0; /* soffset */ + params[arg_count++] = ctx->ac.i32_0; /* slc */ + + ac_build_type_name_for_intr(return_type, type, sizeof(type)); + snprintf(name, sizeof(name), + "llvm.amdgcn.raw.buffer.atomic.%s.%s", op, type); + } else { + params[arg_count++] = ctx->ac.i32_0; /* vindex */ + params[arg_count++] = get_src(ctx, instr->src[1]); /* voffset */ + params[arg_count++] = ctx->ac.i1false; /* slc */ + + assert(return_type == ctx->ac.i32); + snprintf(name, sizeof(name), + "llvm.amdgcn.buffer.atomic.%s", op); + } + + return ac_build_intrinsic(&ctx->ac, name, return_type, params, + arg_count, 0); +} + +static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx, + const nir_intrinsic_instr *instr) +{ + int elem_size_bytes = instr->dest.ssa.bit_size / 8; + int num_components = instr->num_components; + enum gl_access_qualifier access = nir_intrinsic_access(instr); + unsigned cache_policy = get_cache_policy(ctx, access, false, false); + + LLVMValueRef offset = get_src(ctx, instr->src[1]); + LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi, + get_src(ctx, instr->src[0]), false); + LLVMValueRef vindex = ctx->ac.i32_0; + + LLVMTypeRef def_type = get_def_type(ctx, &instr->dest.ssa); + LLVMTypeRef def_elem_type = num_components > 1 ? LLVMGetElementType(def_type) : def_type; + + LLVMValueRef results[4]; + for (int i = 0; i < num_components;) { + int num_elems = num_components - i; + if (elem_size_bytes < 4 && nir_intrinsic_align(instr) % 4 != 0) + num_elems = 1; + if (num_elems * elem_size_bytes > 16) + num_elems = 16 / elem_size_bytes; + int load_bytes = num_elems * elem_size_bytes; + + LLVMValueRef immoffset = LLVMConstInt(ctx->ac.i32, i * elem_size_bytes, false); + + LLVMValueRef ret; + + if (load_bytes == 1) { + ret = ac_build_tbuffer_load_byte(&ctx->ac, + rsrc, + offset, + ctx->ac.i32_0, + immoffset, + cache_policy); + } else if (load_bytes == 2) { + ret = ac_build_tbuffer_load_short(&ctx->ac, + rsrc, + offset, + ctx->ac.i32_0, + immoffset, + cache_policy); + } else { + int num_channels = util_next_power_of_two(load_bytes) / 4; + bool can_speculate = access & ACCESS_CAN_REORDER; + + ret = ac_build_buffer_load(&ctx->ac, rsrc, num_channels, + vindex, offset, immoffset, 0, + cache_policy, can_speculate, false); + } + + LLVMTypeRef byte_vec = LLVMVectorType(ctx->ac.i8, ac_get_type_size(LLVMTypeOf(ret))); + ret = LLVMBuildBitCast(ctx->ac.builder, ret, byte_vec, ""); + ret = ac_trim_vector(&ctx->ac, ret, load_bytes); + + LLVMTypeRef ret_type = LLVMVectorType(def_elem_type, num_elems); + ret = LLVMBuildBitCast(ctx->ac.builder, ret, ret_type, ""); + + for (unsigned j = 0; j < num_elems; j++) { + results[i + j] = LLVMBuildExtractElement(ctx->ac.builder, ret, LLVMConstInt(ctx->ac.i32, j, false), ""); + } + i += num_elems; + } + + return ac_build_gather_values(&ctx->ac, results, num_components); +} + +static LLVMValueRef visit_load_ubo_buffer(struct ac_nir_context *ctx, + const nir_intrinsic_instr *instr) +{ + LLVMValueRef ret; + LLVMValueRef rsrc = get_src(ctx, instr->src[0]); + LLVMValueRef offset = get_src(ctx, instr->src[1]); + int num_components = instr->num_components; + + if (ctx->abi->load_ubo) + rsrc = ctx->abi->load_ubo(ctx->abi, rsrc); + + if (instr->dest.ssa.bit_size == 64) + num_components *= 2; + + if (instr->dest.ssa.bit_size == 16 || instr->dest.ssa.bit_size == 8) { + unsigned load_bytes = instr->dest.ssa.bit_size / 8; + LLVMValueRef results[num_components]; + for (unsigned i = 0; i < 
num_components; ++i) { + LLVMValueRef immoffset = LLVMConstInt(ctx->ac.i32, + load_bytes * i, 0); + + if (load_bytes == 1) { + results[i] = ac_build_tbuffer_load_byte(&ctx->ac, + rsrc, + offset, + ctx->ac.i32_0, + immoffset, + 0); + } else { + assert(load_bytes == 2); + results[i] = ac_build_tbuffer_load_short(&ctx->ac, + rsrc, + offset, + ctx->ac.i32_0, + immoffset, + 0); + } + } + ret = ac_build_gather_values(&ctx->ac, results, num_components); + } else { + ret = ac_build_buffer_load(&ctx->ac, rsrc, num_components, NULL, offset, + NULL, 0, 0, true, true); + + ret = ac_trim_vector(&ctx->ac, ret, num_components); + } + + return LLVMBuildBitCast(ctx->ac.builder, ret, + get_def_type(ctx, &instr->dest.ssa), ""); +} + +static void +get_deref_offset(struct ac_nir_context *ctx, nir_deref_instr *instr, + bool vs_in, unsigned *vertex_index_out, + LLVMValueRef *vertex_index_ref, + unsigned *const_out, LLVMValueRef *indir_out) +{ + nir_variable *var = nir_deref_instr_get_variable(instr); + nir_deref_path path; + unsigned idx_lvl = 1; + + nir_deref_path_init(&path, instr, NULL); + + if (vertex_index_out != NULL || vertex_index_ref != NULL) { + if (vertex_index_ref) { + *vertex_index_ref = get_src(ctx, path.path[idx_lvl]->arr.index); + if (vertex_index_out) + *vertex_index_out = 0; + } else { + *vertex_index_out = nir_src_as_uint(path.path[idx_lvl]->arr.index); + } + ++idx_lvl; + } + + uint32_t const_offset = 0; + LLVMValueRef offset = NULL; + + if (var->data.compact) { + assert(instr->deref_type == nir_deref_type_array); + const_offset = nir_src_as_uint(instr->arr.index); + goto out; + } + + for (; path.path[idx_lvl]; ++idx_lvl) { + const struct glsl_type *parent_type = path.path[idx_lvl - 1]->type; + if (path.path[idx_lvl]->deref_type == nir_deref_type_struct) { + unsigned index = path.path[idx_lvl]->strct.index; + + for (unsigned i = 0; i < index; i++) { + const struct glsl_type *ft = glsl_get_struct_field(parent_type, i); + const_offset += glsl_count_attribute_slots(ft, vs_in); + } + } else if(path.path[idx_lvl]->deref_type == nir_deref_type_array) { + unsigned size = glsl_count_attribute_slots(path.path[idx_lvl]->type, vs_in); + if (nir_src_is_const(path.path[idx_lvl]->arr.index)) { + const_offset += size * + nir_src_as_uint(path.path[idx_lvl]->arr.index); + } else { + LLVMValueRef array_off = LLVMBuildMul(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, size, 0), + get_src(ctx, path.path[idx_lvl]->arr.index), ""); + if (offset) + offset = LLVMBuildAdd(ctx->ac.builder, offset, array_off, ""); + else + offset = array_off; + } + } else + unreachable("Uhandled deref type in get_deref_instr_offset"); + } + +out: + nir_deref_path_finish(&path); + + if (const_offset && offset) + offset = LLVMBuildAdd(ctx->ac.builder, offset, + LLVMConstInt(ctx->ac.i32, const_offset, 0), + ""); + + *const_out = const_offset; + *indir_out = offset; +} + +static LLVMValueRef load_tess_varyings(struct ac_nir_context *ctx, + nir_intrinsic_instr *instr, + bool load_inputs) +{ + LLVMValueRef result; + LLVMValueRef vertex_index = NULL; + LLVMValueRef indir_index = NULL; + unsigned const_index = 0; + + nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); + + unsigned location = var->data.location; + unsigned driver_location = var->data.driver_location; + const bool is_patch = var->data.patch; + const bool is_compact = var->data.compact; + + get_deref_offset(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), + false, NULL, is_patch ? 
NULL : &vertex_index, + &const_index, &indir_index); + + LLVMTypeRef dest_type = get_def_type(ctx, &instr->dest.ssa); + + LLVMTypeRef src_component_type; + if (LLVMGetTypeKind(dest_type) == LLVMVectorTypeKind) + src_component_type = LLVMGetElementType(dest_type); + else + src_component_type = dest_type; + + result = ctx->abi->load_tess_varyings(ctx->abi, src_component_type, + vertex_index, indir_index, + const_index, location, driver_location, + var->data.location_frac, + instr->num_components, + is_patch, is_compact, load_inputs); + if (instr->dest.ssa.bit_size == 16) { + result = ac_to_integer(&ctx->ac, result); + result = LLVMBuildTrunc(ctx->ac.builder, result, dest_type, ""); + } + return LLVMBuildBitCast(ctx->ac.builder, result, dest_type, ""); +} + +static unsigned +type_scalar_size_bytes(const struct glsl_type *type) +{ + assert(glsl_type_is_vector_or_scalar(type) || + glsl_type_is_matrix(type)); + return glsl_type_is_boolean(type) ? 4 : glsl_get_bit_size(type) / 8; +} + +static LLVMValueRef visit_load_var(struct ac_nir_context *ctx, + nir_intrinsic_instr *instr) +{ + nir_deref_instr *deref = nir_instr_as_deref(instr->src[0].ssa->parent_instr); + nir_variable *var = nir_deref_instr_get_variable(deref); + + LLVMValueRef values[8]; + int idx = 0; + int ve = instr->dest.ssa.num_components; + unsigned comp = 0; + LLVMValueRef indir_index; + LLVMValueRef ret; + unsigned const_index; + unsigned stride = 4; + int mode = deref->mode; + + if (var) { + bool vs_in = ctx->stage == MESA_SHADER_VERTEX && + var->data.mode == nir_var_shader_in; + idx = var->data.driver_location; + comp = var->data.location_frac; + mode = var->data.mode; + + get_deref_offset(ctx, deref, vs_in, NULL, NULL, + &const_index, &indir_index); + + if (var->data.compact) { + stride = 1; + const_index += comp; + comp = 0; + } + } + + if (instr->dest.ssa.bit_size == 64 && + (deref->mode == nir_var_shader_in || + deref->mode == nir_var_shader_out || + deref->mode == nir_var_function_temp)) + ve *= 2; + + switch (mode) { + case nir_var_shader_in: + if (ctx->stage == MESA_SHADER_TESS_CTRL || + ctx->stage == MESA_SHADER_TESS_EVAL) { + return load_tess_varyings(ctx, instr, true); + } + + if (ctx->stage == MESA_SHADER_GEOMETRY) { + LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.bit_size); + LLVMValueRef indir_index; + unsigned const_index, vertex_index; + get_deref_offset(ctx, deref, false, &vertex_index, NULL, + &const_index, &indir_index); + assert(indir_index == NULL); + + return ctx->abi->load_inputs(ctx->abi, var->data.location, + var->data.driver_location, + var->data.location_frac, + instr->num_components, vertex_index, const_index, type); + } + + for (unsigned chan = comp; chan < ve + comp; chan++) { + if (indir_index) { + unsigned count = glsl_count_attribute_slots( + var->type, + ctx->stage == MESA_SHADER_VERTEX); + count -= chan / 4; + LLVMValueRef tmp_vec = ac_build_gather_values_extended( + &ctx->ac, ctx->abi->inputs + idx + chan, count, + stride, false, true); + + values[chan] = LLVMBuildExtractElement(ctx->ac.builder, + tmp_vec, + indir_index, ""); + } else + values[chan] = ctx->abi->inputs[idx + chan + const_index * stride]; + } + break; + case nir_var_function_temp: + for (unsigned chan = 0; chan < ve; chan++) { + if (indir_index) { + unsigned count = glsl_count_attribute_slots( + var->type, false); + count -= chan / 4; + LLVMValueRef tmp_vec = ac_build_gather_values_extended( + &ctx->ac, ctx->locals + idx + chan, count, + stride, true, true); + + values[chan] = 
LLVMBuildExtractElement(ctx->ac.builder, + tmp_vec, + indir_index, ""); + } else { + values[chan] = LLVMBuildLoad(ctx->ac.builder, ctx->locals[idx + chan + const_index * stride], ""); + } + } + break; + case nir_var_mem_shared: { + LLVMValueRef address = get_src(ctx, instr->src[0]); + LLVMValueRef val = LLVMBuildLoad(ctx->ac.builder, address, ""); + return LLVMBuildBitCast(ctx->ac.builder, val, + get_def_type(ctx, &instr->dest.ssa), + ""); + } + case nir_var_shader_out: + if (ctx->stage == MESA_SHADER_TESS_CTRL) { + return load_tess_varyings(ctx, instr, false); + } + + if (ctx->stage == MESA_SHADER_FRAGMENT && + var->data.fb_fetch_output && + ctx->abi->emit_fbfetch) + return ctx->abi->emit_fbfetch(ctx->abi); + + for (unsigned chan = comp; chan < ve + comp; chan++) { + if (indir_index) { + unsigned count = glsl_count_attribute_slots( + var->type, false); + count -= chan / 4; + LLVMValueRef tmp_vec = ac_build_gather_values_extended( + &ctx->ac, ctx->abi->outputs + idx + chan, count, + stride, true, true); + + values[chan] = LLVMBuildExtractElement(ctx->ac.builder, + tmp_vec, + indir_index, ""); + } else { + values[chan] = LLVMBuildLoad(ctx->ac.builder, + ctx->abi->outputs[idx + chan + const_index * stride], + ""); + } + } + break; + case nir_var_mem_global: { + LLVMValueRef address = get_src(ctx, instr->src[0]); + unsigned explicit_stride = glsl_get_explicit_stride(deref->type); + unsigned natural_stride = type_scalar_size_bytes(deref->type); + unsigned stride = explicit_stride ? explicit_stride : natural_stride; + + LLVMTypeRef result_type = get_def_type(ctx, &instr->dest.ssa); + if (stride != natural_stride) { + LLVMTypeRef ptr_type = LLVMPointerType(LLVMGetElementType(result_type), + LLVMGetPointerAddressSpace(LLVMTypeOf(address))); + address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type , ""); + + for (unsigned i = 0; i < instr->dest.ssa.num_components; ++i) { + LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, i * stride / natural_stride, 0); + values[i] = LLVMBuildLoad(ctx->ac.builder, + ac_build_gep_ptr(&ctx->ac, address, offset), ""); + } + return ac_build_gather_values(&ctx->ac, values, instr->dest.ssa.num_components); + } else { + LLVMTypeRef ptr_type = LLVMPointerType(result_type, + LLVMGetPointerAddressSpace(LLVMTypeOf(address))); + address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type , ""); + LLVMValueRef val = LLVMBuildLoad(ctx->ac.builder, address, ""); + return val; + } + } + default: + unreachable("unhandle variable mode"); + } + ret = ac_build_varying_gather_values(&ctx->ac, values, ve, comp); + return LLVMBuildBitCast(ctx->ac.builder, ret, get_def_type(ctx, &instr->dest.ssa), ""); +} + +static void +visit_store_var(struct ac_nir_context *ctx, + nir_intrinsic_instr *instr) +{ + nir_deref_instr *deref = nir_instr_as_deref(instr->src[0].ssa->parent_instr); + nir_variable *var = nir_deref_instr_get_variable(deref); + + LLVMValueRef temp_ptr, value; + int idx = 0; + unsigned comp = 0; + LLVMValueRef src = ac_to_float(&ctx->ac, get_src(ctx, instr->src[1])); + int writemask = instr->const_index[0]; + LLVMValueRef indir_index; + unsigned const_index; + + if (var) { + get_deref_offset(ctx, deref, false, + NULL, NULL, &const_index, &indir_index); + idx = var->data.driver_location; + comp = var->data.location_frac; + + if (var->data.compact) { + const_index += comp; + comp = 0; + } + } + + if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src)) == 64 && + (deref->mode == nir_var_shader_out || + deref->mode == nir_var_function_temp)) { + + src = 
LLVMBuildBitCast(ctx->ac.builder, src, + LLVMVectorType(ctx->ac.f32, ac_get_llvm_num_components(src) * 2), + ""); + + writemask = widen_mask(writemask, 2); + } + + writemask = writemask << comp; + + switch (deref->mode) { + case nir_var_shader_out: + + if (ctx->stage == MESA_SHADER_TESS_CTRL) { + LLVMValueRef vertex_index = NULL; + LLVMValueRef indir_index = NULL; + unsigned const_index = 0; + const bool is_patch = var->data.patch; + + get_deref_offset(ctx, deref, false, NULL, + is_patch ? NULL : &vertex_index, + &const_index, &indir_index); + + ctx->abi->store_tcs_outputs(ctx->abi, var, + vertex_index, indir_index, + const_index, src, writemask); + return; + } + + for (unsigned chan = 0; chan < 8; chan++) { + int stride = 4; + if (!(writemask & (1 << chan))) + continue; + + value = ac_llvm_extract_elem(&ctx->ac, src, chan - comp); + + if (var->data.compact) + stride = 1; + if (indir_index) { + unsigned count = glsl_count_attribute_slots( + var->type, false); + count -= chan / 4; + LLVMValueRef tmp_vec = ac_build_gather_values_extended( + &ctx->ac, ctx->abi->outputs + idx + chan, count, + stride, true, true); + + tmp_vec = LLVMBuildInsertElement(ctx->ac.builder, tmp_vec, + value, indir_index, ""); + build_store_values_extended(&ctx->ac, ctx->abi->outputs + idx + chan, + count, stride, tmp_vec); + + } else { + temp_ptr = ctx->abi->outputs[idx + chan + const_index * stride]; + + LLVMBuildStore(ctx->ac.builder, value, temp_ptr); + } + } + break; + case nir_var_function_temp: + for (unsigned chan = 0; chan < 8; chan++) { + if (!(writemask & (1 << chan))) + continue; + + value = ac_llvm_extract_elem(&ctx->ac, src, chan); + if (indir_index) { + unsigned count = glsl_count_attribute_slots( + var->type, false); + count -= chan / 4; + LLVMValueRef tmp_vec = ac_build_gather_values_extended( + &ctx->ac, ctx->locals + idx + chan, count, + 4, true, true); + + tmp_vec = LLVMBuildInsertElement(ctx->ac.builder, tmp_vec, + value, indir_index, ""); + build_store_values_extended(&ctx->ac, ctx->locals + idx + chan, + count, 4, tmp_vec); + } else { + temp_ptr = ctx->locals[idx + chan + const_index * 4]; + + LLVMBuildStore(ctx->ac.builder, value, temp_ptr); + } + } + break; + + case nir_var_mem_global: + case nir_var_mem_shared: { + int writemask = instr->const_index[0]; + LLVMValueRef address = get_src(ctx, instr->src[0]); + LLVMValueRef val = get_src(ctx, instr->src[1]); + + unsigned explicit_stride = glsl_get_explicit_stride(deref->type); + unsigned natural_stride = type_scalar_size_bytes(deref->type); + unsigned stride = explicit_stride ? 
explicit_stride : natural_stride; + + LLVMTypeRef ptr_type = LLVMPointerType(LLVMTypeOf(val), + LLVMGetPointerAddressSpace(LLVMTypeOf(address))); + address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type , ""); + + if (writemask == (1u << ac_get_llvm_num_components(val)) - 1 && + stride == natural_stride) { + LLVMTypeRef ptr_type = LLVMPointerType(LLVMTypeOf(val), + LLVMGetPointerAddressSpace(LLVMTypeOf(address))); + address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type , ""); + + val = LLVMBuildBitCast(ctx->ac.builder, val, + LLVMGetElementType(LLVMTypeOf(address)), ""); + LLVMBuildStore(ctx->ac.builder, val, address); + } else { + LLVMTypeRef ptr_type = LLVMPointerType(LLVMGetElementType(LLVMTypeOf(val)), + LLVMGetPointerAddressSpace(LLVMTypeOf(address))); + address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type , ""); + for (unsigned chan = 0; chan < 4; chan++) { + if (!(writemask & (1 << chan))) + continue; + + LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, chan * stride / natural_stride, 0); + + LLVMValueRef ptr = ac_build_gep_ptr(&ctx->ac, address, offset); + LLVMValueRef src = ac_llvm_extract_elem(&ctx->ac, val, + chan); + src = LLVMBuildBitCast(ctx->ac.builder, src, + LLVMGetElementType(LLVMTypeOf(ptr)), ""); + LLVMBuildStore(ctx->ac.builder, src, ptr); + } + } + break; + } + default: + abort(); + break; + } +} + +static int image_type_to_components_count(enum glsl_sampler_dim dim, bool array) +{ + switch (dim) { + case GLSL_SAMPLER_DIM_BUF: + return 1; + case GLSL_SAMPLER_DIM_1D: + return array ? 2 : 1; + case GLSL_SAMPLER_DIM_2D: + return array ? 3 : 2; + case GLSL_SAMPLER_DIM_MS: + return array ? 4 : 3; + case GLSL_SAMPLER_DIM_3D: + case GLSL_SAMPLER_DIM_CUBE: + return 3; + case GLSL_SAMPLER_DIM_RECT: + case GLSL_SAMPLER_DIM_SUBPASS: + return 2; + case GLSL_SAMPLER_DIM_SUBPASS_MS: + return 3; + default: + break; + } + return 0; +} + +static LLVMValueRef adjust_sample_index_using_fmask(struct ac_llvm_context *ctx, + LLVMValueRef coord_x, LLVMValueRef coord_y, + LLVMValueRef coord_z, + LLVMValueRef sample_index, + LLVMValueRef fmask_desc_ptr) +{ + unsigned sample_chan = coord_z ? 3 : 2; + LLVMValueRef addr[4] = {coord_x, coord_y, coord_z}; + addr[sample_chan] = sample_index; + + ac_apply_fmask_to_sample(ctx, fmask_desc_ptr, addr, coord_z != NULL); + return addr[sample_chan]; +} + +static nir_deref_instr *get_image_deref(const nir_intrinsic_instr *instr) +{ + assert(instr->src[0].is_ssa); + return nir_instr_as_deref(instr->src[0].ssa->parent_instr); +} + +static LLVMValueRef get_image_descriptor(struct ac_nir_context *ctx, + const nir_intrinsic_instr *instr, + enum ac_descriptor_type desc_type, + bool write) +{ + nir_deref_instr *deref_instr = + instr->src[0].ssa->parent_instr->type == nir_instr_type_deref ? 
+ nir_instr_as_deref(instr->src[0].ssa->parent_instr) : NULL; + + return get_sampler_desc(ctx, deref_instr, desc_type, &instr->instr, true, write); +} + +static void get_image_coords(struct ac_nir_context *ctx, + const nir_intrinsic_instr *instr, + struct ac_image_args *args, + enum glsl_sampler_dim dim, + bool is_array) +{ + LLVMValueRef src0 = get_src(ctx, instr->src[1]); + LLVMValueRef masks[] = { + LLVMConstInt(ctx->ac.i32, 0, false), LLVMConstInt(ctx->ac.i32, 1, false), + LLVMConstInt(ctx->ac.i32, 2, false), LLVMConstInt(ctx->ac.i32, 3, false), + }; + LLVMValueRef sample_index = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[2]), 0); + + int count; + ASSERTED bool add_frag_pos = (dim == GLSL_SAMPLER_DIM_SUBPASS || + dim == GLSL_SAMPLER_DIM_SUBPASS_MS); + bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || + dim == GLSL_SAMPLER_DIM_SUBPASS_MS); + bool gfx9_1d = ctx->ac.chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D; + assert(!add_frag_pos && "Input attachments should be lowered by this point."); + count = image_type_to_components_count(dim, is_array); + + if (is_ms && (instr->intrinsic == nir_intrinsic_image_deref_load || + instr->intrinsic == nir_intrinsic_bindless_image_load)) { + LLVMValueRef fmask_load_address[3]; + + fmask_load_address[0] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[0], ""); + fmask_load_address[1] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[1], ""); + if (is_array) + fmask_load_address[2] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[2], ""); + else + fmask_load_address[2] = NULL; + + sample_index = adjust_sample_index_using_fmask(&ctx->ac, + fmask_load_address[0], + fmask_load_address[1], + fmask_load_address[2], + sample_index, + get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), + AC_DESC_FMASK, &instr->instr, true, false)); + } + if (count == 1 && !gfx9_1d) { + if (instr->src[1].ssa->num_components) + args->coords[0] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[0], ""); + else + args->coords[0] = src0; + } else { + int chan; + if (is_ms) + count--; + for (chan = 0; chan < count; ++chan) { + args->coords[chan] = ac_llvm_extract_elem(&ctx->ac, src0, chan); + } + + if (gfx9_1d) { + if (is_array) { + args->coords[2] = args->coords[1]; + args->coords[1] = ctx->ac.i32_0; + } else + args->coords[1] = ctx->ac.i32_0; + count++; + } + if (ctx->ac.chip_class == GFX9 && + dim == GLSL_SAMPLER_DIM_2D && + !is_array) { + /* The hw can't bind a slice of a 3D image as a 2D + * image, because it ignores BASE_ARRAY if the target + * is 3D. The workaround is to read BASE_ARRAY and set + * it as the 3rd address operand for all 2D images. 
+ */ + LLVMValueRef first_layer, const5, mask; + + const5 = LLVMConstInt(ctx->ac.i32, 5, 0); + mask = LLVMConstInt(ctx->ac.i32, S_008F24_BASE_ARRAY(~0), 0); + first_layer = LLVMBuildExtractElement(ctx->ac.builder, args->resource, const5, ""); + first_layer = LLVMBuildAnd(ctx->ac.builder, first_layer, mask, ""); + + args->coords[count] = first_layer; + count++; + } + + + if (is_ms) { + args->coords[count] = sample_index; + count++; + } + } +} + +static LLVMValueRef get_image_buffer_descriptor(struct ac_nir_context *ctx, + const nir_intrinsic_instr *instr, + bool write, bool atomic) +{ + LLVMValueRef rsrc = get_image_descriptor(ctx, instr, AC_DESC_BUFFER, write); + if (ctx->ac.chip_class == GFX9 && LLVM_VERSION_MAJOR < 9 && atomic) { + LLVMValueRef elem_count = LLVMBuildExtractElement(ctx->ac.builder, rsrc, LLVMConstInt(ctx->ac.i32, 2, 0), ""); + LLVMValueRef stride = LLVMBuildExtractElement(ctx->ac.builder, rsrc, LLVMConstInt(ctx->ac.i32, 1, 0), ""); + stride = LLVMBuildLShr(ctx->ac.builder, stride, LLVMConstInt(ctx->ac.i32, 16, 0), ""); + + LLVMValueRef new_elem_count = LLVMBuildSelect(ctx->ac.builder, + LLVMBuildICmp(ctx->ac.builder, LLVMIntUGT, elem_count, stride, ""), + elem_count, stride, ""); + + rsrc = LLVMBuildInsertElement(ctx->ac.builder, rsrc, new_elem_count, + LLVMConstInt(ctx->ac.i32, 2, 0), ""); + } + return rsrc; +} + +static LLVMValueRef visit_image_load(struct ac_nir_context *ctx, + const nir_intrinsic_instr *instr, + bool bindless) +{ + LLVMValueRef res; + + enum glsl_sampler_dim dim; + enum gl_access_qualifier access; + bool is_array; + if (bindless) { + dim = nir_intrinsic_image_dim(instr); + access = nir_intrinsic_access(instr); + is_array = nir_intrinsic_image_array(instr); + } else { + const nir_deref_instr *image_deref = get_image_deref(instr); + const struct glsl_type *type = image_deref->type; + const nir_variable *var = nir_deref_instr_get_variable(image_deref); + dim = glsl_get_sampler_dim(type); + access = var->data.image.access; + is_array = glsl_sampler_type_is_array(type); + } + + struct ac_image_args args = {}; + + args.cache_policy = get_cache_policy(ctx, access, false, false); + + if (dim == GLSL_SAMPLER_DIM_BUF) { + unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa); + unsigned num_channels = util_last_bit(mask); + LLVMValueRef rsrc, vindex; + + rsrc = get_image_buffer_descriptor(ctx, instr, false, false); + vindex = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]), + ctx->ac.i32_0, ""); + + bool can_speculate = access & ACCESS_CAN_REORDER; + res = ac_build_buffer_load_format(&ctx->ac, rsrc, vindex, + ctx->ac.i32_0, num_channels, + args.cache_policy, + can_speculate); + res = ac_build_expand_to_vec4(&ctx->ac, res, num_channels); + + res = ac_trim_vector(&ctx->ac, res, instr->dest.ssa.num_components); + res = ac_to_integer(&ctx->ac, res); + } else { + args.opcode = ac_image_load; + args.resource = get_image_descriptor(ctx, instr, AC_DESC_IMAGE, false); + get_image_coords(ctx, instr, &args, dim, is_array); + args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array); + args.dmask = 15; + args.attributes = AC_FUNC_ATTR_READONLY; + + res = ac_build_image_opcode(&ctx->ac, &args); + } + return res; +} + +static void visit_image_store(struct ac_nir_context *ctx, + nir_intrinsic_instr *instr, + bool bindless) +{ + + + enum glsl_sampler_dim dim; + enum gl_access_qualifier access; + bool is_array; + if (bindless) { + dim = nir_intrinsic_image_dim(instr); + access = nir_intrinsic_access(instr); + is_array = 
nir_intrinsic_image_array(instr); + } else { + const nir_deref_instr *image_deref = get_image_deref(instr); + const struct glsl_type *type = image_deref->type; + const nir_variable *var = nir_deref_instr_get_variable(image_deref); + dim = glsl_get_sampler_dim(type); + access = var->data.image.access; + is_array = glsl_sampler_type_is_array(type); + } + + bool writeonly_memory = access & ACCESS_NON_READABLE; + struct ac_image_args args = {}; + + args.cache_policy = get_cache_policy(ctx, access, true, writeonly_memory); + + if (dim == GLSL_SAMPLER_DIM_BUF) { + LLVMValueRef rsrc = get_image_buffer_descriptor(ctx, instr, true, false); + LLVMValueRef src = ac_to_float(&ctx->ac, get_src(ctx, instr->src[3])); + unsigned src_channels = ac_get_llvm_num_components(src); + LLVMValueRef vindex; + + if (src_channels == 3) + src = ac_build_expand_to_vec4(&ctx->ac, src, 3); + + vindex = LLVMBuildExtractElement(ctx->ac.builder, + get_src(ctx, instr->src[1]), + ctx->ac.i32_0, ""); + + ac_build_buffer_store_format(&ctx->ac, rsrc, src, vindex, + ctx->ac.i32_0, src_channels, + args.cache_policy); + } else { + args.opcode = ac_image_store; + args.data[0] = ac_to_float(&ctx->ac, get_src(ctx, instr->src[3])); + args.resource = get_image_descriptor(ctx, instr, AC_DESC_IMAGE, true); + get_image_coords(ctx, instr, &args, dim, is_array); + args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array); + args.dmask = 15; + + ac_build_image_opcode(&ctx->ac, &args); + } + +} + +static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx, + const nir_intrinsic_instr *instr, + bool bindless) +{ + LLVMValueRef params[7]; + int param_count = 0; + + bool cmpswap = instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap || + instr->intrinsic == nir_intrinsic_bindless_image_atomic_comp_swap; + const char *atomic_name; + char intrinsic_name[64]; + enum ac_atomic_op atomic_subop; + ASSERTED int length; + + enum glsl_sampler_dim dim; + bool is_array; + if (bindless) { + if (instr->intrinsic == nir_intrinsic_bindless_image_atomic_imin || + instr->intrinsic == nir_intrinsic_bindless_image_atomic_umin || + instr->intrinsic == nir_intrinsic_bindless_image_atomic_imax || + instr->intrinsic == nir_intrinsic_bindless_image_atomic_umax) { + const GLenum format = nir_intrinsic_format(instr); + assert(format == GL_R32UI || format == GL_R32I); + } + dim = nir_intrinsic_image_dim(instr); + is_array = nir_intrinsic_image_array(instr); + } else { + const struct glsl_type *type = get_image_deref(instr)->type; + dim = glsl_get_sampler_dim(type); + is_array = glsl_sampler_type_is_array(type); + } + + switch (instr->intrinsic) { + case nir_intrinsic_bindless_image_atomic_add: + case nir_intrinsic_image_deref_atomic_add: + atomic_name = "add"; + atomic_subop = ac_atomic_add; + break; + case nir_intrinsic_bindless_image_atomic_imin: + case nir_intrinsic_image_deref_atomic_imin: + atomic_name = "smin"; + atomic_subop = ac_atomic_smin; + break; + case nir_intrinsic_bindless_image_atomic_umin: + case nir_intrinsic_image_deref_atomic_umin: + atomic_name = "umin"; + atomic_subop = ac_atomic_umin; + break; + case nir_intrinsic_bindless_image_atomic_imax: + case nir_intrinsic_image_deref_atomic_imax: + atomic_name = "smax"; + atomic_subop = ac_atomic_smax; + break; + case nir_intrinsic_bindless_image_atomic_umax: + case nir_intrinsic_image_deref_atomic_umax: + atomic_name = "umax"; + atomic_subop = ac_atomic_umax; + break; + case nir_intrinsic_bindless_image_atomic_and: + case nir_intrinsic_image_deref_atomic_and: + atomic_name = "and"; + 
atomic_subop = ac_atomic_and; + break; + case nir_intrinsic_bindless_image_atomic_or: + case nir_intrinsic_image_deref_atomic_or: + atomic_name = "or"; + atomic_subop = ac_atomic_or; + break; + case nir_intrinsic_bindless_image_atomic_xor: + case nir_intrinsic_image_deref_atomic_xor: + atomic_name = "xor"; + atomic_subop = ac_atomic_xor; + break; + case nir_intrinsic_bindless_image_atomic_exchange: + case nir_intrinsic_image_deref_atomic_exchange: + atomic_name = "swap"; + atomic_subop = ac_atomic_swap; + break; + case nir_intrinsic_bindless_image_atomic_comp_swap: + case nir_intrinsic_image_deref_atomic_comp_swap: + atomic_name = "cmpswap"; + atomic_subop = 0; /* not used */ + break; + case nir_intrinsic_bindless_image_atomic_inc_wrap: + case nir_intrinsic_image_deref_atomic_inc_wrap: { + atomic_name = "inc"; + atomic_subop = ac_atomic_inc_wrap; + /* ATOMIC_INC instruction does: + * value = (value + 1) % (data + 1) + * but we want: + * value = (value + 1) % data + * So replace 'data' by 'data - 1'. + */ + ctx->ssa_defs[instr->src[3].ssa->index] = + LLVMBuildSub(ctx->ac.builder, + ctx->ssa_defs[instr->src[3].ssa->index], + ctx->ac.i32_1, ""); + break; + } + case nir_intrinsic_bindless_image_atomic_dec_wrap: + case nir_intrinsic_image_deref_atomic_dec_wrap: + atomic_name = "dec"; + atomic_subop = ac_atomic_dec_wrap; + break; + default: + abort(); + } + + if (cmpswap) + params[param_count++] = get_src(ctx, instr->src[4]); + params[param_count++] = get_src(ctx, instr->src[3]); + + if (dim == GLSL_SAMPLER_DIM_BUF) { + params[param_count++] = get_image_buffer_descriptor(ctx, instr, true, true); + params[param_count++] = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]), + ctx->ac.i32_0, ""); /* vindex */ + params[param_count++] = ctx->ac.i32_0; /* voffset */ + if (LLVM_VERSION_MAJOR >= 9) { + /* XXX: The new raw/struct atomic intrinsics are buggy + * with LLVM 8, see r358579. + */ + params[param_count++] = ctx->ac.i32_0; /* soffset */ + params[param_count++] = ctx->ac.i32_0; /* slc */ + + length = snprintf(intrinsic_name, sizeof(intrinsic_name), + "llvm.amdgcn.struct.buffer.atomic.%s.i32", atomic_name); + } else { + params[param_count++] = ctx->ac.i1false; /* slc */ + + length = snprintf(intrinsic_name, sizeof(intrinsic_name), + "llvm.amdgcn.buffer.atomic.%s", atomic_name); + } + + assert(length < sizeof(intrinsic_name)); + return ac_build_intrinsic(&ctx->ac, intrinsic_name, ctx->ac.i32, + params, param_count, 0); + } else { + struct ac_image_args args = {}; + args.opcode = cmpswap ? 
ac_image_atomic_cmpswap : ac_image_atomic; + args.atomic = atomic_subop; + args.data[0] = params[0]; + if (cmpswap) + args.data[1] = params[1]; + args.resource = get_image_descriptor(ctx, instr, AC_DESC_IMAGE, true); + get_image_coords(ctx, instr, &args, dim, is_array); + args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array); + + return ac_build_image_opcode(&ctx->ac, &args); + } +} + +static LLVMValueRef visit_image_samples(struct ac_nir_context *ctx, + const nir_intrinsic_instr *instr) +{ + LLVMValueRef rsrc = get_image_descriptor(ctx, instr, AC_DESC_IMAGE, false); + + return ac_build_image_get_sample_count(&ctx->ac, rsrc); +} + +static LLVMValueRef visit_image_size(struct ac_nir_context *ctx, + const nir_intrinsic_instr *instr, + bool bindless) +{ + LLVMValueRef res; + + enum glsl_sampler_dim dim; + bool is_array; + if (bindless) { + dim = nir_intrinsic_image_dim(instr); + is_array = nir_intrinsic_image_array(instr); + } else { + const struct glsl_type *type = get_image_deref(instr)->type; + dim = glsl_get_sampler_dim(type); + is_array = glsl_sampler_type_is_array(type); + } + + if (dim == GLSL_SAMPLER_DIM_BUF) + return get_buffer_size(ctx, get_image_descriptor(ctx, instr, AC_DESC_BUFFER, false), true); + + struct ac_image_args args = { 0 }; + + args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array); + args.dmask = 0xf; + args.resource = get_image_descriptor(ctx, instr, AC_DESC_IMAGE, false); + args.opcode = ac_image_get_resinfo; + args.lod = ctx->ac.i32_0; + args.attributes = AC_FUNC_ATTR_READNONE; + + res = ac_build_image_opcode(&ctx->ac, &args); + + LLVMValueRef two = LLVMConstInt(ctx->ac.i32, 2, false); + + if (dim == GLSL_SAMPLER_DIM_CUBE && is_array) { + LLVMValueRef six = LLVMConstInt(ctx->ac.i32, 6, false); + LLVMValueRef z = LLVMBuildExtractElement(ctx->ac.builder, res, two, ""); + z = LLVMBuildSDiv(ctx->ac.builder, z, six, ""); + res = LLVMBuildInsertElement(ctx->ac.builder, res, z, two, ""); + } + if (ctx->ac.chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D && is_array) { + LLVMValueRef layers = LLVMBuildExtractElement(ctx->ac.builder, res, two, ""); + res = LLVMBuildInsertElement(ctx->ac.builder, res, layers, + ctx->ac.i32_1, ""); + + } + return res; +} + +static void emit_membar(struct ac_llvm_context *ac, + const nir_intrinsic_instr *instr) +{ + unsigned wait_flags = 0; + + switch (instr->intrinsic) { + case nir_intrinsic_memory_barrier: + case nir_intrinsic_group_memory_barrier: + wait_flags = AC_WAIT_LGKM | AC_WAIT_VLOAD | AC_WAIT_VSTORE; + break; + case nir_intrinsic_memory_barrier_atomic_counter: + case nir_intrinsic_memory_barrier_buffer: + case nir_intrinsic_memory_barrier_image: + wait_flags = AC_WAIT_VLOAD | AC_WAIT_VSTORE; + break; + case nir_intrinsic_memory_barrier_shared: + wait_flags = AC_WAIT_LGKM; + break; + default: + break; + } + + ac_build_waitcnt(ac, wait_flags); +} + +void ac_emit_barrier(struct ac_llvm_context *ac, gl_shader_stage stage) +{ + /* GFX6 only (thanks to a hw bug workaround): + * The real barrier instruction isn’t needed, because an entire patch + * always fits into a single wave. 
+ */ + if (ac->chip_class == GFX6 && stage == MESA_SHADER_TESS_CTRL) { + ac_build_waitcnt(ac, AC_WAIT_LGKM | AC_WAIT_VLOAD | AC_WAIT_VSTORE); + return; + } + ac_build_s_barrier(ac); +} + +static void emit_discard(struct ac_nir_context *ctx, + const nir_intrinsic_instr *instr) +{ + LLVMValueRef cond; + + if (instr->intrinsic == nir_intrinsic_discard_if) { + cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, + get_src(ctx, instr->src[0]), + ctx->ac.i32_0, ""); + } else { + assert(instr->intrinsic == nir_intrinsic_discard); + cond = ctx->ac.i1false; + } + + ctx->abi->emit_kill(ctx->abi, cond); +} + +static LLVMValueRef +visit_load_local_invocation_index(struct ac_nir_context *ctx) +{ + LLVMValueRef result; + LLVMValueRef thread_id = ac_get_thread_id(&ctx->ac); + result = LLVMBuildAnd(ctx->ac.builder, ctx->abi->tg_size, + LLVMConstInt(ctx->ac.i32, 0xfc0, false), ""); + + return LLVMBuildAdd(ctx->ac.builder, result, thread_id, ""); +} + +static LLVMValueRef +visit_load_subgroup_id(struct ac_nir_context *ctx) +{ + if (ctx->stage == MESA_SHADER_COMPUTE) { + LLVMValueRef result; + result = LLVMBuildAnd(ctx->ac.builder, ctx->abi->tg_size, + LLVMConstInt(ctx->ac.i32, 0xfc0, false), ""); + return LLVMBuildLShr(ctx->ac.builder, result, LLVMConstInt(ctx->ac.i32, 6, false), ""); + } else { + return LLVMConstInt(ctx->ac.i32, 0, false); + } +} + +static LLVMValueRef +visit_load_num_subgroups(struct ac_nir_context *ctx) +{ + if (ctx->stage == MESA_SHADER_COMPUTE) { + return LLVMBuildAnd(ctx->ac.builder, ctx->abi->tg_size, + LLVMConstInt(ctx->ac.i32, 0x3f, false), ""); + } else { + return LLVMConstInt(ctx->ac.i32, 1, false); + } +} + +static LLVMValueRef +visit_first_invocation(struct ac_nir_context *ctx) +{ + LLVMValueRef active_set = ac_build_ballot(&ctx->ac, ctx->ac.i32_1); + const char *intr = ctx->ac.wave_size == 32 ? "llvm.cttz.i32" : "llvm.cttz.i64"; + + /* The second argument is whether cttz(0) should be defined, but we do not care. 
*/ + LLVMValueRef args[] = {active_set, ctx->ac.i1false}; + LLVMValueRef result = ac_build_intrinsic(&ctx->ac, intr, + ctx->ac.iN_wavemask, args, 2, + AC_FUNC_ATTR_NOUNWIND | + AC_FUNC_ATTR_READNONE); + + return LLVMBuildTrunc(ctx->ac.builder, result, ctx->ac.i32, ""); +} + +static LLVMValueRef +visit_load_shared(struct ac_nir_context *ctx, + const nir_intrinsic_instr *instr) +{ + LLVMValueRef values[4], derived_ptr, index, ret; + + LLVMValueRef ptr = get_memory_ptr(ctx, instr->src[0]); + + for (int chan = 0; chan < instr->num_components; chan++) { + index = LLVMConstInt(ctx->ac.i32, chan, 0); + derived_ptr = LLVMBuildGEP(ctx->ac.builder, ptr, &index, 1, ""); + values[chan] = LLVMBuildLoad(ctx->ac.builder, derived_ptr, ""); + } + + ret = ac_build_gather_values(&ctx->ac, values, instr->num_components); + return LLVMBuildBitCast(ctx->ac.builder, ret, get_def_type(ctx, &instr->dest.ssa), ""); +} + +static void +visit_store_shared(struct ac_nir_context *ctx, + const nir_intrinsic_instr *instr) +{ + LLVMValueRef derived_ptr, data,index; + LLVMBuilderRef builder = ctx->ac.builder; + + LLVMValueRef ptr = get_memory_ptr(ctx, instr->src[1]); + LLVMValueRef src = get_src(ctx, instr->src[0]); + + int writemask = nir_intrinsic_write_mask(instr); + for (int chan = 0; chan < 4; chan++) { + if (!(writemask & (1 << chan))) { + continue; + } + data = ac_llvm_extract_elem(&ctx->ac, src, chan); + index = LLVMConstInt(ctx->ac.i32, chan, 0); + derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, ""); + LLVMBuildStore(builder, data, derived_ptr); + } +} + +static LLVMValueRef visit_var_atomic(struct ac_nir_context *ctx, + const nir_intrinsic_instr *instr, + LLVMValueRef ptr, int src_idx) +{ + LLVMValueRef result; + LLVMValueRef src = get_src(ctx, instr->src[src_idx]); + + const char *sync_scope = LLVM_VERSION_MAJOR >= 9 ? 
"workgroup-one-as" : "workgroup"; + + if (instr->intrinsic == nir_intrinsic_shared_atomic_comp_swap || + instr->intrinsic == nir_intrinsic_deref_atomic_comp_swap) { + LLVMValueRef src1 = get_src(ctx, instr->src[src_idx + 1]); + result = ac_build_atomic_cmp_xchg(&ctx->ac, ptr, src, src1, sync_scope); + result = LLVMBuildExtractValue(ctx->ac.builder, result, 0, ""); + } else { + LLVMAtomicRMWBinOp op; + switch (instr->intrinsic) { + case nir_intrinsic_shared_atomic_add: + case nir_intrinsic_deref_atomic_add: + op = LLVMAtomicRMWBinOpAdd; + break; + case nir_intrinsic_shared_atomic_umin: + case nir_intrinsic_deref_atomic_umin: + op = LLVMAtomicRMWBinOpUMin; + break; + case nir_intrinsic_shared_atomic_umax: + case nir_intrinsic_deref_atomic_umax: + op = LLVMAtomicRMWBinOpUMax; + break; + case nir_intrinsic_shared_atomic_imin: + case nir_intrinsic_deref_atomic_imin: + op = LLVMAtomicRMWBinOpMin; + break; + case nir_intrinsic_shared_atomic_imax: + case nir_intrinsic_deref_atomic_imax: + op = LLVMAtomicRMWBinOpMax; + break; + case nir_intrinsic_shared_atomic_and: + case nir_intrinsic_deref_atomic_and: + op = LLVMAtomicRMWBinOpAnd; + break; + case nir_intrinsic_shared_atomic_or: + case nir_intrinsic_deref_atomic_or: + op = LLVMAtomicRMWBinOpOr; + break; + case nir_intrinsic_shared_atomic_xor: + case nir_intrinsic_deref_atomic_xor: + op = LLVMAtomicRMWBinOpXor; + break; + case nir_intrinsic_shared_atomic_exchange: + case nir_intrinsic_deref_atomic_exchange: + op = LLVMAtomicRMWBinOpXchg; + break; + default: + return NULL; + } + + result = ac_build_atomic_rmw(&ctx->ac, op, ptr, ac_to_integer(&ctx->ac, src), sync_scope); + } + return result; +} + +static LLVMValueRef load_sample_pos(struct ac_nir_context *ctx) +{ + LLVMValueRef values[2]; + LLVMValueRef pos[2]; + + pos[0] = ac_to_float(&ctx->ac, ctx->abi->frag_pos[0]); + pos[1] = ac_to_float(&ctx->ac, ctx->abi->frag_pos[1]); + + values[0] = ac_build_fract(&ctx->ac, pos[0], 32); + values[1] = ac_build_fract(&ctx->ac, pos[1], 32); + return ac_build_gather_values(&ctx->ac, values, 2); +} + +static LLVMValueRef lookup_interp_param(struct ac_nir_context *ctx, + enum glsl_interp_mode interp, unsigned location) +{ + switch (interp) { + case INTERP_MODE_FLAT: + default: + return NULL; + case INTERP_MODE_SMOOTH: + case INTERP_MODE_NONE: + if (location == INTERP_CENTER) + return ctx->abi->persp_center; + else if (location == INTERP_CENTROID) + return ctx->abi->persp_centroid; + else if (location == INTERP_SAMPLE) + return ctx->abi->persp_sample; + break; + case INTERP_MODE_NOPERSPECTIVE: + if (location == INTERP_CENTER) + return ctx->abi->linear_center; + else if (location == INTERP_CENTROID) + return ctx->abi->linear_centroid; + else if (location == INTERP_SAMPLE) + return ctx->abi->linear_sample; + break; + } + return NULL; +} + +static LLVMValueRef barycentric_center(struct ac_nir_context *ctx, + unsigned mode) +{ + LLVMValueRef interp_param = lookup_interp_param(ctx, mode, INTERP_CENTER); + return LLVMBuildBitCast(ctx->ac.builder, interp_param, ctx->ac.v2i32, ""); +} + +static LLVMValueRef barycentric_offset(struct ac_nir_context *ctx, + unsigned mode, + LLVMValueRef offset) +{ + LLVMValueRef interp_param = lookup_interp_param(ctx, mode, INTERP_CENTER); + LLVMValueRef src_c0 = ac_to_float(&ctx->ac, LLVMBuildExtractElement(ctx->ac.builder, offset, ctx->ac.i32_0, "")); + LLVMValueRef src_c1 = ac_to_float(&ctx->ac, LLVMBuildExtractElement(ctx->ac.builder, offset, ctx->ac.i32_1, "")); + + LLVMValueRef ij_out[2]; + LLVMValueRef ddxy_out = 
ac_build_ddxy_interp(&ctx->ac, interp_param); + + /* + * take the I then J parameters, and the DDX/Y for it, and + * calculate the IJ inputs for the interpolator. + * temp1 = ddx * offset/sample.x + I; + * interp_param.I = ddy * offset/sample.y + temp1; + * temp1 = ddx * offset/sample.x + J; + * interp_param.J = ddy * offset/sample.y + temp1; + */ + for (unsigned i = 0; i < 2; i++) { + LLVMValueRef ix_ll = LLVMConstInt(ctx->ac.i32, i, false); + LLVMValueRef iy_ll = LLVMConstInt(ctx->ac.i32, i + 2, false); + LLVMValueRef ddx_el = LLVMBuildExtractElement(ctx->ac.builder, + ddxy_out, ix_ll, ""); + LLVMValueRef ddy_el = LLVMBuildExtractElement(ctx->ac.builder, + ddxy_out, iy_ll, ""); + LLVMValueRef interp_el = LLVMBuildExtractElement(ctx->ac.builder, + interp_param, ix_ll, ""); + LLVMValueRef temp1, temp2; + + interp_el = LLVMBuildBitCast(ctx->ac.builder, interp_el, + ctx->ac.f32, ""); + + temp1 = ac_build_fmad(&ctx->ac, ddx_el, src_c0, interp_el); + temp2 = ac_build_fmad(&ctx->ac, ddy_el, src_c1, temp1); + + ij_out[i] = LLVMBuildBitCast(ctx->ac.builder, + temp2, ctx->ac.i32, ""); + } + interp_param = ac_build_gather_values(&ctx->ac, ij_out, 2); + return LLVMBuildBitCast(ctx->ac.builder, interp_param, ctx->ac.v2i32, ""); +} + +static LLVMValueRef barycentric_centroid(struct ac_nir_context *ctx, + unsigned mode) +{ + LLVMValueRef interp_param = lookup_interp_param(ctx, mode, INTERP_CENTROID); + return LLVMBuildBitCast(ctx->ac.builder, interp_param, ctx->ac.v2i32, ""); +} + +static LLVMValueRef barycentric_at_sample(struct ac_nir_context *ctx, + unsigned mode, + LLVMValueRef sample_id) +{ + if (ctx->abi->interp_at_sample_force_center) + return barycentric_center(ctx, mode); + + LLVMValueRef halfval = LLVMConstReal(ctx->ac.f32, 0.5f); + + /* fetch sample ID */ + LLVMValueRef sample_pos = ctx->abi->load_sample_position(ctx->abi, sample_id); + + LLVMValueRef src_c0 = LLVMBuildExtractElement(ctx->ac.builder, sample_pos, ctx->ac.i32_0, ""); + src_c0 = LLVMBuildFSub(ctx->ac.builder, src_c0, halfval, ""); + LLVMValueRef src_c1 = LLVMBuildExtractElement(ctx->ac.builder, sample_pos, ctx->ac.i32_1, ""); + src_c1 = LLVMBuildFSub(ctx->ac.builder, src_c1, halfval, ""); + LLVMValueRef coords[] = { src_c0, src_c1 }; + LLVMValueRef offset = ac_build_gather_values(&ctx->ac, coords, 2); + + return barycentric_offset(ctx, mode, offset); +} + + +static LLVMValueRef barycentric_sample(struct ac_nir_context *ctx, + unsigned mode) +{ + LLVMValueRef interp_param = lookup_interp_param(ctx, mode, INTERP_SAMPLE); + return LLVMBuildBitCast(ctx->ac.builder, interp_param, ctx->ac.v2i32, ""); +} + +static LLVMValueRef load_interpolated_input(struct ac_nir_context *ctx, + LLVMValueRef interp_param, + unsigned index, unsigned comp_start, + unsigned num_components, + unsigned bitsize) +{ + LLVMValueRef attr_number = LLVMConstInt(ctx->ac.i32, index, false); + + interp_param = LLVMBuildBitCast(ctx->ac.builder, + interp_param, ctx->ac.v2f32, ""); + LLVMValueRef i = LLVMBuildExtractElement( + ctx->ac.builder, interp_param, ctx->ac.i32_0, ""); + LLVMValueRef j = LLVMBuildExtractElement( + ctx->ac.builder, interp_param, ctx->ac.i32_1, ""); + + LLVMValueRef values[4]; + assert(bitsize == 16 || bitsize == 32); + for (unsigned comp = 0; comp < num_components; comp++) { + LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, comp_start + comp, false); + if (bitsize == 16) { + values[comp] = ac_build_fs_interp_f16(&ctx->ac, llvm_chan, attr_number, + ctx->abi->prim_mask, i, j); + } else { + values[comp] = ac_build_fs_interp(&ctx->ac, 
llvm_chan, attr_number, + ctx->abi->prim_mask, i, j); + } + } + + return ac_to_integer(&ctx->ac, ac_build_gather_values(&ctx->ac, values, num_components)); +} + +static LLVMValueRef load_flat_input(struct ac_nir_context *ctx, + unsigned index, unsigned comp_start, + unsigned num_components, + unsigned bit_size) +{ + LLVMValueRef attr_number = LLVMConstInt(ctx->ac.i32, index, false); + + LLVMValueRef values[8]; + + /* Each component of a 64-bit value takes up two GL-level channels. */ + unsigned channels = + bit_size == 64 ? num_components * 2 : num_components; + + for (unsigned chan = 0; chan < channels; chan++) { + if (comp_start + chan > 4) + attr_number = LLVMConstInt(ctx->ac.i32, index + 1, false); + LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, (comp_start + chan) % 4, false); + values[chan] = ac_build_fs_interp_mov(&ctx->ac, + LLVMConstInt(ctx->ac.i32, 2, false), + llvm_chan, + attr_number, + ctx->abi->prim_mask); + values[chan] = LLVMBuildBitCast(ctx->ac.builder, values[chan], ctx->ac.i32, ""); + values[chan] = LLVMBuildTruncOrBitCast(ctx->ac.builder, values[chan], + bit_size == 16 ? ctx->ac.i16 : ctx->ac.i32, ""); + } + + LLVMValueRef result = ac_build_gather_values(&ctx->ac, values, channels); + if (bit_size == 64) { + LLVMTypeRef type = num_components == 1 ? ctx->ac.i64 : + LLVMVectorType(ctx->ac.i64, num_components); + result = LLVMBuildBitCast(ctx->ac.builder, result, type, ""); + } + return result; +} + +static void visit_intrinsic(struct ac_nir_context *ctx, + nir_intrinsic_instr *instr) +{ + LLVMValueRef result = NULL; + + switch (instr->intrinsic) { + case nir_intrinsic_ballot: + result = ac_build_ballot(&ctx->ac, get_src(ctx, instr->src[0])); + if (ctx->ac.ballot_mask_bits > ctx->ac.wave_size) + result = LLVMBuildZExt(ctx->ac.builder, result, ctx->ac.iN_ballotmask, ""); + break; + case nir_intrinsic_read_invocation: + result = ac_build_readlane(&ctx->ac, get_src(ctx, instr->src[0]), + get_src(ctx, instr->src[1])); + break; + case nir_intrinsic_read_first_invocation: + result = ac_build_readlane(&ctx->ac, get_src(ctx, instr->src[0]), NULL); + break; + case nir_intrinsic_load_subgroup_invocation: + result = ac_get_thread_id(&ctx->ac); + break; + case nir_intrinsic_load_work_group_id: { + LLVMValueRef values[3]; + + for (int i = 0; i < 3; i++) { + values[i] = ctx->abi->workgroup_ids[i] ? 
+ ctx->abi->workgroup_ids[i] : ctx->ac.i32_0; + } + + result = ac_build_gather_values(&ctx->ac, values, 3); + break; + } + case nir_intrinsic_load_base_vertex: + case nir_intrinsic_load_first_vertex: + result = ctx->abi->load_base_vertex(ctx->abi); + break; + case nir_intrinsic_load_local_group_size: + result = ctx->abi->load_local_group_size(ctx->abi); + break; + case nir_intrinsic_load_vertex_id: + result = LLVMBuildAdd(ctx->ac.builder, ctx->abi->vertex_id, + ctx->abi->base_vertex, ""); + break; + case nir_intrinsic_load_vertex_id_zero_base: { + result = ctx->abi->vertex_id; + break; + } + case nir_intrinsic_load_local_invocation_id: { + result = ctx->abi->local_invocation_ids; + break; + } + case nir_intrinsic_load_base_instance: + result = ctx->abi->start_instance; + break; + case nir_intrinsic_load_draw_id: + result = ctx->abi->draw_id; + break; + case nir_intrinsic_load_view_index: + result = ctx->abi->view_index; + break; + case nir_intrinsic_load_invocation_id: + if (ctx->stage == MESA_SHADER_TESS_CTRL) { + result = ac_unpack_param(&ctx->ac, ctx->abi->tcs_rel_ids, 8, 5); + } else { + if (ctx->ac.chip_class >= GFX10) { + result = LLVMBuildAnd(ctx->ac.builder, + ctx->abi->gs_invocation_id, + LLVMConstInt(ctx->ac.i32, 127, 0), ""); + } else { + result = ctx->abi->gs_invocation_id; + } + } + break; + case nir_intrinsic_load_primitive_id: + if (ctx->stage == MESA_SHADER_GEOMETRY) { + result = ctx->abi->gs_prim_id; + } else if (ctx->stage == MESA_SHADER_TESS_CTRL) { + result = ctx->abi->tcs_patch_id; + } else if (ctx->stage == MESA_SHADER_TESS_EVAL) { + result = ctx->abi->tes_patch_id; + } else + fprintf(stderr, "Unknown primitive id intrinsic: %d", ctx->stage); + break; + case nir_intrinsic_load_sample_id: + result = ac_unpack_param(&ctx->ac, ctx->abi->ancillary, 8, 4); + break; + case nir_intrinsic_load_sample_pos: + result = load_sample_pos(ctx); + break; + case nir_intrinsic_load_sample_mask_in: + result = ctx->abi->load_sample_mask_in(ctx->abi); + break; + case nir_intrinsic_load_frag_coord: { + LLVMValueRef values[4] = { + ctx->abi->frag_pos[0], + ctx->abi->frag_pos[1], + ctx->abi->frag_pos[2], + ac_build_fdiv(&ctx->ac, ctx->ac.f32_1, ctx->abi->frag_pos[3]) + }; + result = ac_to_integer(&ctx->ac, + ac_build_gather_values(&ctx->ac, values, 4)); + break; + } + case nir_intrinsic_load_layer_id: + result = ctx->abi->inputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)]; + break; + case nir_intrinsic_load_front_face: + result = ctx->abi->front_face; + break; + case nir_intrinsic_load_helper_invocation: + result = ac_build_load_helper_invocation(&ctx->ac); + break; + case nir_intrinsic_load_color0: + result = ctx->abi->color0; + break; + case nir_intrinsic_load_color1: + result = ctx->abi->color1; + break; + case nir_intrinsic_load_user_data_amd: + assert(LLVMTypeOf(ctx->abi->user_data) == ctx->ac.v4i32); + result = ctx->abi->user_data; + break; + case nir_intrinsic_load_instance_id: + result = ctx->abi->instance_id; + break; + case nir_intrinsic_load_num_work_groups: + result = ctx->abi->num_work_groups; + break; + case nir_intrinsic_load_local_invocation_index: + result = visit_load_local_invocation_index(ctx); + break; + case nir_intrinsic_load_subgroup_id: + result = visit_load_subgroup_id(ctx); + break; + case nir_intrinsic_load_num_subgroups: + result = visit_load_num_subgroups(ctx); + break; + case nir_intrinsic_first_invocation: + result = visit_first_invocation(ctx); + break; + case nir_intrinsic_load_push_constant: + result = visit_load_push_constant(ctx, instr); + break; + case 
nir_intrinsic_vulkan_resource_index: { + LLVMValueRef index = get_src(ctx, instr->src[0]); + unsigned desc_set = nir_intrinsic_desc_set(instr); + unsigned binding = nir_intrinsic_binding(instr); + + result = ctx->abi->load_resource(ctx->abi, index, desc_set, + binding); + break; + } + case nir_intrinsic_vulkan_resource_reindex: + result = visit_vulkan_resource_reindex(ctx, instr); + break; + case nir_intrinsic_store_ssbo: + visit_store_ssbo(ctx, instr); + break; + case nir_intrinsic_load_ssbo: + result = visit_load_buffer(ctx, instr); + break; + case nir_intrinsic_ssbo_atomic_add: + case nir_intrinsic_ssbo_atomic_imin: + case nir_intrinsic_ssbo_atomic_umin: + case nir_intrinsic_ssbo_atomic_imax: + case nir_intrinsic_ssbo_atomic_umax: + case nir_intrinsic_ssbo_atomic_and: + case nir_intrinsic_ssbo_atomic_or: + case nir_intrinsic_ssbo_atomic_xor: + case nir_intrinsic_ssbo_atomic_exchange: + case nir_intrinsic_ssbo_atomic_comp_swap: + result = visit_atomic_ssbo(ctx, instr); + break; + case nir_intrinsic_load_ubo: + result = visit_load_ubo_buffer(ctx, instr); + break; + case nir_intrinsic_get_buffer_size: + result = visit_get_buffer_size(ctx, instr); + break; + case nir_intrinsic_load_deref: + result = visit_load_var(ctx, instr); + break; + case nir_intrinsic_store_deref: + visit_store_var(ctx, instr); + break; + case nir_intrinsic_load_shared: + result = visit_load_shared(ctx, instr); + break; + case nir_intrinsic_store_shared: + visit_store_shared(ctx, instr); + break; + case nir_intrinsic_bindless_image_samples: + case nir_intrinsic_image_deref_samples: + result = visit_image_samples(ctx, instr); + break; + case nir_intrinsic_bindless_image_load: + result = visit_image_load(ctx, instr, true); + break; + case nir_intrinsic_image_deref_load: + result = visit_image_load(ctx, instr, false); + break; + case nir_intrinsic_bindless_image_store: + visit_image_store(ctx, instr, true); + break; + case nir_intrinsic_image_deref_store: + visit_image_store(ctx, instr, false); + break; + case nir_intrinsic_bindless_image_atomic_add: + case nir_intrinsic_bindless_image_atomic_imin: + case nir_intrinsic_bindless_image_atomic_umin: + case nir_intrinsic_bindless_image_atomic_imax: + case nir_intrinsic_bindless_image_atomic_umax: + case nir_intrinsic_bindless_image_atomic_and: + case nir_intrinsic_bindless_image_atomic_or: + case nir_intrinsic_bindless_image_atomic_xor: + case nir_intrinsic_bindless_image_atomic_exchange: + case nir_intrinsic_bindless_image_atomic_comp_swap: + case nir_intrinsic_bindless_image_atomic_inc_wrap: + case nir_intrinsic_bindless_image_atomic_dec_wrap: + result = visit_image_atomic(ctx, instr, true); + break; + case nir_intrinsic_image_deref_atomic_add: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_umin: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_umax: + case nir_intrinsic_image_deref_atomic_and: + case nir_intrinsic_image_deref_atomic_or: + case nir_intrinsic_image_deref_atomic_xor: + case nir_intrinsic_image_deref_atomic_exchange: + case nir_intrinsic_image_deref_atomic_comp_swap: + case nir_intrinsic_image_deref_atomic_inc_wrap: + case nir_intrinsic_image_deref_atomic_dec_wrap: + result = visit_image_atomic(ctx, instr, false); + break; + case nir_intrinsic_bindless_image_size: + result = visit_image_size(ctx, instr, true); + break; + case nir_intrinsic_image_deref_size: + result = visit_image_size(ctx, instr, false); + break; + case nir_intrinsic_shader_clock: + result = 
ac_build_shader_clock(&ctx->ac); + break; + case nir_intrinsic_discard: + case nir_intrinsic_discard_if: + emit_discard(ctx, instr); + break; + case nir_intrinsic_memory_barrier: + case nir_intrinsic_group_memory_barrier: + case nir_intrinsic_memory_barrier_atomic_counter: + case nir_intrinsic_memory_barrier_buffer: + case nir_intrinsic_memory_barrier_image: + case nir_intrinsic_memory_barrier_shared: + emit_membar(&ctx->ac, instr); + break; + case nir_intrinsic_barrier: + ac_emit_barrier(&ctx->ac, ctx->stage); + break; + case nir_intrinsic_shared_atomic_add: + case nir_intrinsic_shared_atomic_imin: + case nir_intrinsic_shared_atomic_umin: + case nir_intrinsic_shared_atomic_imax: + case nir_intrinsic_shared_atomic_umax: + case nir_intrinsic_shared_atomic_and: + case nir_intrinsic_shared_atomic_or: + case nir_intrinsic_shared_atomic_xor: + case nir_intrinsic_shared_atomic_exchange: + case nir_intrinsic_shared_atomic_comp_swap: { + LLVMValueRef ptr = get_memory_ptr(ctx, instr->src[0]); + result = visit_var_atomic(ctx, instr, ptr, 1); + break; + } + case nir_intrinsic_deref_atomic_add: + case nir_intrinsic_deref_atomic_imin: + case nir_intrinsic_deref_atomic_umin: + case nir_intrinsic_deref_atomic_imax: + case nir_intrinsic_deref_atomic_umax: + case nir_intrinsic_deref_atomic_and: + case nir_intrinsic_deref_atomic_or: + case nir_intrinsic_deref_atomic_xor: + case nir_intrinsic_deref_atomic_exchange: + case nir_intrinsic_deref_atomic_comp_swap: { + LLVMValueRef ptr = get_src(ctx, instr->src[0]); + result = visit_var_atomic(ctx, instr, ptr, 1); + break; + } + case nir_intrinsic_load_barycentric_pixel: + result = barycentric_center(ctx, nir_intrinsic_interp_mode(instr)); + break; + case nir_intrinsic_load_barycentric_centroid: + result = barycentric_centroid(ctx, nir_intrinsic_interp_mode(instr)); + break; + case nir_intrinsic_load_barycentric_sample: + result = barycentric_sample(ctx, nir_intrinsic_interp_mode(instr)); + break; + case nir_intrinsic_load_barycentric_at_offset: { + LLVMValueRef offset = ac_to_float(&ctx->ac, get_src(ctx, instr->src[0])); + result = barycentric_offset(ctx, nir_intrinsic_interp_mode(instr), offset); + break; + } + case nir_intrinsic_load_barycentric_at_sample: { + LLVMValueRef sample_id = get_src(ctx, instr->src[0]); + result = barycentric_at_sample(ctx, nir_intrinsic_interp_mode(instr), sample_id); + break; + } + case nir_intrinsic_load_interpolated_input: { + /* We assume any indirect loads have been lowered away */ + ASSERTED nir_const_value *offset = nir_src_as_const_value(instr->src[1]); + assert(offset); + assert(offset[0].i32 == 0); + + LLVMValueRef interp_param = get_src(ctx, instr->src[0]); + unsigned index = nir_intrinsic_base(instr); + unsigned component = nir_intrinsic_component(instr); + result = load_interpolated_input(ctx, interp_param, index, + component, + instr->dest.ssa.num_components, + instr->dest.ssa.bit_size); + break; + } + case nir_intrinsic_load_input: { + /* We only lower inputs for fragment shaders ATM */ + ASSERTED nir_const_value *offset = nir_src_as_const_value(instr->src[0]); + assert(offset); + assert(offset[0].i32 == 0); + + unsigned index = nir_intrinsic_base(instr); + unsigned component = nir_intrinsic_component(instr); + result = load_flat_input(ctx, index, component, + instr->dest.ssa.num_components, + instr->dest.ssa.bit_size); + break; + } + case nir_intrinsic_emit_vertex: + ctx->abi->emit_vertex(ctx->abi, nir_intrinsic_stream_id(instr), ctx->abi->outputs); + break; + case nir_intrinsic_end_primitive: + 
ctx->abi->emit_primitive(ctx->abi, nir_intrinsic_stream_id(instr)); + break; + case nir_intrinsic_load_tess_coord: + result = ctx->abi->load_tess_coord(ctx->abi); + break; + case nir_intrinsic_load_tess_level_outer: + result = ctx->abi->load_tess_level(ctx->abi, VARYING_SLOT_TESS_LEVEL_OUTER, false); + break; + case nir_intrinsic_load_tess_level_inner: + result = ctx->abi->load_tess_level(ctx->abi, VARYING_SLOT_TESS_LEVEL_INNER, false); + break; + case nir_intrinsic_load_tess_level_outer_default: + result = ctx->abi->load_tess_level(ctx->abi, VARYING_SLOT_TESS_LEVEL_OUTER, true); + break; + case nir_intrinsic_load_tess_level_inner_default: + result = ctx->abi->load_tess_level(ctx->abi, VARYING_SLOT_TESS_LEVEL_INNER, true); + break; + case nir_intrinsic_load_patch_vertices_in: + result = ctx->abi->load_patch_vertices_in(ctx->abi); + break; + case nir_intrinsic_vote_all: { + LLVMValueRef tmp = ac_build_vote_all(&ctx->ac, get_src(ctx, instr->src[0])); + result = LLVMBuildSExt(ctx->ac.builder, tmp, ctx->ac.i32, ""); + break; + } + case nir_intrinsic_vote_any: { + LLVMValueRef tmp = ac_build_vote_any(&ctx->ac, get_src(ctx, instr->src[0])); + result = LLVMBuildSExt(ctx->ac.builder, tmp, ctx->ac.i32, ""); + break; + } + case nir_intrinsic_shuffle: + result = ac_build_shuffle(&ctx->ac, get_src(ctx, instr->src[0]), + get_src(ctx, instr->src[1])); + break; + case nir_intrinsic_reduce: + result = ac_build_reduce(&ctx->ac, + get_src(ctx, instr->src[0]), + instr->const_index[0], + instr->const_index[1]); + break; + case nir_intrinsic_inclusive_scan: + result = ac_build_inclusive_scan(&ctx->ac, + get_src(ctx, instr->src[0]), + instr->const_index[0]); + break; + case nir_intrinsic_exclusive_scan: + result = ac_build_exclusive_scan(&ctx->ac, + get_src(ctx, instr->src[0]), + instr->const_index[0]); + break; + case nir_intrinsic_quad_broadcast: { + unsigned lane = nir_src_as_uint(instr->src[1]); + result = ac_build_quad_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), + lane, lane, lane, lane); + break; + } + case nir_intrinsic_quad_swap_horizontal: + result = ac_build_quad_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), 1, 0, 3 ,2); + break; + case nir_intrinsic_quad_swap_vertical: + result = ac_build_quad_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), 2, 3, 0 ,1); + break; + case nir_intrinsic_quad_swap_diagonal: + result = ac_build_quad_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), 3, 2, 1 ,0); + break; + case nir_intrinsic_quad_swizzle_amd: { + uint32_t mask = nir_intrinsic_swizzle_mask(instr); + result = ac_build_quad_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), + mask & 0x3, (mask >> 2) & 0x3, + (mask >> 4) & 0x3, (mask >> 6) & 0x3); + break; + } + case nir_intrinsic_masked_swizzle_amd: { + uint32_t mask = nir_intrinsic_swizzle_mask(instr); + result = ac_build_ds_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), mask); + break; + } + case nir_intrinsic_write_invocation_amd: + result = ac_build_writelane(&ctx->ac, get_src(ctx, instr->src[0]), + get_src(ctx, instr->src[1]), + get_src(ctx, instr->src[2])); + break; + case nir_intrinsic_mbcnt_amd: + result = ac_build_mbcnt(&ctx->ac, get_src(ctx, instr->src[0])); + break; + case nir_intrinsic_load_scratch: { + LLVMValueRef offset = get_src(ctx, instr->src[0]); + LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->scratch, + offset); + LLVMTypeRef comp_type = + LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.bit_size); + LLVMTypeRef vec_type = + instr->dest.ssa.num_components == 1 ? 
comp_type : + LLVMVectorType(comp_type, instr->dest.ssa.num_components); + unsigned addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr)); + ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, + LLVMPointerType(vec_type, addr_space), ""); + result = LLVMBuildLoad(ctx->ac.builder, ptr, ""); + break; + } + case nir_intrinsic_store_scratch: { + LLVMValueRef offset = get_src(ctx, instr->src[1]); + LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->scratch, + offset); + LLVMTypeRef comp_type = + LLVMIntTypeInContext(ctx->ac.context, instr->src[0].ssa->bit_size); + unsigned addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr)); + ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, + LLVMPointerType(comp_type, addr_space), ""); + LLVMValueRef src = get_src(ctx, instr->src[0]); + unsigned wrmask = nir_intrinsic_write_mask(instr); + while (wrmask) { + int start, count; + u_bit_scan_consecutive_range(&wrmask, &start, &count); + + LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, start, false); + LLVMValueRef offset_ptr = LLVMBuildGEP(ctx->ac.builder, ptr, &offset, 1, ""); + LLVMTypeRef vec_type = + count == 1 ? comp_type : LLVMVectorType(comp_type, count); + offset_ptr = LLVMBuildBitCast(ctx->ac.builder, + offset_ptr, + LLVMPointerType(vec_type, addr_space), + ""); + LLVMValueRef offset_src = + ac_extract_components(&ctx->ac, src, start, count); + LLVMBuildStore(ctx->ac.builder, offset_src, offset_ptr); + } + break; + } + case nir_intrinsic_load_constant: { + LLVMValueRef offset = get_src(ctx, instr->src[0]); + LLVMValueRef base = LLVMConstInt(ctx->ac.i32, + nir_intrinsic_base(instr), + false); + offset = LLVMBuildAdd(ctx->ac.builder, offset, base, ""); + LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->constant_data, + offset); + LLVMTypeRef comp_type = + LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.bit_size); + LLVMTypeRef vec_type = + instr->dest.ssa.num_components == 1 ? 
comp_type : + LLVMVectorType(comp_type, instr->dest.ssa.num_components); + unsigned addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr)); + ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, + LLVMPointerType(vec_type, addr_space), ""); + result = LLVMBuildLoad(ctx->ac.builder, ptr, ""); + break; + } + default: + fprintf(stderr, "Unknown intrinsic: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + break; + } + if (result) { + ctx->ssa_defs[instr->dest.ssa.index] = result; + } +} + +static LLVMValueRef get_bindless_index_from_uniform(struct ac_nir_context *ctx, + unsigned base_index, + unsigned constant_index, + LLVMValueRef dynamic_index) +{ + LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, base_index * 4, 0); + LLVMValueRef index = LLVMBuildAdd(ctx->ac.builder, dynamic_index, + LLVMConstInt(ctx->ac.i32, constant_index, 0), ""); + + /* Bindless uniforms are 64bit so multiple index by 8 */ + index = LLVMBuildMul(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i32, 8, 0), ""); + offset = LLVMBuildAdd(ctx->ac.builder, offset, index, ""); + + LLVMValueRef ubo_index = ctx->abi->load_ubo(ctx->abi, ctx->ac.i32_0); + + LLVMValueRef ret = ac_build_buffer_load(&ctx->ac, ubo_index, 1, NULL, offset, + NULL, 0, 0, true, true); + + return LLVMBuildBitCast(ctx->ac.builder, ret, ctx->ac.i32, ""); +} + +static LLVMValueRef get_sampler_desc(struct ac_nir_context *ctx, + nir_deref_instr *deref_instr, + enum ac_descriptor_type desc_type, + const nir_instr *instr, + bool image, bool write) +{ + LLVMValueRef index = NULL; + unsigned constant_index = 0; + unsigned descriptor_set; + unsigned base_index; + bool bindless = false; + + if (!deref_instr) { + descriptor_set = 0; + if (image) { + nir_intrinsic_instr *img_instr = nir_instr_as_intrinsic(instr); + base_index = 0; + bindless = true; + index = get_src(ctx, img_instr->src[0]); + } else { + nir_tex_instr *tex_instr = nir_instr_as_tex(instr); + int sampSrcIdx = nir_tex_instr_src_index(tex_instr, + nir_tex_src_sampler_handle); + if (sampSrcIdx != -1) { + base_index = 0; + bindless = true; + index = get_src(ctx, tex_instr->src[sampSrcIdx].src); + } else { + assert(tex_instr && !image); + base_index = tex_instr->sampler_index; + } + } + } else { + while(deref_instr->deref_type != nir_deref_type_var) { + if (deref_instr->deref_type == nir_deref_type_array) { + unsigned array_size = glsl_get_aoa_size(deref_instr->type); + if (!array_size) + array_size = 1; + + if (nir_src_is_const(deref_instr->arr.index)) { + constant_index += array_size * nir_src_as_uint(deref_instr->arr.index); + } else { + LLVMValueRef indirect = get_src(ctx, deref_instr->arr.index); + + indirect = LLVMBuildMul(ctx->ac.builder, indirect, + LLVMConstInt(ctx->ac.i32, array_size, false), ""); + + if (!index) + index = indirect; + else + index = LLVMBuildAdd(ctx->ac.builder, index, indirect, ""); + } + + deref_instr = nir_src_as_deref(deref_instr->parent); + } else if (deref_instr->deref_type == nir_deref_type_struct) { + unsigned sidx = deref_instr->strct.index; + deref_instr = nir_src_as_deref(deref_instr->parent); + constant_index += glsl_get_struct_location_offset(deref_instr->type, sidx); + } else { + unreachable("Unsupported deref type"); + } + } + descriptor_set = deref_instr->var->data.descriptor_set; + + if (deref_instr->var->data.bindless) { + /* For now just assert on unhandled variable types */ + assert(deref_instr->var->data.mode == nir_var_uniform); + + base_index = deref_instr->var->data.driver_location; + bindless = true; + + index = index ? 
index : ctx->ac.i32_0; + index = get_bindless_index_from_uniform(ctx, base_index, + constant_index, index); + } else + base_index = deref_instr->var->data.binding; + } + + return ctx->abi->load_sampler_desc(ctx->abi, + descriptor_set, + base_index, + constant_index, index, + desc_type, image, write, bindless); +} + +/* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL. + * + * GFX6-GFX7: + * If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic + * filtering manually. The driver sets img7 to a mask clearing + * MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do: + * s_and_b32 samp0, samp0, img7 + * + * GFX8: + * The ANISO_OVERRIDE sampler field enables this fix in TA. + */ +static LLVMValueRef sici_fix_sampler_aniso(struct ac_nir_context *ctx, + LLVMValueRef res, LLVMValueRef samp) +{ + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef img7, samp0; + + if (ctx->ac.chip_class >= GFX8) + return samp; + + img7 = LLVMBuildExtractElement(builder, res, + LLVMConstInt(ctx->ac.i32, 7, 0), ""); + samp0 = LLVMBuildExtractElement(builder, samp, + LLVMConstInt(ctx->ac.i32, 0, 0), ""); + samp0 = LLVMBuildAnd(builder, samp0, img7, ""); + return LLVMBuildInsertElement(builder, samp, samp0, + LLVMConstInt(ctx->ac.i32, 0, 0), ""); +} + +static void tex_fetch_ptrs(struct ac_nir_context *ctx, + nir_tex_instr *instr, + LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, + LLVMValueRef *fmask_ptr) +{ + nir_deref_instr *texture_deref_instr = NULL; + nir_deref_instr *sampler_deref_instr = NULL; + int plane = -1; + + for (unsigned i = 0; i < instr->num_srcs; i++) { + switch (instr->src[i].src_type) { + case nir_tex_src_texture_deref: + texture_deref_instr = nir_src_as_deref(instr->src[i].src); + break; + case nir_tex_src_sampler_deref: + sampler_deref_instr = nir_src_as_deref(instr->src[i].src); + break; + case nir_tex_src_plane: + plane = nir_src_as_int(instr->src[i].src); + break; + default: + break; + } + } + + if (!sampler_deref_instr) + sampler_deref_instr = texture_deref_instr; + + enum ac_descriptor_type main_descriptor = instr->sampler_dim == GLSL_SAMPLER_DIM_BUF ? 
AC_DESC_BUFFER : AC_DESC_IMAGE; + + if (plane >= 0) { + assert(instr->op != nir_texop_txf_ms && + instr->op != nir_texop_samples_identical); + assert(instr->sampler_dim != GLSL_SAMPLER_DIM_BUF); + + main_descriptor = AC_DESC_PLANE_0 + plane; + } + + *res_ptr = get_sampler_desc(ctx, texture_deref_instr, main_descriptor, &instr->instr, false, false); + + if (samp_ptr) { + *samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, AC_DESC_SAMPLER, &instr->instr, false, false); + if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT) + *samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr); + } + if (fmask_ptr && (instr->op == nir_texop_txf_ms || + instr->op == nir_texop_samples_identical)) + *fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, AC_DESC_FMASK, &instr->instr, false, false); +} + +static LLVMValueRef apply_round_slice(struct ac_llvm_context *ctx, + LLVMValueRef coord) +{ + coord = ac_to_float(ctx, coord); + coord = ac_build_round(ctx, coord); + coord = ac_to_integer(ctx, coord); + return coord; +} + +static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr) +{ + LLVMValueRef result = NULL; + struct ac_image_args args = { 0 }; + LLVMValueRef fmask_ptr = NULL, sample_index = NULL; + LLVMValueRef ddx = NULL, ddy = NULL; + unsigned offset_src = 0; + + tex_fetch_ptrs(ctx, instr, &args.resource, &args.sampler, &fmask_ptr); + + for (unsigned i = 0; i < instr->num_srcs; i++) { + switch (instr->src[i].src_type) { + case nir_tex_src_coord: { + LLVMValueRef coord = get_src(ctx, instr->src[i].src); + for (unsigned chan = 0; chan < instr->coord_components; ++chan) + args.coords[chan] = ac_llvm_extract_elem(&ctx->ac, coord, chan); + break; + } + case nir_tex_src_projector: + break; + case nir_tex_src_comparator: + if (instr->is_shadow) { + args.compare = get_src(ctx, instr->src[i].src); + args.compare = ac_to_float(&ctx->ac, args.compare); + } + break; + case nir_tex_src_offset: + args.offset = get_src(ctx, instr->src[i].src); + offset_src = i; + break; + case nir_tex_src_bias: + if (instr->op == nir_texop_txb) + args.bias = get_src(ctx, instr->src[i].src); + break; + case nir_tex_src_lod: { + if (nir_src_is_const(instr->src[i].src) && nir_src_as_uint(instr->src[i].src) == 0) + args.level_zero = true; + else + args.lod = get_src(ctx, instr->src[i].src); + break; + } + case nir_tex_src_ms_index: + sample_index = get_src(ctx, instr->src[i].src); + break; + case nir_tex_src_ms_mcs: + break; + case nir_tex_src_ddx: + ddx = get_src(ctx, instr->src[i].src); + break; + case nir_tex_src_ddy: + ddy = get_src(ctx, instr->src[i].src); + break; + case nir_tex_src_texture_offset: + case nir_tex_src_sampler_offset: + case nir_tex_src_plane: + default: + break; + } + } + + if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) { + result = get_buffer_size(ctx, args.resource, true); + goto write_result; + } + + if (instr->op == nir_texop_texture_samples) { + LLVMValueRef res, samples, is_msaa; + res = LLVMBuildBitCast(ctx->ac.builder, args.resource, ctx->ac.v8i32, ""); + samples = LLVMBuildExtractElement(ctx->ac.builder, res, + LLVMConstInt(ctx->ac.i32, 3, false), ""); + is_msaa = LLVMBuildLShr(ctx->ac.builder, samples, + LLVMConstInt(ctx->ac.i32, 28, false), ""); + is_msaa = LLVMBuildAnd(ctx->ac.builder, is_msaa, + LLVMConstInt(ctx->ac.i32, 0xe, false), ""); + is_msaa = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, is_msaa, + LLVMConstInt(ctx->ac.i32, 0xe, false), ""); + + samples = LLVMBuildLShr(ctx->ac.builder, samples, + LLVMConstInt(ctx->ac.i32, 16, false), ""); + samples = 
LLVMBuildAnd(ctx->ac.builder, samples, + LLVMConstInt(ctx->ac.i32, 0xf, false), ""); + samples = LLVMBuildShl(ctx->ac.builder, ctx->ac.i32_1, + samples, ""); + samples = LLVMBuildSelect(ctx->ac.builder, is_msaa, samples, + ctx->ac.i32_1, ""); + result = samples; + goto write_result; + } + + if (args.offset && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms) { + LLVMValueRef offset[3], pack; + for (unsigned chan = 0; chan < 3; ++chan) + offset[chan] = ctx->ac.i32_0; + + unsigned num_components = ac_get_llvm_num_components(args.offset); + for (unsigned chan = 0; chan < num_components; chan++) { + offset[chan] = ac_llvm_extract_elem(&ctx->ac, args.offset, chan); + offset[chan] = LLVMBuildAnd(ctx->ac.builder, offset[chan], + LLVMConstInt(ctx->ac.i32, 0x3f, false), ""); + if (chan) + offset[chan] = LLVMBuildShl(ctx->ac.builder, offset[chan], + LLVMConstInt(ctx->ac.i32, chan * 8, false), ""); + } + pack = LLVMBuildOr(ctx->ac.builder, offset[0], offset[1], ""); + pack = LLVMBuildOr(ctx->ac.builder, pack, offset[2], ""); + args.offset = pack; + } + + /* Section 8.23.1 (Depth Texture Comparison Mode) of the + * OpenGL 4.5 spec says: + * + * "If the texture’s internal format indicates a fixed-point + * depth texture, then D_t and D_ref are clamped to the + * range [0, 1]; otherwise no clamping is performed." + * + * TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT, + * so the depth comparison value isn't clamped for Z16 and + * Z24 anymore. Do it manually here for GFX8-9; GFX10 has + * an explicitly clamped 32-bit float format. + */ + if (args.compare && + ctx->ac.chip_class >= GFX8 && + ctx->ac.chip_class <= GFX9 && + ctx->abi->clamp_shadow_reference) { + LLVMValueRef upgraded, clamped; + + upgraded = LLVMBuildExtractElement(ctx->ac.builder, args.sampler, + LLVMConstInt(ctx->ac.i32, 3, false), ""); + upgraded = LLVMBuildLShr(ctx->ac.builder, upgraded, + LLVMConstInt(ctx->ac.i32, 29, false), ""); + upgraded = LLVMBuildTrunc(ctx->ac.builder, upgraded, ctx->ac.i1, ""); + clamped = ac_build_clamp(&ctx->ac, args.compare); + args.compare = LLVMBuildSelect(ctx->ac.builder, upgraded, clamped, + args.compare, ""); + } + + /* pack derivatives */ + if (ddx || ddy) { + int num_src_deriv_channels, num_dest_deriv_channels; + switch (instr->sampler_dim) { + case GLSL_SAMPLER_DIM_3D: + case GLSL_SAMPLER_DIM_CUBE: + num_src_deriv_channels = 3; + num_dest_deriv_channels = 3; + break; + case GLSL_SAMPLER_DIM_2D: + default: + num_src_deriv_channels = 2; + num_dest_deriv_channels = 2; + break; + case GLSL_SAMPLER_DIM_1D: + num_src_deriv_channels = 1; + if (ctx->ac.chip_class == GFX9) { + num_dest_deriv_channels = 2; + } else { + num_dest_deriv_channels = 1; + } + break; + } + + for (unsigned i = 0; i < num_src_deriv_channels; i++) { + args.derivs[i] = ac_to_float(&ctx->ac, + ac_llvm_extract_elem(&ctx->ac, ddx, i)); + args.derivs[num_dest_deriv_channels + i] = ac_to_float(&ctx->ac, + ac_llvm_extract_elem(&ctx->ac, ddy, i)); + } + for (unsigned i = num_src_deriv_channels; i < num_dest_deriv_channels; i++) { + args.derivs[i] = ctx->ac.f32_0; + args.derivs[num_dest_deriv_channels + i] = ctx->ac.f32_0; + } + } + + if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && args.coords[0]) { + for (unsigned chan = 0; chan < instr->coord_components; chan++) + args.coords[chan] = ac_to_float(&ctx->ac, args.coords[chan]); + if (instr->coord_components == 3) + args.coords[3] = LLVMGetUndef(ctx->ac.f32); + ac_prepare_cube_coords(&ctx->ac, + instr->op == nir_texop_txd, instr->is_array, + instr->op == nir_texop_lod, 
args.coords, args.derivs); + } + + /* Texture coordinates fixups */ + if (instr->coord_components > 1 && + instr->sampler_dim == GLSL_SAMPLER_DIM_1D && + instr->is_array && + instr->op != nir_texop_txf) { + args.coords[1] = apply_round_slice(&ctx->ac, args.coords[1]); + } + + if (instr->coord_components > 2 && + (instr->sampler_dim == GLSL_SAMPLER_DIM_2D || + instr->sampler_dim == GLSL_SAMPLER_DIM_MS || + instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS || + instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) && + instr->is_array && + instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms) { + args.coords[2] = apply_round_slice(&ctx->ac, args.coords[2]); + } + + if (ctx->ac.chip_class == GFX9 && + instr->sampler_dim == GLSL_SAMPLER_DIM_1D && + instr->op != nir_texop_lod) { + LLVMValueRef filler; + if (instr->op == nir_texop_txf) + filler = ctx->ac.i32_0; + else + filler = LLVMConstReal(ctx->ac.f32, 0.5); + + if (instr->is_array) + args.coords[2] = args.coords[1]; + args.coords[1] = filler; + } + + /* Pack sample index */ + if (instr->op == nir_texop_txf_ms && sample_index) + args.coords[instr->coord_components] = sample_index; + + if (instr->op == nir_texop_samples_identical) { + struct ac_image_args txf_args = { 0 }; + memcpy(txf_args.coords, args.coords, sizeof(txf_args.coords)); + + txf_args.dmask = 0xf; + txf_args.resource = fmask_ptr; + txf_args.dim = instr->is_array ? ac_image_2darray : ac_image_2d; + result = build_tex_intrinsic(ctx, instr, &txf_args); + + result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, ""); + result = emit_int_cmp(&ctx->ac, LLVMIntEQ, result, ctx->ac.i32_0); + goto write_result; + } + + if ((instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS || + instr->sampler_dim == GLSL_SAMPLER_DIM_MS) && + instr->op != nir_texop_txs) { + unsigned sample_chan = instr->is_array ? 3 : 2; + args.coords[sample_chan] = adjust_sample_index_using_fmask( + &ctx->ac, args.coords[0], args.coords[1], + instr->is_array ? args.coords[2] : NULL, + args.coords[sample_chan], fmask_ptr); + } + + if (args.offset && (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms)) { + int num_offsets = instr->src[offset_src].src.ssa->num_components; + num_offsets = MIN2(num_offsets, instr->coord_components); + for (unsigned i = 0; i < num_offsets; ++i) { + args.coords[i] = LLVMBuildAdd( + ctx->ac.builder, args.coords[i], + LLVMConstInt(ctx->ac.i32, nir_src_comp_as_uint(instr->src[offset_src].src, i), false), ""); + } + args.offset = NULL; + } + + /* DMASK was repurposed for GATHER4. 4 components are always + * returned and DMASK works like a swizzle - it selects + * the component to fetch. The only valid DMASK values are + * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns + * (red,red,red,red) etc.) The ISA document doesn't mention + * this. 
+ */ + args.dmask = 0xf; + if (instr->op == nir_texop_tg4) { + if (instr->is_shadow) + args.dmask = 1; + else + args.dmask = 1 << instr->component; + } + + if (instr->sampler_dim != GLSL_SAMPLER_DIM_BUF) { + args.dim = ac_get_sampler_dim(ctx->ac.chip_class, instr->sampler_dim, instr->is_array); + args.unorm = instr->sampler_dim == GLSL_SAMPLER_DIM_RECT; + } + result = build_tex_intrinsic(ctx, instr, &args); + + if (instr->op == nir_texop_query_levels) + result = LLVMBuildExtractElement(ctx->ac.builder, result, LLVMConstInt(ctx->ac.i32, 3, false), ""); + else if (instr->is_shadow && instr->is_new_style_shadow && + instr->op != nir_texop_txs && instr->op != nir_texop_lod && + instr->op != nir_texop_tg4) + result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, ""); + else if (instr->op == nir_texop_txs && + instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && + instr->is_array) { + LLVMValueRef two = LLVMConstInt(ctx->ac.i32, 2, false); + LLVMValueRef six = LLVMConstInt(ctx->ac.i32, 6, false); + LLVMValueRef z = LLVMBuildExtractElement(ctx->ac.builder, result, two, ""); + z = LLVMBuildSDiv(ctx->ac.builder, z, six, ""); + result = LLVMBuildInsertElement(ctx->ac.builder, result, z, two, ""); + } else if (ctx->ac.chip_class == GFX9 && + instr->op == nir_texop_txs && + instr->sampler_dim == GLSL_SAMPLER_DIM_1D && + instr->is_array) { + LLVMValueRef two = LLVMConstInt(ctx->ac.i32, 2, false); + LLVMValueRef layers = LLVMBuildExtractElement(ctx->ac.builder, result, two, ""); + result = LLVMBuildInsertElement(ctx->ac.builder, result, layers, + ctx->ac.i32_1, ""); + } else if (instr->dest.ssa.num_components != 4) + result = ac_trim_vector(&ctx->ac, result, instr->dest.ssa.num_components); + +write_result: + if (result) { + assert(instr->dest.is_ssa); + result = ac_to_integer(&ctx->ac, result); + ctx->ssa_defs[instr->dest.ssa.index] = result; + } +} + + +static void visit_phi(struct ac_nir_context *ctx, nir_phi_instr *instr) +{ + LLVMTypeRef type = get_def_type(ctx, &instr->dest.ssa); + LLVMValueRef result = LLVMBuildPhi(ctx->ac.builder, type, ""); + + ctx->ssa_defs[instr->dest.ssa.index] = result; + _mesa_hash_table_insert(ctx->phis, instr, result); +} + +static void visit_post_phi(struct ac_nir_context *ctx, + nir_phi_instr *instr, + LLVMValueRef llvm_phi) +{ + nir_foreach_phi_src(src, instr) { + LLVMBasicBlockRef block = get_block(ctx, src->pred); + LLVMValueRef llvm_src = get_src(ctx, src->src); + + LLVMAddIncoming(llvm_phi, &llvm_src, &block, 1); + } +} + +static void phi_post_pass(struct ac_nir_context *ctx) +{ + hash_table_foreach(ctx->phis, entry) { + visit_post_phi(ctx, (nir_phi_instr*)entry->key, + (LLVMValueRef)entry->data); + } +} + + +static void visit_ssa_undef(struct ac_nir_context *ctx, + const nir_ssa_undef_instr *instr) +{ + unsigned num_components = instr->def.num_components; + LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, instr->def.bit_size); + LLVMValueRef undef; + + if (num_components == 1) + undef = LLVMGetUndef(type); + else { + undef = LLVMGetUndef(LLVMVectorType(type, num_components)); + } + ctx->ssa_defs[instr->def.index] = undef; +} + +static void visit_jump(struct ac_llvm_context *ctx, + const nir_jump_instr *instr) +{ + switch (instr->type) { + case nir_jump_break: + ac_build_break(ctx); + break; + case nir_jump_continue: + ac_build_continue(ctx); + break; + default: + fprintf(stderr, "Unknown NIR jump instr: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + abort(); + } +} + +static LLVMTypeRef +glsl_base_to_llvm_type(struct 
ac_llvm_context *ac, + enum glsl_base_type type) +{ + switch (type) { + case GLSL_TYPE_INT: + case GLSL_TYPE_UINT: + case GLSL_TYPE_BOOL: + case GLSL_TYPE_SUBROUTINE: + return ac->i32; + case GLSL_TYPE_INT8: + case GLSL_TYPE_UINT8: + return ac->i8; + case GLSL_TYPE_INT16: + case GLSL_TYPE_UINT16: + return ac->i16; + case GLSL_TYPE_FLOAT: + return ac->f32; + case GLSL_TYPE_FLOAT16: + return ac->f16; + case GLSL_TYPE_INT64: + case GLSL_TYPE_UINT64: + return ac->i64; + case GLSL_TYPE_DOUBLE: + return ac->f64; + default: + unreachable("unknown GLSL type"); + } +} + +static LLVMTypeRef +glsl_to_llvm_type(struct ac_llvm_context *ac, + const struct glsl_type *type) +{ + if (glsl_type_is_scalar(type)) { + return glsl_base_to_llvm_type(ac, glsl_get_base_type(type)); + } + + if (glsl_type_is_vector(type)) { + return LLVMVectorType( + glsl_base_to_llvm_type(ac, glsl_get_base_type(type)), + glsl_get_vector_elements(type)); + } + + if (glsl_type_is_matrix(type)) { + return LLVMArrayType( + glsl_to_llvm_type(ac, glsl_get_column_type(type)), + glsl_get_matrix_columns(type)); + } + + if (glsl_type_is_array(type)) { + return LLVMArrayType( + glsl_to_llvm_type(ac, glsl_get_array_element(type)), + glsl_get_length(type)); + } + + assert(glsl_type_is_struct_or_ifc(type)); + + LLVMTypeRef member_types[glsl_get_length(type)]; + + for (unsigned i = 0; i < glsl_get_length(type); i++) { + member_types[i] = + glsl_to_llvm_type(ac, + glsl_get_struct_field(type, i)); + } + + return LLVMStructTypeInContext(ac->context, member_types, + glsl_get_length(type), false); +} + +static void visit_deref(struct ac_nir_context *ctx, + nir_deref_instr *instr) +{ + if (instr->mode != nir_var_mem_shared && + instr->mode != nir_var_mem_global) + return; + + LLVMValueRef result = NULL; + switch(instr->deref_type) { + case nir_deref_type_var: { + struct hash_entry *entry = _mesa_hash_table_search(ctx->vars, instr->var); + result = entry->data; + break; + } + case nir_deref_type_struct: + if (instr->mode == nir_var_mem_global) { + nir_deref_instr *parent = nir_deref_instr_parent(instr); + uint64_t offset = glsl_get_struct_field_offset(parent->type, + instr->strct.index); + result = ac_build_gep_ptr(&ctx->ac, get_src(ctx, instr->parent), + LLVMConstInt(ctx->ac.i32, offset, 0)); + } else { + result = ac_build_gep0(&ctx->ac, get_src(ctx, instr->parent), + LLVMConstInt(ctx->ac.i32, instr->strct.index, 0)); + } + break; + case nir_deref_type_array: + if (instr->mode == nir_var_mem_global) { + nir_deref_instr *parent = nir_deref_instr_parent(instr); + unsigned stride = glsl_get_explicit_stride(parent->type); + + if ((glsl_type_is_matrix(parent->type) && + glsl_matrix_type_is_row_major(parent->type)) || + (glsl_type_is_vector(parent->type) && stride == 0)) + stride = type_scalar_size_bytes(parent->type); + + assert(stride > 0); + LLVMValueRef index = get_src(ctx, instr->arr.index); + if (LLVMTypeOf(index) != ctx->ac.i64) + index = LLVMBuildZExt(ctx->ac.builder, index, ctx->ac.i64, ""); + + LLVMValueRef offset = LLVMBuildMul(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i64, stride, 0), ""); + + result = ac_build_gep_ptr(&ctx->ac, get_src(ctx, instr->parent), offset); + } else { + result = ac_build_gep0(&ctx->ac, get_src(ctx, instr->parent), + get_src(ctx, instr->arr.index)); + } + break; + case nir_deref_type_ptr_as_array: + if (instr->mode == nir_var_mem_global) { + unsigned stride = nir_deref_instr_ptr_as_array_stride(instr); + + LLVMValueRef index = get_src(ctx, instr->arr.index); + if (LLVMTypeOf(index) != ctx->ac.i64) + index = 
LLVMBuildZExt(ctx->ac.builder, index, ctx->ac.i64, ""); + + LLVMValueRef offset = LLVMBuildMul(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i64, stride, 0), ""); + + result = ac_build_gep_ptr(&ctx->ac, get_src(ctx, instr->parent), offset); + } else { + result = ac_build_gep_ptr(&ctx->ac, get_src(ctx, instr->parent), + get_src(ctx, instr->arr.index)); + } + break; + case nir_deref_type_cast: { + result = get_src(ctx, instr->parent); + + /* We can't use the structs from LLVM because the shader + * specifies its own offsets. */ + LLVMTypeRef pointee_type = ctx->ac.i8; + if (instr->mode == nir_var_mem_shared) + pointee_type = glsl_to_llvm_type(&ctx->ac, instr->type); + + unsigned address_space; + + switch(instr->mode) { + case nir_var_mem_shared: + address_space = AC_ADDR_SPACE_LDS; + break; + case nir_var_mem_global: + address_space = AC_ADDR_SPACE_GLOBAL; + break; + default: + unreachable("Unhandled address space"); + } + + LLVMTypeRef type = LLVMPointerType(pointee_type, address_space); + + if (LLVMTypeOf(result) != type) { + if (LLVMGetTypeKind(LLVMTypeOf(result)) == LLVMVectorTypeKind) { + result = LLVMBuildBitCast(ctx->ac.builder, result, + type, ""); + } else { + result = LLVMBuildIntToPtr(ctx->ac.builder, result, + type, ""); + } + } + break; + } + default: + unreachable("Unhandled deref_instr deref type"); + } + + ctx->ssa_defs[instr->dest.ssa.index] = result; +} + +static void visit_cf_list(struct ac_nir_context *ctx, + struct exec_list *list); + +static void visit_block(struct ac_nir_context *ctx, nir_block *block) +{ + nir_foreach_instr(instr, block) + { + switch (instr->type) { + case nir_instr_type_alu: + visit_alu(ctx, nir_instr_as_alu(instr)); + break; + case nir_instr_type_load_const: + visit_load_const(ctx, nir_instr_as_load_const(instr)); + break; + case nir_instr_type_intrinsic: + visit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); + break; + case nir_instr_type_tex: + visit_tex(ctx, nir_instr_as_tex(instr)); + break; + case nir_instr_type_phi: + visit_phi(ctx, nir_instr_as_phi(instr)); + break; + case nir_instr_type_ssa_undef: + visit_ssa_undef(ctx, nir_instr_as_ssa_undef(instr)); + break; + case nir_instr_type_jump: + visit_jump(&ctx->ac, nir_instr_as_jump(instr)); + break; + case nir_instr_type_deref: + visit_deref(ctx, nir_instr_as_deref(instr)); + break; + default: + fprintf(stderr, "Unknown NIR instr type: "); + nir_print_instr(instr, stderr); + fprintf(stderr, "\n"); + abort(); + } + } + + _mesa_hash_table_insert(ctx->defs, block, + LLVMGetInsertBlock(ctx->ac.builder)); +} + +static void visit_if(struct ac_nir_context *ctx, nir_if *if_stmt) +{ + LLVMValueRef value = get_src(ctx, if_stmt->condition); + + nir_block *then_block = + (nir_block *) exec_list_get_head(&if_stmt->then_list); + + ac_build_uif(&ctx->ac, value, then_block->index); + + visit_cf_list(ctx, &if_stmt->then_list); + + if (!exec_list_is_empty(&if_stmt->else_list)) { + nir_block *else_block = + (nir_block *) exec_list_get_head(&if_stmt->else_list); + + ac_build_else(&ctx->ac, else_block->index); + visit_cf_list(ctx, &if_stmt->else_list); + } + + ac_build_endif(&ctx->ac, then_block->index); +} + +static void visit_loop(struct ac_nir_context *ctx, nir_loop *loop) +{ + nir_block *first_loop_block = + (nir_block *) exec_list_get_head(&loop->body); + + ac_build_bgnloop(&ctx->ac, first_loop_block->index); + + visit_cf_list(ctx, &loop->body); + + ac_build_endloop(&ctx->ac, first_loop_block->index); +} + +static void visit_cf_list(struct ac_nir_context *ctx, + struct exec_list *list) +{ + 
foreach_list_typed(nir_cf_node, node, node, list) + { + switch (node->type) { + case nir_cf_node_block: + visit_block(ctx, nir_cf_node_as_block(node)); + break; + + case nir_cf_node_if: + visit_if(ctx, nir_cf_node_as_if(node)); + break; + + case nir_cf_node_loop: + visit_loop(ctx, nir_cf_node_as_loop(node)); + break; + + default: + assert(0); + } + } +} + +void +ac_handle_shader_output_decl(struct ac_llvm_context *ctx, + struct ac_shader_abi *abi, + struct nir_shader *nir, + struct nir_variable *variable, + gl_shader_stage stage) +{ + unsigned output_loc = variable->data.driver_location / 4; + unsigned attrib_count = glsl_count_attribute_slots(variable->type, false); + + /* tess ctrl has it's own load/store paths for outputs */ + if (stage == MESA_SHADER_TESS_CTRL) + return; + + if (stage == MESA_SHADER_VERTEX || + stage == MESA_SHADER_TESS_EVAL || + stage == MESA_SHADER_GEOMETRY) { + int idx = variable->data.location + variable->data.index; + if (idx == VARYING_SLOT_CLIP_DIST0) { + int length = nir->info.clip_distance_array_size + + nir->info.cull_distance_array_size; + + if (length > 4) + attrib_count = 2; + else + attrib_count = 1; + } + } + + bool is_16bit = glsl_type_is_16bit(glsl_without_array(variable->type)); + LLVMTypeRef type = is_16bit ? ctx->f16 : ctx->f32; + for (unsigned i = 0; i < attrib_count; ++i) { + for (unsigned chan = 0; chan < 4; chan++) { + abi->outputs[ac_llvm_reg_index_soa(output_loc + i, chan)] = + ac_build_alloca_undef(ctx, type, ""); + } + } +} + +static void +setup_locals(struct ac_nir_context *ctx, + struct nir_function *func) +{ + int i, j; + ctx->num_locals = 0; + nir_foreach_variable(variable, &func->impl->locals) { + unsigned attrib_count = glsl_count_attribute_slots(variable->type, false); + variable->data.driver_location = ctx->num_locals * 4; + variable->data.location_frac = 0; + ctx->num_locals += attrib_count; + } + ctx->locals = malloc(4 * ctx->num_locals * sizeof(LLVMValueRef)); + if (!ctx->locals) + return; + + for (i = 0; i < ctx->num_locals; i++) { + for (j = 0; j < 4; j++) { + ctx->locals[i * 4 + j] = + ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, "temp"); + } + } +} + +static void +setup_scratch(struct ac_nir_context *ctx, + struct nir_shader *shader) +{ + if (shader->scratch_size == 0) + return; + + ctx->scratch = ac_build_alloca_undef(&ctx->ac, + LLVMArrayType(ctx->ac.i8, shader->scratch_size), + "scratch"); +} + +static void +setup_constant_data(struct ac_nir_context *ctx, + struct nir_shader *shader) +{ + if (!shader->constant_data) + return; + + LLVMValueRef data = + LLVMConstStringInContext(ctx->ac.context, + shader->constant_data, + shader->constant_data_size, + true); + LLVMTypeRef type = LLVMArrayType(ctx->ac.i8, shader->constant_data_size); + + /* We want to put the constant data in the CONST address space so that + * we can use scalar loads. However, LLVM versions before 10 put these + * variables in the same section as the code, which is unacceptable + * for RadeonSI as it needs to relocate all the data sections after + * the code sections. See https://reviews.llvm.org/D65813. + */ + unsigned address_space = + LLVM_VERSION_MAJOR < 10 ? 
AC_ADDR_SPACE_GLOBAL : AC_ADDR_SPACE_CONST; + + LLVMValueRef global = + LLVMAddGlobalInAddressSpace(ctx->ac.module, type, + "const_data", + address_space); + + LLVMSetInitializer(global, data); + LLVMSetGlobalConstant(global, true); + LLVMSetVisibility(global, LLVMHiddenVisibility); + ctx->constant_data = global; +} +
+static void +setup_shared(struct ac_nir_context *ctx, + struct nir_shader *nir) +{ + nir_foreach_variable(variable, &nir->shared) { + LLVMValueRef shared = + LLVMAddGlobalInAddressSpace( + ctx->ac.module, glsl_to_llvm_type(&ctx->ac, variable->type), + variable->name ? variable->name : "", + AC_ADDR_SPACE_LDS); + _mesa_hash_table_insert(ctx->vars, variable, shared); + } +} +
+void ac_nir_translate(struct ac_llvm_context *ac, struct ac_shader_abi *abi, + struct nir_shader *nir) +{ + struct ac_nir_context ctx = {}; + struct nir_function *func; + + ctx.ac = *ac; + ctx.abi = abi; + + ctx.stage = nir->info.stage; + ctx.info = &nir->info; + + ctx.main_function = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder)); + + nir_foreach_variable(variable, &nir->outputs) + ac_handle_shader_output_decl(&ctx.ac, ctx.abi, nir, variable, + ctx.stage); + + ctx.defs = _mesa_hash_table_create(NULL, _mesa_hash_pointer, + _mesa_key_pointer_equal); + ctx.phis = _mesa_hash_table_create(NULL, _mesa_hash_pointer, + _mesa_key_pointer_equal); + ctx.vars = _mesa_hash_table_create(NULL, _mesa_hash_pointer, + _mesa_key_pointer_equal); + + func = (struct nir_function *)exec_list_get_head(&nir->functions); + + nir_index_ssa_defs(func->impl); + ctx.ssa_defs = calloc(func->impl->ssa_alloc, sizeof(LLVMValueRef)); + + setup_locals(&ctx, func); + setup_scratch(&ctx, nir); + setup_constant_data(&ctx, nir); + + if (gl_shader_stage_is_compute(nir->info.stage)) + setup_shared(&ctx, nir); + + visit_cf_list(&ctx, &func->impl->body); + phi_post_pass(&ctx); + + if (!gl_shader_stage_is_compute(nir->info.stage)) + ctx.abi->emit_outputs(ctx.abi, AC_LLVM_MAX_OUTPUTS, + ctx.abi->outputs); + + free(ctx.locals); + free(ctx.ssa_defs); + ralloc_free(ctx.defs); + ralloc_free(ctx.phis); + ralloc_free(ctx.vars); +} +
+void +ac_lower_indirect_derefs(struct nir_shader *nir, enum chip_class chip_class) +{ + /* Lower large variables to scratch first so that we won't bloat the + * shader by generating large if ladders for them. We later lower + * scratch to alloca's, assuming LLVM won't generate VGPR indexing. + */ + NIR_PASS_V(nir, nir_lower_vars_to_scratch, + nir_var_function_temp, + 256, + glsl_get_natural_size_align_bytes); + + /* While it would be nice not to have this flag, we are constrained + * by the reality that LLVM 9.0 has buggy VGPR indexing on GFX9. + */ + bool llvm_has_working_vgpr_indexing = chip_class != GFX9; + + /* TODO: Indirect indexing of GS inputs is unimplemented. + * + * TCS and TES load inputs directly from LDS or offchip memory, so + * indirect indexing is trivial. + */ + nir_variable_mode indirect_mask = 0; + if (nir->info.stage == MESA_SHADER_GEOMETRY || + (nir->info.stage != MESA_SHADER_TESS_CTRL && + nir->info.stage != MESA_SHADER_TESS_EVAL && + !llvm_has_working_vgpr_indexing)) { + indirect_mask |= nir_var_shader_in; + } + if (!llvm_has_working_vgpr_indexing && + nir->info.stage != MESA_SHADER_TESS_CTRL) + indirect_mask |= nir_var_shader_out; + + /* TODO: We shouldn't need to do this; however, LLVM isn't currently + * smart enough to handle indirects without excessive spilling, + * which can cause the GPU to hang. 
+ * + * See the following thread for more details of the problem: + * https://lists.freedesktop.org/archives/mesa-dev/2017-July/162106.html + */ + indirect_mask |= nir_var_function_temp; + + nir_lower_indirect_derefs(nir, indirect_mask); +} + +static unsigned +get_inst_tessfactor_writemask(nir_intrinsic_instr *intrin) +{ + if (intrin->intrinsic != nir_intrinsic_store_deref) + return 0; + + nir_variable *var = + nir_deref_instr_get_variable(nir_src_as_deref(intrin->src[0])); + + if (var->data.mode != nir_var_shader_out) + return 0; + + unsigned writemask = 0; + const int location = var->data.location; + unsigned first_component = var->data.location_frac; + unsigned num_comps = intrin->dest.ssa.num_components; + + if (location == VARYING_SLOT_TESS_LEVEL_INNER) + writemask = ((1 << (num_comps + 1)) - 1) << first_component; + else if (location == VARYING_SLOT_TESS_LEVEL_OUTER) + writemask = (((1 << (num_comps + 1)) - 1) << first_component) << 4; + + return writemask; +} + +static void +scan_tess_ctrl(nir_cf_node *cf_node, unsigned *upper_block_tf_writemask, + unsigned *cond_block_tf_writemask, + bool *tessfactors_are_def_in_all_invocs, bool is_nested_cf) +{ + switch (cf_node->type) { + case nir_cf_node_block: { + nir_block *block = nir_cf_node_as_block(cf_node); + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + if (intrin->intrinsic == nir_intrinsic_barrier) { + + /* If we find a barrier in nested control flow put this in the + * too hard basket. In GLSL this is not possible but it is in + * SPIR-V. + */ + if (is_nested_cf) { + *tessfactors_are_def_in_all_invocs = false; + return; + } + + /* The following case must be prevented: + * gl_TessLevelInner = ...; + * barrier(); + * if (gl_InvocationID == 1) + * gl_TessLevelInner = ...; + * + * If you consider disjoint code segments separated by barriers, each + * such segment that writes tess factor channels should write the same + * channels in all codepaths within that segment. + */ + if (upper_block_tf_writemask || cond_block_tf_writemask) { + /* Accumulate the result: */ + *tessfactors_are_def_in_all_invocs &= + !(*cond_block_tf_writemask & ~(*upper_block_tf_writemask)); + + /* Analyze the next code segment from scratch. */ + *upper_block_tf_writemask = 0; + *cond_block_tf_writemask = 0; + } + } else + *upper_block_tf_writemask |= get_inst_tessfactor_writemask(intrin); + } + + break; + } + case nir_cf_node_if: { + unsigned then_tessfactor_writemask = 0; + unsigned else_tessfactor_writemask = 0; + + nir_if *if_stmt = nir_cf_node_as_if(cf_node); + foreach_list_typed(nir_cf_node, nested_node, node, &if_stmt->then_list) { + scan_tess_ctrl(nested_node, &then_tessfactor_writemask, + cond_block_tf_writemask, + tessfactors_are_def_in_all_invocs, true); + } + + foreach_list_typed(nir_cf_node, nested_node, node, &if_stmt->else_list) { + scan_tess_ctrl(nested_node, &else_tessfactor_writemask, + cond_block_tf_writemask, + tessfactors_are_def_in_all_invocs, true); + } + + if (then_tessfactor_writemask || else_tessfactor_writemask) { + /* If both statements write the same tess factor channels, + * we can say that the upper block writes them too. 
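+					 * For example, if both branches write all four outer tess levels, the
+					 * write no longer depends on which branch was taken.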
+ */ + *upper_block_tf_writemask |= then_tessfactor_writemask & + else_tessfactor_writemask; + *cond_block_tf_writemask |= then_tessfactor_writemask | + else_tessfactor_writemask; + } + + break; + } + case nir_cf_node_loop: { + nir_loop *loop = nir_cf_node_as_loop(cf_node); + foreach_list_typed(nir_cf_node, nested_node, node, &loop->body) { + scan_tess_ctrl(nested_node, cond_block_tf_writemask, + cond_block_tf_writemask, + tessfactors_are_def_in_all_invocs, true); + } + + break; + } + default: + unreachable("unknown cf node type"); + } +} + +bool +ac_are_tessfactors_def_in_all_invocs(const struct nir_shader *nir) +{ + assert(nir->info.stage == MESA_SHADER_TESS_CTRL); + + /* The pass works as follows: + * If all codepaths write tess factors, we can say that all + * invocations define tess factors. + * + * Each tess factor channel is tracked separately. + */ + unsigned main_block_tf_writemask = 0; /* if main block writes tess factors */ + unsigned cond_block_tf_writemask = 0; /* if cond block writes tess factors */ + + /* Initial value = true. Here the pass will accumulate results from + * multiple segments surrounded by barriers. If tess factors aren't + * written at all, it's a shader bug and we don't care if this will be + * true. + */ + bool tessfactors_are_def_in_all_invocs = true; + + nir_foreach_function(function, nir) { + if (function->impl) { + foreach_list_typed(nir_cf_node, node, node, &function->impl->body) { + scan_tess_ctrl(node, &main_block_tf_writemask, + &cond_block_tf_writemask, + &tessfactors_are_def_in_all_invocs, + false); + } + } + } + + /* Accumulate the result for the last code segment separated by a + * barrier. + */ + if (main_block_tf_writemask || cond_block_tf_writemask) { + tessfactors_are_def_in_all_invocs &= + !(cond_block_tf_writemask & ~main_block_tf_writemask); + } + + return tessfactors_are_def_in_all_invocs; +} diff --git a/src/amd/llvm/ac_nir_to_llvm.h b/src/amd/llvm/ac_nir_to_llvm.h new file mode 100644 index 00000000000..4782d9fc9d6 --- /dev/null +++ b/src/amd/llvm/ac_nir_to_llvm.h @@ -0,0 +1,64 @@ +/* + * Copyright © 2016 Bas Nieuwenhuizen + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef AC_NIR_TO_LLVM_H +#define AC_NIR_TO_LLVM_H + +#include <stdbool.h> +#include "llvm-c/Core.h" +#include "llvm-c/TargetMachine.h" +#include "amd_family.h" +#include "compiler/shader_enums.h" + +struct nir_shader; +struct nir_variable; +struct ac_llvm_context; +struct ac_shader_abi; + +/* Interpolation locations */ +#define INTERP_CENTER 0 +#define INTERP_CENTROID 1 +#define INTERP_SAMPLE 2 + +static inline unsigned ac_llvm_reg_index_soa(unsigned index, unsigned chan) +{ + return (index * 4) + chan; +} + +void ac_lower_indirect_derefs(struct nir_shader *nir, enum chip_class); + +bool ac_are_tessfactors_def_in_all_invocs(const struct nir_shader *nir); + +void ac_nir_translate(struct ac_llvm_context *ac, struct ac_shader_abi *abi, + struct nir_shader *nir); + +void +ac_handle_shader_output_decl(struct ac_llvm_context *ctx, + struct ac_shader_abi *abi, + struct nir_shader *nir, + struct nir_variable *variable, + gl_shader_stage stage); + +void ac_emit_barrier(struct ac_llvm_context *ac, gl_shader_stage stage); + +#endif /* AC_NIR_TO_LLVM_H */ diff --git a/src/amd/llvm/ac_shader_abi.h b/src/amd/llvm/ac_shader_abi.h new file mode 100644 index 00000000000..61f1b735c49 --- /dev/null +++ b/src/amd/llvm/ac_shader_abi.h @@ -0,0 +1,219 @@ +/* + * Copyright 2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef AC_SHADER_ABI_H +#define AC_SHADER_ABI_H + +#include <llvm-c/Core.h> + +#include "compiler/shader_enums.h" + +struct nir_variable; + +#define AC_LLVM_MAX_OUTPUTS (VARYING_SLOT_VAR31 + 1) + +#define AC_MAX_INLINE_PUSH_CONSTS 8 + +enum ac_descriptor_type { + AC_DESC_IMAGE, + AC_DESC_FMASK, + AC_DESC_SAMPLER, + AC_DESC_BUFFER, + AC_DESC_PLANE_0, + AC_DESC_PLANE_1, + AC_DESC_PLANE_2, +}; + +/* Document the shader ABI during compilation. This is what allows radeonsi and + * radv to share a compiler backend. 
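+ * Each driver fills in the input values and callbacks below; the NIR-to-LLVM
+ * translator reaches driver-specific state only through this interface.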
+ */ +struct ac_shader_abi { + LLVMValueRef base_vertex; + LLVMValueRef start_instance; + LLVMValueRef draw_id; + LLVMValueRef vertex_id; + LLVMValueRef instance_id; + LLVMValueRef tcs_patch_id; + LLVMValueRef tcs_rel_ids; + LLVMValueRef tes_patch_id; + LLVMValueRef gs_prim_id; + LLVMValueRef gs_invocation_id; + + /* PS */ + LLVMValueRef frag_pos[4]; + LLVMValueRef front_face; + LLVMValueRef ancillary; + LLVMValueRef sample_coverage; + LLVMValueRef prim_mask; + LLVMValueRef color0; + LLVMValueRef color1; + LLVMValueRef user_data; + LLVMValueRef persp_sample; + LLVMValueRef persp_center; + LLVMValueRef persp_centroid; + LLVMValueRef linear_sample; + LLVMValueRef linear_center; + LLVMValueRef linear_centroid; + + /* CS */ + LLVMValueRef local_invocation_ids; + LLVMValueRef num_work_groups; + LLVMValueRef workgroup_ids[3]; + LLVMValueRef tg_size; + + /* Vulkan only */ + LLVMValueRef push_constants; + LLVMValueRef inline_push_consts[AC_MAX_INLINE_PUSH_CONSTS]; + unsigned num_inline_push_consts; + unsigned base_inline_push_consts; + LLVMValueRef view_index; + + LLVMValueRef outputs[AC_LLVM_MAX_OUTPUTS * 4]; + + /* For VS and PS: pre-loaded shader inputs. + * + * Currently only used for NIR shaders; indexed by variables' + * driver_location. + */ + LLVMValueRef *inputs; + + /* Varying -> attribute number mapping. Also NIR-only */ + unsigned fs_input_attr_indices[MAX_VARYING]; + + void (*emit_outputs)(struct ac_shader_abi *abi, + unsigned max_outputs, + LLVMValueRef *addrs); + + void (*emit_vertex)(struct ac_shader_abi *abi, + unsigned stream, + LLVMValueRef *addrs); + + void (*emit_primitive)(struct ac_shader_abi *abi, + unsigned stream); + + void (*emit_kill)(struct ac_shader_abi *abi, LLVMValueRef visible); + + LLVMValueRef (*load_inputs)(struct ac_shader_abi *abi, + unsigned location, + unsigned driver_location, + unsigned component, + unsigned num_components, + unsigned vertex_index, + unsigned const_index, + LLVMTypeRef type); + + LLVMValueRef (*load_tess_varyings)(struct ac_shader_abi *abi, + LLVMTypeRef type, + LLVMValueRef vertex_index, + LLVMValueRef param_index, + unsigned const_index, + unsigned location, + unsigned driver_location, + unsigned component, + unsigned num_components, + bool is_patch, + bool is_compact, + bool load_inputs); + + void (*store_tcs_outputs)(struct ac_shader_abi *abi, + const struct nir_variable *var, + LLVMValueRef vertex_index, + LLVMValueRef param_index, + unsigned const_index, + LLVMValueRef src, + unsigned writemask); + + LLVMValueRef (*load_tess_coord)(struct ac_shader_abi *abi); + + LLVMValueRef (*load_patch_vertices_in)(struct ac_shader_abi *abi); + + LLVMValueRef (*load_tess_level)(struct ac_shader_abi *abi, + unsigned varying_id, + bool load_default_state); + + + LLVMValueRef (*load_ubo)(struct ac_shader_abi *abi, LLVMValueRef index); + + /** + * Load the descriptor for the given buffer. + * + * \param buffer the buffer as presented in NIR: this is the descriptor + * in Vulkan, and the buffer index in OpenGL/Gallium + * \param write whether buffer contents will be written + */ + LLVMValueRef (*load_ssbo)(struct ac_shader_abi *abi, + LLVMValueRef buffer, bool write); + + /** + * Load a descriptor associated to a sampler. 
+	 * + * \param descriptor_set the descriptor set index (only for Vulkan) + * \param base_index the base index of the sampler variable + * \param constant_index constant part of an array index (or 0, if the + * sampler variable is not an array) + * \param index non-constant part of an array index (may be NULL) + * \param desc_type the type of descriptor to load + * \param image whether the descriptor is loaded for an image operation + */ + LLVMValueRef (*load_sampler_desc)(struct ac_shader_abi *abi, + unsigned descriptor_set, + unsigned base_index, + unsigned constant_index, + LLVMValueRef index, + enum ac_descriptor_type desc_type, + bool image, bool write, + bool bindless); +
+ /** + * Load a Vulkan-specific resource. + * + * \param index resource index + * \param desc_set descriptor set + * \param binding descriptor set binding + */ + LLVMValueRef (*load_resource)(struct ac_shader_abi *abi, + LLVMValueRef index, + unsigned desc_set, + unsigned binding); +
+ LLVMValueRef (*load_sample_position)(struct ac_shader_abi *abi, + LLVMValueRef sample_id); + + LLVMValueRef (*load_local_group_size)(struct ac_shader_abi *abi); + + LLVMValueRef (*load_sample_mask_in)(struct ac_shader_abi *abi); + + LLVMValueRef (*load_base_vertex)(struct ac_shader_abi *abi); + + LLVMValueRef (*emit_fbfetch)(struct ac_shader_abi *abi); +
+ /* Whether to clamp the shadow reference value to [0,1] on GFX8. Radeonsi currently + * uses it due to promoting D16 to D32, but radv needs it off. */ + bool clamp_shadow_reference; + bool interp_at_sample_force_center; + + /* Whether bounds checks are required */ + bool robust_buffer_access; +}; + +#endif /* AC_SHADER_ABI_H */
diff --git a/src/amd/llvm/meson.build b/src/amd/llvm/meson.build new file mode 100644 index 00000000000..f9868cfac09 --- /dev/null +++ b/src/amd/llvm/meson.build @@ -0,0 +1,50 @@
+# Copyright © 2019 Valve Corporation +
+# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +
+# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +amd_common_llvm_files = files( + 'ac_llvm_build.c', + 'ac_llvm_build.h', + 'ac_llvm_cull.c', + 'ac_llvm_cull.h', + 'ac_llvm_helper.cpp', + 'ac_llvm_util.c', + 'ac_llvm_util.h', + 'ac_nir_to_llvm.c', + 'ac_nir_to_llvm.h', + 'ac_shader_abi.h', +) +
+libamd_common_llvm = static_library( + 'amd_common_llvm', + [amd_common_llvm_files], + include_directories : [ + inc_common, inc_compiler, inc_mesa, inc_mapi, inc_amd, inc_amd_common + ], + link_with: [ + libamd_common + ], + dependencies : [ + dep_llvm, dep_thread, dep_elf, dep_libdrm_amdgpu, dep_valgrind, + idep_nir_headers, idep_amdgfxregs_h, + ], + c_args : [c_vis_args], + cpp_args : [cpp_vis_args], +) +
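For orientation, a minimal driver-side sketch (not part of this commit) of how the pieces above fit together: a driver fills struct ac_shader_abi with its inputs and callbacks, then calls ac_nir_translate() with the LLVM builder already positioned inside the shader's entry function. The names translate_example and load_ubo_cb are hypothetical placeholders, not real radv/radeonsi functions, and only the relevant fields are shown.

    /* Hypothetical sketch of wiring a driver into the NIR->LLVM translator.
     * Assumes the builder in *ac already points inside the shader's entry
     * function, since ac_nir_translate() derives the current function from it. */
    #include "ac_llvm_build.h"
    #include "ac_nir_to_llvm.h"
    #include "ac_shader_abi.h"

    static LLVMValueRef load_ubo_cb(struct ac_shader_abi *abi, LLVMValueRef index)
    {
    	/* Placeholder: a real driver returns the UBO descriptor for this index. */
    	return index;
    }

    void translate_example(struct ac_llvm_context *ac,
    		       struct ac_shader_abi *abi,
    		       struct nir_shader *nir)
    {
    	/* Fill in only the callbacks and flags the shader will actually use. */
    	abi->load_ubo = load_ubo_cb;
    	abi->clamp_shadow_reference = false;	/* radv-style behaviour */
    	abi->robust_buffer_access = true;

    	/* Lower indirect indexing that LLVM handles poorly, then emit LLVM IR
    	 * at the builder's current insert point. */
    	ac_lower_indirect_derefs(nir, ac->chip_class);
    	ac_nir_translate(ac, abi, nir);
    }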