Diffstat (limited to 'src/gallium/drivers')
414 files changed, 16038 insertions, 7910 deletions
diff --git a/src/gallium/drivers/freedreno/Android.mk b/src/gallium/drivers/freedreno/Android.mk
index a6712b2c115..ed51835e1fb 100644
--- a/src/gallium/drivers/freedreno/Android.mk
+++ b/src/gallium/drivers/freedreno/Android.mk
@@ -28,7 +28,9 @@ include $(CLEAR_VARS)
 LOCAL_SRC_FILES := \
 	$(C_SOURCES) \
 	$(a2xx_SOURCES) \
-	$(a3xx_SOURCES)
+	$(a3xx_SOURCES) \
+	$(a4xx_SOURCES) \
+	$(ir3_SOURCES)
 
 LOCAL_CFLAGS := \
 	-Wno-packed-bitfield-compat
@@ -37,6 +39,7 @@ LOCAL_C_INCLUDES := \
 	$(LOCAL_PATH)/ir3
 
 LOCAL_SHARED_LIBRARIES := libdrm libdrm_freedreno
+LOCAL_STATIC_LIBRARIES := libmesa_glsl
 LOCAL_MODULE := libmesa_pipe_freedreno
 
 include $(GALLIUM_COMMON_MK)
diff --git a/src/gallium/drivers/freedreno/Makefile.am b/src/gallium/drivers/freedreno/Makefile.am
index cbf62c6daae..dff95ba5270 100644
--- a/src/gallium/drivers/freedreno/Makefile.am
+++ b/src/gallium/drivers/freedreno/Makefile.am
@@ -1,5 +1,3 @@
-AUTOMAKE_OPTIONS = subdir-objects
-
 include Makefile.sources
 include $(top_srcdir)/src/gallium/Automake.inc
diff --git a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
index f4f6b94c1ea..c4516baf2ec 100644
--- a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
+++ b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
@@ -8,15 +8,15 @@ http://github.com/freedreno/envytools/
 git clone https://github.com/freedreno/envytools.git
 
 The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2013-11-30 14:47:15)
-- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2013-03-31 16:51:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2014-06-02 15:21:30)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10551 bytes, from 2014-11-13 22:44:30)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14895 bytes, from 2015-04-19 15:23:28)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 66709 bytes, from 2015-04-12 18:16:35)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 59314 bytes, from 2015-04-19 16:21:40)
-
-Copyright (C) 2013-2014 by the following authors:
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10551 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 66709 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63728 bytes, from 2015-08-05 18:07:28)
+
+Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <[email protected]> (robclark)
 
 Permission is hereby granted, free of charge, to any person obtaining
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_blend.h b/src/gallium/drivers/freedreno/a2xx/fd2_blend.h
index 7cafcd3747e..3c8d8f7c09f 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_blend.h
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_blend.h
@@ -39,7 +39,7 @@ struct fd2_blend_stateobj {
 	uint32_t rb_colormask;
 };
 
-static INLINE struct fd2_blend_stateobj *
+static inline struct fd2_blend_stateobj *
 fd2_blend_stateobj(struct pipe_blend_state *blend)
 {
 	return (struct fd2_blend_stateobj *)blend;
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_context.c b/src/gallium/drivers/freedreno/a2xx/fd2_context.c
index a0bf01ffd1f..6089ebc1516 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_context.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_context.c
@@ -67,7 +67,7 @@ create_solid_vertexbuf(struct pipe_context *pctx)
 }
 
 static const uint8_t a22x_primtypes[PIPE_PRIM_MAX] = {
-	[PIPE_PRIM_POINTS] = DI_PT_POINTLIST_A2XX,
+	[PIPE_PRIM_POINTS] = DI_PT_POINTLIST_PSIZE,
 	[PIPE_PRIM_LINES] = DI_PT_LINELIST,
 	[PIPE_PRIM_LINE_STRIP] = DI_PT_LINESTRIP,
 	[PIPE_PRIM_LINE_LOOP] = DI_PT_LINELOOP,
@@ -77,7 +77,7 @@ static const uint8_t a22x_primtypes[PIPE_PRIM_MAX] = {
 };
 
 static const uint8_t a20x_primtypes[PIPE_PRIM_MAX] = {
-	[PIPE_PRIM_POINTS] = DI_PT_POINTLIST_A2XX,
+	[PIPE_PRIM_POINTS] = DI_PT_POINTLIST_PSIZE,
 	[PIPE_PRIM_LINES] = DI_PT_LINELIST,
 	[PIPE_PRIM_LINE_STRIP] = DI_PT_LINESTRIP,
 	[PIPE_PRIM_TRIANGLES] = DI_PT_TRILIST,
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_context.h b/src/gallium/drivers/freedreno/a2xx/fd2_context.h
index de845f07a85..74147107930 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_context.h
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_context.h
@@ -40,7 +40,7 @@ struct fd2_context {
 	struct pipe_resource *solid_vertexbuf;
 };
 
-static INLINE struct fd2_context *
+static inline struct fd2_context *
 fd2_context(struct fd_context *ctx)
 {
 	return (struct fd2_context *)ctx;
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_rasterizer.h b/src/gallium/drivers/freedreno/a2xx/fd2_rasterizer.h
index adc0653132b..9e53cd3be75 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_rasterizer.h
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_rasterizer.h
@@ -43,7 +43,7 @@ struct fd2_rasterizer_stateobj {
 	uint32_t pa_su_sc_mode_cntl;
 };
 
-static INLINE struct fd2_rasterizer_stateobj *
+static inline struct fd2_rasterizer_stateobj *
 fd2_rasterizer_stateobj(struct pipe_rasterizer_state *rast)
 {
 	return (struct fd2_rasterizer_stateobj *)rast;
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_texture.h b/src/gallium/drivers/freedreno/a2xx/fd2_texture.h
index 4fffa08b3c3..5c9236851bd 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_texture.h
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_texture.h
@@ -42,7 +42,7 @@ struct fd2_sampler_stateobj {
 	uint32_t tex0, tex3, tex4, tex5;
 };
 
-static INLINE struct fd2_sampler_stateobj *
+static inline struct fd2_sampler_stateobj *
 fd2_sampler_stateobj(struct pipe_sampler_state *samp)
 {
 	return (struct fd2_sampler_stateobj *)samp;
@@ -54,7 +54,7 @@ struct fd2_pipe_sampler_view {
 	uint32_t tex0, tex2, tex3;
 };
 
-static INLINE struct fd2_pipe_sampler_view *
+static inline struct fd2_pipe_sampler_view *
 fd2_pipe_sampler_view(struct pipe_sampler_view *pview)
 {
 	return (struct fd2_pipe_sampler_view *)pview;
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_zsa.h b/src/gallium/drivers/freedreno/a2xx/fd2_zsa.h
index dda1e552174..15609ad0267 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_zsa.h
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_zsa.h
@@ -44,7 +44,7 @@ struct fd2_zsa_stateobj {
 	uint32_t rb_stencilrefmask_bf;
 };
 
-static INLINE struct fd2_zsa_stateobj *
+static inline struct fd2_zsa_stateobj *
 fd2_zsa_stateobj(struct pipe_depth_stencil_alpha_state *zsa)
 {
 	return (struct fd2_zsa_stateobj *)zsa;
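Nearly all of the header hunks above make the same two mechanical changes: Mesa's old INLINE wrapper macro gives way to plain C99 inline, applied to the small downcast helpers that each driver state object defines around its base gallium CSO struct. The idiom in isolation, as a minimal sketch (illustrative names, not the driver's actual types):

	#include <stdint.h>

	struct base_cso { uint32_t flags; };     /* stands in for e.g. pipe_blend_state */

	struct hw_blend_stateobj {
		struct base_cso base;            /* must stay the first member */
		uint32_t rb_colormask;           /* driver-specific packed state */
	};

	static inline struct hw_blend_stateobj *
	hw_blend_stateobj(struct base_cso *blend)
	{
		/* valid because 'base' is the first member of the wrapper */
		return (struct hw_blend_stateobj *)blend;
	}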
diff --git a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
index a3bc74eda85..8e8cf6a03f2 100644
--- a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
+++ b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
@@ -8,13 +8,13 @@ http://github.com/freedreno/envytools/
 git clone https://github.com/freedreno/envytools.git
 
 The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2013-11-30 14:47:15)
-- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2013-03-31 16:51:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2014-06-02 15:21:30)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10551 bytes, from 2014-11-13 22:44:30)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14895 bytes, from 2015-04-19 15:23:28)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 66709 bytes, from 2015-04-12 18:16:35)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 59314 bytes, from 2015-04-19 16:21:40)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10551 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 66709 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63728 bytes, from 2015-08-05 18:07:28)
 
 Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <[email protected]> (robclark)
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_blend.h b/src/gallium/drivers/freedreno/a3xx/fd3_blend.h
index 4f6eeb74481..142df7c300f 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_blend.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_blend.h
@@ -32,6 +32,8 @@
 #include "pipe/p_state.h"
 #include "pipe/p_context.h"
 
+#include "freedreno_util.h"
+
 struct fd3_blend_stateobj {
 	struct pipe_blend_state base;
 	struct {
@@ -42,10 +44,10 @@ struct fd3_blend_stateobj {
 		/* Blend control bits for alpha channel */
 		uint32_t blend_control_alpha;
 		uint32_t control;
-	} rb_mrt[4];
+	} rb_mrt[A3XX_MAX_RENDER_TARGETS];
 };
 
-static INLINE struct fd3_blend_stateobj *
+static inline struct fd3_blend_stateobj *
 fd3_blend_stateobj(struct pipe_blend_state *blend)
 {
 	return (struct fd3_blend_stateobj *)blend;
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_context.c b/src/gallium/drivers/freedreno/a3xx/fd3_context.c
index 7e5a99ea571..dc33783e398 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_context.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_context.c
@@ -88,7 +88,7 @@ create_blit_texcoord_vertexbuf(struct pipe_context *pctx)
 }
 
 static const uint8_t primtypes[PIPE_PRIM_MAX] = {
-	[PIPE_PRIM_POINTS] = DI_PT_POINTLIST_A3XX,
+	[PIPE_PRIM_POINTS] = DI_PT_POINTLIST,
 	[PIPE_PRIM_LINES] = DI_PT_LINELIST,
 	[PIPE_PRIM_LINE_STRIP] = DI_PT_LINESTRIP,
 	[PIPE_PRIM_LINE_LOOP] = DI_PT_LINELOOP,
@@ -121,6 +121,7 @@ fd3_context_create(struct pipe_screen *pscreen, void *priv)
 	fd3_gmem_init(pctx);
 	fd3_texture_init(pctx);
 	fd3_prog_init(pctx);
+	fd3_emit_init(pctx);
 
 	pctx = fd_context_init(&fd3_ctx->base, pscreen, primtypes, priv);
 	if (!pctx)
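The primtypes tables that fd2/fd3/fd4 hand to fd_context_init() map gallium's PIPE_PRIM_* enum to the hardware's pc_di_primtype encoding via designated initializers, so any mode left out of the table stays zero and can be treated as unsupported. A compact stand-alone version of the idiom (enum values invented for the sketch, not the real DI_PT_* encodings):

	#include <stdint.h>
	#include <stdio.h>

	enum prim { PRIM_POINTS, PRIM_LINES, PRIM_TRIANGLES, PRIM_MAX };

	/* entries not listed are zero-initialized, doubling as "unsupported" */
	static const uint8_t hw_primtypes[PRIM_MAX] = {
		[PRIM_POINTS]    = 1,   /* stand-in for DI_PT_POINTLIST */
		[PRIM_TRIANGLES] = 4,   /* stand-in for DI_PT_TRILIST   */
	};

	int main(void)
	{
		enum prim mode = PRIM_LINES;
		if (!hw_primtypes[mode])
			fprintf(stderr, "unsupported primitive %d\n", mode);
		return 0;
	}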
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_context.h b/src/gallium/drivers/freedreno/a3xx/fd3_context.h
index 77e4605e550..6e20b2ff9bc 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_context.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_context.h
@@ -112,7 +112,7 @@ struct fd3_context {
 	struct ir3_shader_key last_key;
 };
 
-static INLINE struct fd3_context *
+static inline struct fd3_context *
 fd3_context(struct fd_context *ctx)
 {
 	return (struct fd3_context *)ctx;
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
index b5838b58eb2..a9498835011 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
@@ -60,6 +60,9 @@ draw_impl(struct fd_context *ctx, struct fd_ringbuffer *ring,
 	const struct pipe_draw_info *info = emit->info;
 	enum pc_di_primtype primtype = ctx->primtypes[info->mode];
 
+	if (!(fd3_emit_get_vp(emit) && fd3_emit_get_fp(emit)))
+		return;
+
 	fd3_emit_state(ctx, ring, emit);
 
 	if (emit->dirty & (FD_DIRTY_VTXBUF | FD_DIRTY_VTXSTATE))
@@ -79,8 +82,8 @@ draw_impl(struct fd_context *ctx, struct fd_ringbuffer *ring,
 			info->restart_index : 0xffffffff);
 
 	if (ctx->rasterizer && ctx->rasterizer->point_size_per_vertex &&
-			info->mode == PIPE_PRIM_POINTS)
-		primtype = DI_PT_POINTLIST_A2XX;
+			(info->mode == PIPE_PRIM_POINTS))
+		primtype = DI_PT_POINTLIST_PSIZE;
 
 	fd_draw_emit(ctx, ring, primtype,
@@ -240,10 +243,7 @@ fd3_clear(struct fd_context *ctx, unsigned buffers,
 		.vtx = &fd3_ctx->solid_vbuf_state,
 		.prog = &ctx->solid_prog,
 		.key = {
-			.half_precision = (fd3_half_precision(pfb->cbufs[0]) &&
-					fd3_half_precision(pfb->cbufs[1]) &&
-					fd3_half_precision(pfb->cbufs[2]) &&
-					fd3_half_precision(pfb->cbufs[3])),
+			.half_precision = fd_half_precision(pfb),
 		},
 	};
@@ -321,7 +321,7 @@ fd3_clear(struct fd_context *ctx, unsigned buffers,
 				A3XX_RB_STENCIL_CONTROL_ZFAIL_BF(STENCIL_KEEP));
 	}
 
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < A3XX_MAX_RENDER_TARGETS; i++) {
 		OUT_PKT0(ring, REG_A3XX_RB_MRT_CONTROL(i), 1);
 		OUT_RING(ring, A3XX_RB_MRT_CONTROL_ROP_CODE(ROP_COPY) |
 				A3XX_RB_MRT_CONTROL_DITHER_MODE(DITHER_ALWAYS) |
@@ -342,7 +342,7 @@ fd3_clear(struct fd_context *ctx, unsigned buffers,
 
 	fd3_emit_vertex_bufs(ring, &emit);
 
-	fd3_emit_constant(ring, SB_FRAG_SHADER, 0, 0, 4, color->ui, NULL);
+	fd3_emit_const(ring, SHADER_FRAGMENT, 0, 0, 4, color->ui, NULL);
 
 	OUT_PKT0(ring, REG_A3XX_PC_PRIM_VTX_CNTL, 1);
 	OUT_RING(ring, A3XX_PC_PRIM_VTX_CNTL_STRIDE_IN_VPC(0) |
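fd3_clear() (and the gmem restore path further down) used to and together a per-surface fd3_half_precision() check for each of the four cbufs by hand; both callers now use a single shared fd_half_precision(pfb) helper. Judging from the fd3_half_precision() body removed from fd3_format.h below, the shared helper plausibly amounts to this loop (a sketch under that assumption, not the verbatim core function):

	#include <stdbool.h>
	#include "pipe/p_state.h"
	#include "util/u_format.h"

	/* half_precision is only safe if every bound color buffer survives
	 * the f32 -> f16 round trip that clear/blit colors go through */
	static bool
	half_precision_ok(const struct pipe_framebuffer_state *pfb)
	{
		unsigned i;
		for (i = 0; i < pfb->nr_cbufs; i++) {
			const struct pipe_surface *surf = pfb->cbufs[i];
			if (!surf)
				continue;
			/* pure-int values would be mangled by cov.f32f16: */
			if (util_format_is_pure_integer(surf->format))
				return false;
			/* don't silently truncate 32-bit float render targets: */
			if (util_format_is_float(surf->format) &&
					util_format_get_component_bits(surf->format,
						UTIL_FORMAT_COLORSPACE_RGB, 0) == 32)
				return false;
		}
		return true;
	}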
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
index 07cc2266d08..752e7f88cb9 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
@@ -43,19 +43,26 @@
 #include "fd3_format.h"
 #include "fd3_zsa.h"
 
+static const enum adreno_state_block sb[] = {
+	[SHADER_VERTEX] = SB_VERT_SHADER,
+	[SHADER_FRAGMENT] = SB_FRAG_SHADER,
+};
+
 /* regid: base const register
  * prsc or dwords: buffer containing constant values
  * sizedwords: size of const value buffer
  */
 void
-fd3_emit_constant(struct fd_ringbuffer *ring,
-		enum adreno_state_block sb,
+fd3_emit_const(struct fd_ringbuffer *ring, enum shader_t type,
 		uint32_t regid, uint32_t offset, uint32_t sizedwords,
 		const uint32_t *dwords, struct pipe_resource *prsc)
 {
 	uint32_t i, sz;
 	enum adreno_state_src src;
 
+	debug_assert((regid % 4) == 0);
+	debug_assert((sizedwords % 4) == 0);
+
 	if (prsc) {
 		sz = 0;
 		src = SS_INDIRECT;
@@ -67,7 +74,7 @@ fd3_emit_constant(struct fd_ringbuffer *ring,
 	OUT_PKT3(ring, CP_LOAD_STATE, 2 + sz);
 	OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(regid/2) |
 			CP_LOAD_STATE_0_STATE_SRC(src) |
-			CP_LOAD_STATE_0_STATE_BLOCK(sb) |
+			CP_LOAD_STATE_0_STATE_BLOCK(sb[type]) |
 			CP_LOAD_STATE_0_NUM_UNIT(sizedwords/2));
 	if (prsc) {
 		struct fd_bo *bo = fd_resource(prsc)->bo;
@@ -84,89 +91,31 @@
 }
 
 static void
-emit_constants(struct fd_ringbuffer *ring,
-		enum adreno_state_block sb,
-		struct fd_constbuf_stateobj *constbuf,
-		struct ir3_shader_variant *shader,
-		bool emit_immediates)
+fd3_emit_const_bo(struct fd_ringbuffer *ring, enum shader_t type, boolean write,
+		uint32_t regid, uint32_t num, struct fd_bo **bos, uint32_t *offsets)
 {
-	uint32_t enabled_mask = constbuf->enabled_mask;
-	uint32_t max_const;
-	int i;
-
-	// XXX TODO only emit dirty consts.. but we need to keep track if
-	// they are clobbered by a clear, gmem2mem, or mem2gmem..
-	constbuf->dirty_mask = enabled_mask;
-
-	/* in particular, with binning shader we may end up with unused
-	 * consts, ie. we could end up w/ constlen that is smaller
-	 * than first_immediate. In that case truncate the user consts
-	 * early to avoid HLSQ lockup caused by writing too many consts
-	 */
-	max_const = MIN2(shader->first_driver_param, shader->constlen);
-
-	/* emit user constants: */
-	if (enabled_mask & 1) {
-		const unsigned index = 0;
-		struct pipe_constant_buffer *cb = &constbuf->cb[index];
-		unsigned size = align(cb->buffer_size, 4) / 4; /* size in dwords */
+	uint32_t i;
 
-		// I expect that size should be a multiple of vec4's:
-		assert(size == align(size, 4));
+	debug_assert((regid % 4) == 0);
+	debug_assert((num % 4) == 0);
 
-		/* and even if the start of the const buffer is before
-		 * first_immediate, the end may not be:
-		 */
-		size = MIN2(size, 4 * max_const);
-
-		if (size && constbuf->dirty_mask & (1 << index)) {
-			fd3_emit_constant(ring, sb, 0,
-					cb->buffer_offset, size,
-					cb->user_buffer, cb->buffer);
-			constbuf->dirty_mask &= ~(1 << index);
-		}
-
-		enabled_mask &= ~(1 << index);
-	}
-
-	if (shader->constlen > shader->first_driver_param) {
-		uint32_t params = MIN2(4, shader->constlen - shader->first_driver_param);
-		/* emit ubos: */
-		OUT_PKT3(ring, CP_LOAD_STATE, 2 + params * 4);
-		OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(shader->first_driver_param * 2) |
-				CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) |
-				CP_LOAD_STATE_0_STATE_BLOCK(sb) |
-				CP_LOAD_STATE_0_NUM_UNIT(params * 2));
-		OUT_RING(ring, CP_LOAD_STATE_1_EXT_SRC_ADDR(0) |
-				CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS));
-
-		for (i = 1; i <= params * 4; i++) {
-			struct pipe_constant_buffer *cb = &constbuf->cb[i];
-			assert(!cb->user_buffer);
-			if ((enabled_mask & (1 << i)) && cb->buffer)
-				OUT_RELOC(ring, fd_resource(cb->buffer)->bo, cb->buffer_offset, 0, 0);
-			else
-				OUT_RING(ring, 0xbad00000 | ((i - 1) << 16));
-		}
-	}
-
-	/* emit shader immediates: */
-	if (shader && emit_immediates) {
-		int size = shader->immediates_count;
-		uint32_t base = shader->first_immediate;
-
-		/* truncate size to avoid writing constants that shader
-		 * does not use:
-		 */
-		size = MIN2(size + base, shader->constlen) - base;
-
-		/* convert out of vec4: */
-		base *= 4;
-		size *= 4;
-
-		if (size > 0) {
-			fd3_emit_constant(ring, sb, base,
-					0, size, shader->immediates[0].val, NULL);
+	OUT_PKT3(ring, CP_LOAD_STATE, 2 + num);
+	OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(regid/2) |
+			CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) |
+			CP_LOAD_STATE_0_STATE_BLOCK(sb[type]) |
+			CP_LOAD_STATE_0_NUM_UNIT(num/2));
+	OUT_RING(ring, CP_LOAD_STATE_1_EXT_SRC_ADDR(0) |
+			CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS));
+
+	for (i = 0; i < num; i++) {
+		if (bos[i]) {
+			if (write) {
+				OUT_RELOCW(ring, bos[i], offsets[i], 0, 0);
+			} else {
+				OUT_RELOC(ring, bos[i], offsets[i], 0, 0);
+			}
+		} else {
+			OUT_RING(ring, 0xbad00000 | (i << 16));
 		}
 	}
 }
@@ -302,14 +251,15 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
 				CP_LOAD_STATE_1_EXT_SRC_ADDR(0));
 		for (i = 0; i < tex->num_textures; i++) {
 			static const struct fd3_pipe_sampler_view dummy_view = {
+					.base.target = PIPE_TEXTURE_1D, /* anything !PIPE_BUFFER */
 					.base.u.tex.first_level = 1,
 			};
 			const struct fd3_pipe_sampler_view *view = tex->textures[i] ?
 					fd3_pipe_sampler_view(tex->textures[i]) : &dummy_view;
 			struct fd_resource *rsc = fd_resource(view->base.texture);
-			unsigned start = view->base.u.tex.first_level;
-			unsigned end   = view->base.u.tex.last_level;
+			unsigned start = fd_sampler_first_level(&view->base);
+			unsigned end   = fd_sampler_last_level(&view->base);;
 
 			for (j = 0; j < (end - start + 1); j++) {
 				struct fd_resource_slice *slice =
@@ -392,6 +342,7 @@ fd3_emit_gmem_restore_tex(struct fd_ringbuffer *ring,
 			format = fd3_gmem_restore_format(rsc->base.b.format);
 		}
 
+		/* note: PIPE_BUFFER disallowed for surfaces */
 		unsigned lvl = psurf[i]->u.tex.level;
 		struct fd_resource_slice *slice = fd_resource_slice(rsc, lvl);
@@ -444,7 +395,9 @@ fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd3_emit *emit)
 	uint32_t total_in = 0;
 	const struct fd_vertex_state *vtx = emit->vtx;
 	struct ir3_shader_variant *vp = fd3_emit_get_vp(emit);
-	unsigned vertex_regid = regid(63, 0), instance_regid = regid(63, 0);
+	unsigned vertex_regid = regid(63, 0);
+	unsigned instance_regid = regid(63, 0);
+	unsigned vtxcnt_regid = regid(63, 0);
 
 	for (i = 0; i < vp->inputs_count; i++) {
 		uint8_t semantic = sem2name(vp->inputs[i].semantic);
@@ -452,14 +405,17 @@ fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd3_emit *emit)
 			vertex_regid = vp->inputs[i].regid;
 		else if (semantic == TGSI_SEMANTIC_INSTANCEID)
 			instance_regid = vp->inputs[i].regid;
+		else if (semantic == IR3_SEMANTIC_VTXCNT)
+			vtxcnt_regid = vp->inputs[i].regid;
 		else if (i < vtx->vtx->num_elements && vp->inputs[i].compmask)
 			last = i;
 	}
 
 	/* hw doesn't like to be configured for zero vbo's, it seems: */
-	if (vtx->vtx->num_elements == 0 &&
-			vertex_regid == regid(63, 0) &&
-			instance_regid == regid(63, 0))
+	if ((vtx->vtx->num_elements == 0) &&
+			(vertex_regid == regid(63, 0)) &&
+			(instance_regid == regid(63, 0)) &&
+			(vtxcnt_regid == regid(63, 0)))
 		return;
 
 	for (i = 0, j = 0; i <= last; i++) {
@@ -472,8 +428,9 @@ fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd3_emit *emit)
 			enum pipe_format pfmt = elem->src_format;
 			enum a3xx_vtx_fmt fmt = fd3_pipe2vtx(pfmt);
 			bool switchnext = (i != last) ||
-					vertex_regid != regid(63, 0) ||
-					instance_regid != regid(63, 0);
+					(vertex_regid != regid(63, 0)) ||
+					(instance_regid != regid(63, 0)) ||
+					(vtxcnt_regid != regid(63, 0));
 			bool isint = util_format_is_pure_integer(pfmt);
 			uint32_t fs = util_format_get_blocksize(pfmt);
@@ -512,6 +469,10 @@ fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd3_emit *emit)
 	OUT_RING(ring, A3XX_VFD_CONTROL_1_MAXSTORAGE(1) | // XXX
 			A3XX_VFD_CONTROL_1_REGID4VTX(vertex_regid) |
 			A3XX_VFD_CONTROL_1_REGID4INST(instance_regid));
+
+	OUT_PKT0(ring, REG_A3XX_VFD_VS_THREADING_THRESHOLD, 1);
+	OUT_RING(ring, A3XX_VFD_VS_THREADING_THRESHOLD_REGID_THRESHOLD(15) |
+			A3XX_VFD_VS_THREADING_THRESHOLD_REGID_VTXCNT(vtxcnt_regid));
 }
 
 void
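A note on the vtxcnt plumbing above: the new IR3_SEMANTIC_VTXCNT input follows the driver's existing convention that regid(63, 0) means "no register allocated". Every consumer initializes its regid to that sentinel and only programs the hardware differently when a shader variant actually claimed the input. The convention in isolation (REGID here is a stand-in for the driver's regid() macro, encoding assumed):

	#include <stdbool.h>
	#include <stdint.h>

	/* stand-in for regid(num, comp); r63.x is out of range for real
	 * allocations, so it doubles as an "unused" sentinel */
	#define REGID(num, comp) (((uint32_t)(num) << 2) | (comp))
	#define REGID_UNUSED     REGID(63, 0)

	struct vtx_regs {
		uint32_t vertex_regid, instance_regid, vtxcnt_regid;
	};

	static bool
	any_sysval_used(const struct vtx_regs *r)
	{
		return (r->vertex_regid != REGID_UNUSED) ||
		       (r->instance_regid != REGID_UNUSED) ||
		       (r->vtxcnt_regid != REGID_UNUSED);
	}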
@@ -669,33 +630,12 @@ fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 	OUT_PKT3(ring, CP_EVENT_WRITE, 1);
 	OUT_RING(ring, HLSQ_FLUSH);
 
-	if ((dirty & (FD_DIRTY_PROG | FD_DIRTY_CONSTBUF)) &&
-			/* evil hack to deal sanely with clear path: */
-			(emit->prog == &ctx->prog)) {
-		fd_wfi(ctx, ring);
-		emit_constants(ring, SB_VERT_SHADER,
-				&ctx->constbuf[PIPE_SHADER_VERTEX],
-				vp, emit->prog->dirty & FD_SHADER_DIRTY_VP);
-		if (!emit->key.binning_pass) {
-			emit_constants(ring, SB_FRAG_SHADER,
-					&ctx->constbuf[PIPE_SHADER_FRAGMENT],
-					fp, emit->prog->dirty & FD_SHADER_DIRTY_FP);
-		}
-	}
-
-	/* emit driver params every time */
-	if (emit->info && emit->prog == &ctx->prog) {
-		uint32_t vertex_params[4] = {
-			emit->info->indexed ? emit->info->index_bias : emit->info->start,
-			0,
-			0,
-			0
-		};
-		if (vp->constlen >= vp->first_driver_param + 4) {
-			fd3_emit_constant(ring, SB_VERT_SHADER,
-					(vp->first_driver_param + 4) * 4,
-					0, 4, vertex_params, NULL);
-		}
+	if (emit->prog == &ctx->prog) { /* evil hack to deal sanely with clear path */
+		ir3_emit_consts(vp, ring, emit->info, dirty);
+		if (!emit->key.binning_pass)
+			ir3_emit_consts(fp, ring, emit->info, dirty);
+		/* mark clean after emitting consts: */
+		ctx->prog.dirty = 0;
 	}
 
 	if ((dirty & (FD_DIRTY_BLEND | FD_DIRTY_FRAMEBUFFER)) && ctx->blend) {
@@ -930,3 +870,11 @@ fd3_emit_restore(struct fd_context *ctx)
 
 	ctx->needs_rb_fbd = true;
 }
+
+void
+fd3_emit_init(struct pipe_context *pctx)
+{
+	struct fd_context *ctx = fd_context(pctx);
+	ctx->emit_const = fd3_emit_const;
+	ctx->emit_const_bo = fd3_emit_const_bo;
+}
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.h b/src/gallium/drivers/freedreno/a3xx/fd3_emit.h
index 8f21919c9a7..795654706a7 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.h
@@ -37,10 +37,8 @@
 #include "ir3_shader.h"
 
 struct fd_ringbuffer;
-enum adreno_state_block;
 
-void fd3_emit_constant(struct fd_ringbuffer *ring,
-		enum adreno_state_block sb,
+void fd3_emit_const(struct fd_ringbuffer *ring, enum shader_t type,
 		uint32_t regid, uint32_t offset, uint32_t sizedwords,
 		const uint32_t *dwords, struct pipe_resource *prsc);
 
@@ -90,4 +88,6 @@ void fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 
 void fd3_emit_restore(struct fd_context *ctx);
 
+void fd3_emit_init(struct pipe_context *pctx);
+
 #endif /* FD3_EMIT_H */
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_format.h b/src/gallium/drivers/freedreno/a3xx/fd3_format.h
index 6afc3015901..05c5ea3d247 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_format.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_format.h
@@ -41,27 +41,4 @@ enum a3xx_color_swap fd3_pipe2swap(enum pipe_format format);
 uint32_t fd3_tex_swiz(enum pipe_format format, unsigned swizzle_r,
 		unsigned swizzle_g, unsigned swizzle_b, unsigned swizzle_a);
 
-static INLINE bool
-fd3_half_precision(const struct pipe_surface *surface)
-{
-	enum pipe_format format;
-
-	if (!surface)
-		return true;
-
-	format = surface->format;
-
-	/* colors are provided in consts, which go through cov.f32f16, which will
-	 * break these values
-	 */
-	if (util_format_is_pure_integer(format))
-		return false;
-
-	/* avoid losing precision on 32-bit float formats */
-	if (util_format_is_float(format) &&
-			util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, 0) == 32)
-		return false;
-
-	return true;
-}
-
 #endif /* FD3_FORMAT_H_ */
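fd3_emit_init() is the piece that lets generation-independent code (the new ir3_emit_consts path) upload constants without knowing the a3xx packet format: the shared fd_context now carries emit_const/emit_const_bo function pointers that each generation fills in at context creation. A condensed sketch of that indirection (struct and enum abbreviated from what the diff shows, not the full definitions):

	#include <stdint.h>

	struct fd_ringbuffer;
	struct pipe_resource;
	enum shader_t { SHADER_VERTEX, SHADER_FRAGMENT };

	/* sketch of the per-generation hooks wired up by fd3_emit_init()
	 * and fd4_emit_init(); other context fields elided */
	struct fd_context_sketch {
		void (*emit_const)(struct fd_ringbuffer *ring, enum shader_t type,
				uint32_t regid, uint32_t offset, uint32_t sizedwords,
				const uint32_t *dwords, struct pipe_resource *prsc);
		/* ... emit_const_bo has the matching BO-array signature ... */
	};

	/* generation-independent code then only ever calls the hook: */
	static void
	upload_clear_color(struct fd_context_sketch *ctx,
			struct fd_ringbuffer *ring, const uint32_t color[4])
	{
		/* both generations assert vec4 alignment of regid/size */
		ctx->emit_const(ring, SHADER_FRAGMENT, 0, 0, 4, color, NULL);
	}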
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
index 7d3975761dd..9a5b45e2fcb 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
@@ -57,7 +57,7 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
 		tile_mode = LINEAR;
 	}
 
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < A3XX_MAX_RENDER_TARGETS; i++) {
 		enum pipe_format pformat = 0;
 		enum a3xx_color_fmt format = 0;
 		enum a3xx_color_swap swap = WZYX;
@@ -537,10 +537,7 @@ fd3_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
 			/* NOTE: They all use the same VP, this is for vtx bufs. */
 			.prog = &ctx->blit_prog[0],
 			.key = {
-				.half_precision = (fd3_half_precision(pfb->cbufs[0]) &&
-						fd3_half_precision(pfb->cbufs[1]) &&
-						fd3_half_precision(pfb->cbufs[2]) &&
-						fd3_half_precision(pfb->cbufs[3]))
+				.half_precision = fd_half_precision(pfb),
 			},
 	};
 	float x0, y0, x1, y1;
@@ -654,6 +651,7 @@ fd3_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
 
 	if (fd_gmem_needs_restore(ctx, tile, FD_BUFFER_COLOR)) {
 		emit.prog = &ctx->blit_prog[pfb->nr_cbufs - 1];
+		emit.fp = NULL;      /* frag shader changed so clear cache */
 		fd3_program_emit(ring, &emit, pfb->nr_cbufs, pfb->cbufs);
 		emit_mem2gmem_surf(ctx, gmem->cbuf_base, pfb->cbufs, pfb->nr_cbufs, bin_w);
 	}
@@ -674,6 +672,7 @@ fd3_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
 			emit.prog = &ctx->blit_zs;
 			emit.key.half_precision = false;
 		}
+		emit.fp = NULL;      /* frag shader changed so clear cache */
 		fd3_program_emit(ring, &emit, 1, &pfb->zsbuf);
 		emit_mem2gmem_surf(ctx, gmem->zsbuf_base, &pfb->zsbuf, 1, bin_w);
 	}
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.c b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
index 57fcaa9020e..b5360797745 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
@@ -51,7 +51,7 @@ create_shader_stateobj(struct pipe_context *pctx, const struct pipe_shader_state
 		enum shader_t type)
 {
 	struct fd3_shader_stateobj *so = CALLOC_STRUCT(fd3_shader_stateobj);
-	so->shader = ir3_shader_create(pctx, cso->tokens, type);
+	so->shader = ir3_shader_create(pctx, cso, type);
 	return so;
 }
 
@@ -136,6 +136,8 @@ fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit,
 	int constmode;
 	int i, j, k;
 
+	debug_assert(nr <= ARRAY_SIZE(color_regid));
+
 	vp = fd3_emit_get_vp(emit);
 
 	if (emit->key.binning_pass) {
@@ -202,12 +204,12 @@ fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit,
 		color_regid[0] = color_regid[1] = color_regid[2] = color_regid[3] =
 			ir3_find_output_regid(fp, ir3_semantic_name(TGSI_SEMANTIC_COLOR, 0));
 	} else {
-		for (int i = 0; i < fp->outputs_count; i++) {
+		for (i = 0; i < fp->outputs_count; i++) {
 			ir3_semantic sem = fp->outputs[i].semantic;
 			unsigned idx = sem2idx(sem);
 			if (sem2name(sem) != TGSI_SEMANTIC_COLOR)
 				continue;
-			assert(idx < 4);
+			debug_assert(idx < ARRAY_SIZE(color_regid));
 			color_regid[idx] = fp->outputs[i].regid;
 		}
 	}
@@ -449,10 +451,6 @@ fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit,
 		OUT_RING(ring, flatshade[1]);        /* SP_FS_FLAT_SHAD_MODE_REG_1 */
 	}
 
-	OUT_PKT0(ring, REG_A3XX_VFD_VS_THREADING_THRESHOLD, 1);
-	OUT_RING(ring, A3XX_VFD_VS_THREADING_THRESHOLD_REGID_THRESHOLD(15) |
-			A3XX_VFD_VS_THREADING_THRESHOLD_REGID_VTXCNT(252));
-
 	if (vpbuffer == BUFFER)
 		emit_shader(ring, vp);
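The two "emit.fp = NULL" lines in fd3_gmem.c deserve a note: struct fd3_emit memoizes the vp/fp shader-variant lookups behind fd3_emit_get_vp()/fd3_emit_get_fp(), and the mem2gmem path swaps emit.prog mid-stream, so the cached fragment variant has to be dropped before the next fd3_program_emit(). The shape of the pattern, with hypothetical names rather than the driver's structs:

	#include <stddef.h>

	struct program;
	struct variant;                 /* stand-in for ir3_shader_variant */
	struct variant *lookup_fp_variant(const struct program *p);

	struct emit_state {
		const struct program *prog; /* where variants come from */
		struct variant *fp;         /* memoized lookup; NULL = not cached */
	};

	static struct variant *
	emit_get_fp(struct emit_state *emit)
	{
		if (!emit->fp)
			emit->fp = lookup_fp_variant(emit->prog);
		return emit->fp;
	}

	/* any caller swapping programs mid-emit must invalidate the memo:
	 *     emit.prog = other_prog;  emit.fp = NULL;                     */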
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_query.c b/src/gallium/drivers/freedreno/a3xx/fd3_query.c
index 7abab543427..8fc0a0d4229 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_query.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_query.c
@@ -64,7 +64,7 @@ occlusion_get_sample(struct fd_context *ctx, struct fd_ringbuffer *ring)
 
 	OUT_PKT3(ring, CP_DRAW_INDX, 3);
 	OUT_RING(ring, 0x00000000);
-	OUT_RING(ring, DRAW(DI_PT_POINTLIST_A2XX, DI_SRC_SEL_AUTO_INDEX,
+	OUT_RING(ring, DRAW(DI_PT_POINTLIST_PSIZE, DI_SRC_SEL_AUTO_INDEX,
 			INDEX_SIZE_IGN, USE_VISIBILITY, 0));
 	OUT_RING(ring, 0);             /* NumIndices */
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.h b/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.h
index 7e9c1f51f59..765d9719524 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.h
@@ -44,7 +44,7 @@ struct fd3_rasterizer_stateobj {
 	uint32_t pc_prim_vtx_cntl;
 };
 
-static INLINE struct fd3_rasterizer_stateobj *
+static inline struct fd3_rasterizer_stateobj *
 fd3_rasterizer_stateobj(struct pipe_rasterizer_state *rast)
 {
 	return (struct fd3_rasterizer_stateobj *)rast;
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_screen.c b/src/gallium/drivers/freedreno/a3xx/fd3_screen.c
index 094dcf376e5..722fe360202 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_screen.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_screen.c
@@ -105,7 +105,7 @@ void
 fd3_screen_init(struct pipe_screen *pscreen)
 {
 	struct fd_screen *screen = fd_screen(pscreen);
-	screen->max_rts = 4;
+	screen->max_rts = A3XX_MAX_RENDER_TARGETS;
 	screen->compiler = ir3_compiler_create(screen->gpu_id);
 	pscreen->context_create = fd3_context_create;
 	pscreen->is_format_supported = fd3_screen_is_format_supported;
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_texture.c b/src/gallium/drivers/freedreno/a3xx/fd3_texture.c
index a278bf5c603..c30658d0e7b 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_texture.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_texture.c
@@ -210,8 +210,8 @@ fd3_sampler_view_create(struct pipe_context *pctx, struct pipe_resource *prsc,
 {
 	struct fd3_pipe_sampler_view *so = CALLOC_STRUCT(fd3_pipe_sampler_view);
 	struct fd_resource *rsc = fd_resource(prsc);
-	unsigned lvl = cso->u.tex.first_level;
-	unsigned miplevels = cso->u.tex.last_level - lvl;
+	unsigned lvl = fd_sampler_first_level(cso);
+	unsigned miplevels = fd_sampler_last_level(cso) - lvl;
 	uint32_t sz2 = 0;
 
 	if (!so)
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_texture.h b/src/gallium/drivers/freedreno/a3xx/fd3_texture.h
index c38fd847f27..d5afb03cd7a 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_texture.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_texture.h
@@ -43,7 +43,7 @@ struct fd3_sampler_stateobj {
 	bool saturate_s, saturate_t, saturate_r;
 };
 
-static INLINE struct fd3_sampler_stateobj *
+static inline struct fd3_sampler_stateobj *
 fd3_sampler_stateobj(struct pipe_sampler_state *samp)
 {
 	return (struct fd3_sampler_stateobj *)samp;
@@ -54,7 +54,7 @@ struct fd3_pipe_sampler_view {
 	uint32_t texconst0, texconst1, texconst2, texconst3;
 };
 
-static INLINE struct fd3_pipe_sampler_view *
+static inline struct fd3_pipe_sampler_view *
 fd3_pipe_sampler_view(struct pipe_sampler_view *pview)
 {
 	return (struct fd3_pipe_sampler_view *)pview;
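Several hunks in this series replace direct reads of u.tex.first_level/last_level with fd_sampler_first_level()/fd_sampler_last_level(). The motivation visible in the diff is buffer textures: for PIPE_BUFFER sampler views the u.tex fields are not meaningful (hence the dummy_view gaining ".base.target = PIPE_TEXTURE_1D /* anything !PIPE_BUFFER */" above), so a shared accessor can special-case them once. A plausible shape for such helpers, inferred from how they are used here rather than copied from the core headers:

	#include "pipe/p_state.h"

	static inline unsigned
	sampler_first_level(const struct pipe_sampler_view *view)
	{
		/* buffer "textures" have no mip levels to speak of: */
		if (view->target == PIPE_BUFFER)
			return 0;
		return view->u.tex.first_level;
	}

	static inline unsigned
	sampler_last_level(const struct pipe_sampler_view *view)
	{
		if (view->target == PIPE_BUFFER)
			return 0;
		return view->u.tex.last_level;
	}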
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_zsa.h b/src/gallium/drivers/freedreno/a3xx/fd3_zsa.h
index 352c3dd5432..d4dc5954da5 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_zsa.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_zsa.h
@@ -45,7 +45,7 @@ struct fd3_zsa_stateobj {
 	uint32_t rb_stencilrefmask_bf;
 };
 
-static INLINE struct fd3_zsa_stateobj *
+static inline struct fd3_zsa_stateobj *
 fd3_zsa_stateobj(struct pipe_depth_stencil_alpha_state *zsa)
 {
 	return (struct fd3_zsa_stateobj *)zsa;
diff --git a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
index 0e7d3cf6db1..563f70ac5eb 100644
--- a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
+++ b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
@@ -8,13 +8,13 @@ http://github.com/freedreno/envytools/
 git clone https://github.com/freedreno/envytools.git
 
 The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2013-11-30 14:47:15)
-- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2013-03-31 16:51:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2014-06-02 15:21:30)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10551 bytes, from 2014-11-13 22:44:30)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14895 bytes, from 2015-04-19 15:23:28)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 66709 bytes, from 2015-04-12 18:16:35)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 59314 bytes, from 2015-04-19 16:21:40)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10551 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 66709 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63728 bytes, from 2015-08-05 18:07:28)
 
 Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <[email protected]> (robclark)
@@ -227,6 +227,7 @@ enum a4xx_depth_format {
 	DEPTH4_NONE = 0,
 	DEPTH4_16 = 1,
 	DEPTH4_24_8 = 2,
+	DEPTH4_32 = 3,
 };
 
 enum a4xx_tess_spacing {
@@ -570,6 +571,15 @@ static inline uint32_t A4XX_RB_FS_OUTPUT_SAMPLE_MASK(uint32_t val)
 	return ((val) << A4XX_RB_FS_OUTPUT_SAMPLE_MASK__SHIFT) & A4XX_RB_FS_OUTPUT_SAMPLE_MASK__MASK;
 }
 
+#define REG_A4XX_RB_SAMPLE_COUNT_CONTROL	0x000020fa
+#define A4XX_RB_SAMPLE_COUNT_CONTROL_COPY	0x00000002
+#define A4XX_RB_SAMPLE_COUNT_CONTROL_ADDR__MASK	0xfffffffc
+#define A4XX_RB_SAMPLE_COUNT_CONTROL_ADDR__SHIFT	2
+static inline uint32_t A4XX_RB_SAMPLE_COUNT_CONTROL_ADDR(uint32_t val)
+{
+	return ((val >> 2) << A4XX_RB_SAMPLE_COUNT_CONTROL_ADDR__SHIFT) & A4XX_RB_SAMPLE_COUNT_CONTROL_ADDR__MASK;
+}
+
 #define REG_A4XX_RB_RENDER_COMPONENTS	0x000020fb
 #define A4XX_RB_RENDER_COMPONENTS_RT0__MASK	0x0000000f
 #define A4XX_RB_RENDER_COMPONENTS_RT0__SHIFT	0
@@ -811,6 +821,23 @@ static inline uint32_t A4XX_RB_STENCIL_CONTROL_ZFAIL_BF(enum adreno_stencil_op v
 
 #define REG_A4XX_RB_STENCIL_CONTROL2	0x00002107
 #define A4XX_RB_STENCIL_CONTROL2_STENCIL_BUFFER	0x00000001
 
+#define REG_A4XX_RB_STENCIL_INFO	0x00002108
+#define A4XX_RB_STENCIL_INFO_SEPARATE_STENCIL	0x00000001
+#define A4XX_RB_STENCIL_INFO_STENCIL_BASE__MASK	0xfffff000
+#define A4XX_RB_STENCIL_INFO_STENCIL_BASE__SHIFT	12
+static inline uint32_t A4XX_RB_STENCIL_INFO_STENCIL_BASE(uint32_t val)
+{
+	return ((val >> 12) << A4XX_RB_STENCIL_INFO_STENCIL_BASE__SHIFT) & A4XX_RB_STENCIL_INFO_STENCIL_BASE__MASK;
+}
+
+#define REG_A4XX_RB_STENCIL_PITCH	0x00002109
+#define A4XX_RB_STENCIL_PITCH__MASK	0xffffffff
+#define A4XX_RB_STENCIL_PITCH__SHIFT	0
+static inline uint32_t A4XX_RB_STENCIL_PITCH(uint32_t val)
+{
+	return ((val >> 5) << A4XX_RB_STENCIL_PITCH__SHIFT) & A4XX_RB_STENCIL_PITCH__MASK;
+}
+
 #define REG_A4XX_RB_STENCILREFMASK	0x0000210b
 #define A4XX_RB_STENCILREFMASK_STENCILREF__MASK	0x000000ff
 #define A4XX_RB_STENCILREFMASK_STENCILREF__SHIFT	0
@@ -1167,6 +1194,8 @@ static inline uint32_t REG_A4XX_CP_SCRATCH_REG(uint32_t i0) { return 0x00000578
 
 #define REG_A4XX_SP_VS_STATUS	0x00000ec0
 
+#define REG_A4XX_SP_MODE_CONTROL	0x00000ec3
+
 #define REG_A4XX_SP_PERFCTR_SP_SEL_11	0x00000ecf
 
 #define REG_A4XX_SP_SP_CTRL_REG	0x000022c0
@@ -1432,6 +1461,20 @@ static inline uint32_t A4XX_SP_FS_MRT_REG_MRTFORMAT(enum a4xx_color_fmt val)
 	return ((val) << A4XX_SP_FS_MRT_REG_MRTFORMAT__SHIFT) & A4XX_SP_FS_MRT_REG_MRTFORMAT__MASK;
 }
 
+#define REG_A4XX_SP_CS_CTRL_REG0	0x00002300
+
+#define REG_A4XX_SP_CS_OBJ_OFFSET_REG	0x00002301
+
+#define REG_A4XX_SP_CS_OBJ_START	0x00002302
+
+#define REG_A4XX_SP_CS_PVT_MEM_PARAM	0x00002303
+
+#define REG_A4XX_SP_CS_PVT_MEM_ADDR	0x00002304
+
+#define REG_A4XX_SP_CS_PVT_MEM_SIZE	0x00002305
+
+#define REG_A4XX_SP_CS_LENGTH_REG	0x00002306
+
 #define REG_A4XX_SP_HS_OBJ_OFFSET_REG	0x0000230d
 #define A4XX_SP_HS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET__MASK	0x01ff0000
 #define A4XX_SP_HS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET__SHIFT	16
@@ -1454,6 +1497,76 @@ static inline uint32_t A4XX_SP_HS_OBJ_OFFSET_REG_SHADEROBJOFFSET(uint32_t val)
 
 #define REG_A4XX_SP_HS_LENGTH_REG	0x00002312
 
+#define REG_A4XX_SP_DS_PARAM_REG	0x0000231a
+#define A4XX_SP_DS_PARAM_REG_POSREGID__MASK	0x000000ff
+#define A4XX_SP_DS_PARAM_REG_POSREGID__SHIFT	0
+static inline uint32_t A4XX_SP_DS_PARAM_REG_POSREGID(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_PARAM_REG_POSREGID__SHIFT) & A4XX_SP_DS_PARAM_REG_POSREGID__MASK;
+}
+#define A4XX_SP_DS_PARAM_REG_TOTALGSOUTVAR__MASK	0xfff00000
+#define A4XX_SP_DS_PARAM_REG_TOTALGSOUTVAR__SHIFT	20
+static inline uint32_t A4XX_SP_DS_PARAM_REG_TOTALGSOUTVAR(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_PARAM_REG_TOTALGSOUTVAR__SHIFT) & A4XX_SP_DS_PARAM_REG_TOTALGSOUTVAR__MASK;
+}
+
+static inline uint32_t REG_A4XX_SP_DS_OUT(uint32_t i0) { return 0x0000231b + 0x1*i0; }
+
+static inline uint32_t REG_A4XX_SP_DS_OUT_REG(uint32_t i0) { return 0x0000231b + 0x1*i0; }
+#define A4XX_SP_DS_OUT_REG_A_REGID__MASK	0x000001ff
+#define A4XX_SP_DS_OUT_REG_A_REGID__SHIFT	0
+static inline uint32_t A4XX_SP_DS_OUT_REG_A_REGID(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_OUT_REG_A_REGID__SHIFT) & A4XX_SP_DS_OUT_REG_A_REGID__MASK;
+}
+#define A4XX_SP_DS_OUT_REG_A_COMPMASK__MASK	0x00001e00
+#define A4XX_SP_DS_OUT_REG_A_COMPMASK__SHIFT	9
+static inline uint32_t A4XX_SP_DS_OUT_REG_A_COMPMASK(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_OUT_REG_A_COMPMASK__SHIFT) & A4XX_SP_DS_OUT_REG_A_COMPMASK__MASK;
+}
+#define A4XX_SP_DS_OUT_REG_B_REGID__MASK	0x01ff0000
+#define A4XX_SP_DS_OUT_REG_B_REGID__SHIFT	16
+static inline uint32_t A4XX_SP_DS_OUT_REG_B_REGID(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_OUT_REG_B_REGID__SHIFT) & A4XX_SP_DS_OUT_REG_B_REGID__MASK;
+}
+#define A4XX_SP_DS_OUT_REG_B_COMPMASK__MASK	0x1e000000
+#define A4XX_SP_DS_OUT_REG_B_COMPMASK__SHIFT	25
+static inline uint32_t A4XX_SP_DS_OUT_REG_B_COMPMASK(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_OUT_REG_B_COMPMASK__SHIFT) & A4XX_SP_DS_OUT_REG_B_COMPMASK__MASK;
+}
+
+static inline uint32_t REG_A4XX_SP_DS_VPC_DST(uint32_t i0) { return 0x0000232c + 0x1*i0; }
+
+static inline uint32_t REG_A4XX_SP_DS_VPC_DST_REG(uint32_t i0) { return 0x0000232c + 0x1*i0; }
+#define A4XX_SP_DS_VPC_DST_REG_OUTLOC0__MASK	0x000000ff
+#define A4XX_SP_DS_VPC_DST_REG_OUTLOC0__SHIFT	0
+static inline uint32_t A4XX_SP_DS_VPC_DST_REG_OUTLOC0(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_VPC_DST_REG_OUTLOC0__SHIFT) & A4XX_SP_DS_VPC_DST_REG_OUTLOC0__MASK;
+}
+#define A4XX_SP_DS_VPC_DST_REG_OUTLOC1__MASK	0x0000ff00
+#define A4XX_SP_DS_VPC_DST_REG_OUTLOC1__SHIFT	8
+static inline uint32_t A4XX_SP_DS_VPC_DST_REG_OUTLOC1(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_VPC_DST_REG_OUTLOC1__SHIFT) & A4XX_SP_DS_VPC_DST_REG_OUTLOC1__MASK;
+}
+#define A4XX_SP_DS_VPC_DST_REG_OUTLOC2__MASK	0x00ff0000
+#define A4XX_SP_DS_VPC_DST_REG_OUTLOC2__SHIFT	16
+static inline uint32_t A4XX_SP_DS_VPC_DST_REG_OUTLOC2(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_VPC_DST_REG_OUTLOC2__SHIFT) & A4XX_SP_DS_VPC_DST_REG_OUTLOC2__MASK;
+}
+#define A4XX_SP_DS_VPC_DST_REG_OUTLOC3__MASK	0xff000000
+#define A4XX_SP_DS_VPC_DST_REG_OUTLOC3__SHIFT	24
+static inline uint32_t A4XX_SP_DS_VPC_DST_REG_OUTLOC3(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_VPC_DST_REG_OUTLOC3__SHIFT) & A4XX_SP_DS_VPC_DST_REG_OUTLOC3__MASK;
+}
+
 #define REG_A4XX_SP_DS_OBJ_OFFSET_REG	0x00002334
 #define A4XX_SP_DS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET__MASK	0x01ff0000
 #define A4XX_SP_DS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET__SHIFT	16
@@ -1476,6 +1589,82 @@ static inline uint32_t A4XX_SP_DS_OBJ_OFFSET_REG_SHADEROBJOFFSET(uint32_t val)
 
 #define REG_A4XX_SP_DS_LENGTH_REG	0x00002339
 
+#define REG_A4XX_SP_GS_PARAM_REG	0x00002341
+#define A4XX_SP_GS_PARAM_REG_POSREGID__MASK	0x000000ff
+#define A4XX_SP_GS_PARAM_REG_POSREGID__SHIFT	0
+static inline uint32_t A4XX_SP_GS_PARAM_REG_POSREGID(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_PARAM_REG_POSREGID__SHIFT) & A4XX_SP_GS_PARAM_REG_POSREGID__MASK;
+}
+#define A4XX_SP_GS_PARAM_REG_PRIMREGID__MASK	0x0000ff00
+#define A4XX_SP_GS_PARAM_REG_PRIMREGID__SHIFT	8
+static inline uint32_t A4XX_SP_GS_PARAM_REG_PRIMREGID(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_PARAM_REG_PRIMREGID__SHIFT) & A4XX_SP_GS_PARAM_REG_PRIMREGID__MASK;
+}
+#define A4XX_SP_GS_PARAM_REG_TOTALGSOUTVAR__MASK	0xfff00000
+#define A4XX_SP_GS_PARAM_REG_TOTALGSOUTVAR__SHIFT	20
+static inline uint32_t A4XX_SP_GS_PARAM_REG_TOTALGSOUTVAR(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_PARAM_REG_TOTALGSOUTVAR__SHIFT) & A4XX_SP_GS_PARAM_REG_TOTALGSOUTVAR__MASK;
+}
+
+static inline uint32_t REG_A4XX_SP_GS_OUT(uint32_t i0) { return 0x00002342 + 0x1*i0; }
+
+static inline uint32_t REG_A4XX_SP_GS_OUT_REG(uint32_t i0) { return 0x00002342 + 0x1*i0; }
+#define A4XX_SP_GS_OUT_REG_A_REGID__MASK	0x000001ff
+#define A4XX_SP_GS_OUT_REG_A_REGID__SHIFT	0
+static inline uint32_t A4XX_SP_GS_OUT_REG_A_REGID(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_OUT_REG_A_REGID__SHIFT) & A4XX_SP_GS_OUT_REG_A_REGID__MASK;
+}
+#define A4XX_SP_GS_OUT_REG_A_COMPMASK__MASK	0x00001e00
+#define A4XX_SP_GS_OUT_REG_A_COMPMASK__SHIFT	9
+static inline uint32_t A4XX_SP_GS_OUT_REG_A_COMPMASK(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_OUT_REG_A_COMPMASK__SHIFT) & A4XX_SP_GS_OUT_REG_A_COMPMASK__MASK;
+}
+#define A4XX_SP_GS_OUT_REG_B_REGID__MASK	0x01ff0000
+#define A4XX_SP_GS_OUT_REG_B_REGID__SHIFT	16
+static inline uint32_t A4XX_SP_GS_OUT_REG_B_REGID(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_OUT_REG_B_REGID__SHIFT) & A4XX_SP_GS_OUT_REG_B_REGID__MASK;
+}
+#define A4XX_SP_GS_OUT_REG_B_COMPMASK__MASK	0x1e000000
+#define A4XX_SP_GS_OUT_REG_B_COMPMASK__SHIFT	25
+static inline uint32_t A4XX_SP_GS_OUT_REG_B_COMPMASK(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_OUT_REG_B_COMPMASK__SHIFT) & A4XX_SP_GS_OUT_REG_B_COMPMASK__MASK;
+}
+
+static inline uint32_t REG_A4XX_SP_GS_VPC_DST(uint32_t i0) { return 0x00002353 + 0x1*i0; }
+
+static inline uint32_t REG_A4XX_SP_GS_VPC_DST_REG(uint32_t i0) { return 0x00002353 + 0x1*i0; }
+#define A4XX_SP_GS_VPC_DST_REG_OUTLOC0__MASK	0x000000ff
+#define A4XX_SP_GS_VPC_DST_REG_OUTLOC0__SHIFT	0
+static inline uint32_t A4XX_SP_GS_VPC_DST_REG_OUTLOC0(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_VPC_DST_REG_OUTLOC0__SHIFT) & A4XX_SP_GS_VPC_DST_REG_OUTLOC0__MASK;
+}
+#define A4XX_SP_GS_VPC_DST_REG_OUTLOC1__MASK	0x0000ff00
+#define A4XX_SP_GS_VPC_DST_REG_OUTLOC1__SHIFT	8
+static inline uint32_t A4XX_SP_GS_VPC_DST_REG_OUTLOC1(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_VPC_DST_REG_OUTLOC1__SHIFT) & A4XX_SP_GS_VPC_DST_REG_OUTLOC1__MASK;
+}
+#define A4XX_SP_GS_VPC_DST_REG_OUTLOC2__MASK	0x00ff0000
+#define A4XX_SP_GS_VPC_DST_REG_OUTLOC2__SHIFT	16
+static inline uint32_t A4XX_SP_GS_VPC_DST_REG_OUTLOC2(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_VPC_DST_REG_OUTLOC2__SHIFT) & A4XX_SP_GS_VPC_DST_REG_OUTLOC2__MASK;
+}
+#define A4XX_SP_GS_VPC_DST_REG_OUTLOC3__MASK	0xff000000
+#define A4XX_SP_GS_VPC_DST_REG_OUTLOC3__SHIFT	24
+static inline uint32_t A4XX_SP_GS_VPC_DST_REG_OUTLOC3(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_VPC_DST_REG_OUTLOC3__SHIFT) & A4XX_SP_GS_VPC_DST_REG_OUTLOC3__MASK;
+}
+
 #define REG_A4XX_SP_GS_OBJ_OFFSET_REG	0x0000235b
 #define A4XX_SP_GS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET__MASK	0x01ff0000
 #define A4XX_SP_GS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET__SHIFT	16
@@ -1677,6 +1866,18 @@ static inline uint32_t A4XX_VFD_CONTROL_3_REGID_VTXCNT(uint32_t val)
 {
 	return ((val) << A4XX_VFD_CONTROL_3_REGID_VTXCNT__SHIFT) & A4XX_VFD_CONTROL_3_REGID_VTXCNT__MASK;
 }
+#define A4XX_VFD_CONTROL_3_REGID_TESSX__MASK	0x00ff0000
+#define A4XX_VFD_CONTROL_3_REGID_TESSX__SHIFT	16
+static inline uint32_t A4XX_VFD_CONTROL_3_REGID_TESSX(uint32_t val)
+{
+	return ((val) << A4XX_VFD_CONTROL_3_REGID_TESSX__SHIFT) & A4XX_VFD_CONTROL_3_REGID_TESSX__MASK;
+}
+#define A4XX_VFD_CONTROL_3_REGID_TESSY__MASK	0xff000000
+#define A4XX_VFD_CONTROL_3_REGID_TESSY__SHIFT	24
+static inline uint32_t A4XX_VFD_CONTROL_3_REGID_TESSY(uint32_t val)
+{
+	return ((val) << A4XX_VFD_CONTROL_3_REGID_TESSY__SHIFT) & A4XX_VFD_CONTROL_3_REGID_TESSY__MASK;
+}
 
 #define REG_A4XX_VFD_CONTROL_4	0x00002204
 
@@ -1758,6 +1959,8 @@ static inline uint32_t A4XX_VFD_DECODE_INSTR_SHIFTCNT(uint32_t val)
 
 #define REG_A4XX_TPL1_DEBUG_ECO_CONTROL	0x00000f00
 
+#define REG_A4XX_TPL1_TP_MODE_CONTROL	0x00000f03
+
 #define REG_A4XX_TPL1_PERFCTR_TP_SEL_7	0x00000f0b
 
 #define REG_A4XX_TPL1_TP_TEX_OFFSET	0x00002380
@@ -1800,6 +2003,10 @@ static inline uint32_t A4XX_TPL1_TP_TEX_COUNT_GS(uint32_t val)
 
 #define REG_A4XX_TPL1_TP_FS_BORDER_COLOR_BASE_ADDR	0x000023a1
 
+#define REG_A4XX_TPL1_TP_CS_BORDER_COLOR_BASE_ADDR	0x000023a4
+
+#define REG_A4XX_TPL1_TP_CS_SAMPLER_BASE_ADDR	0x000023a5
+
 #define REG_A4XX_TPL1_TP_CS_TEXMEMOBJ_BASE_ADDR	0x000023a6
 
 #define REG_A4XX_GRAS_TSE_STATUS	0x00000c80
@@ -2078,6 +2285,8 @@ static inline uint32_t A4XX_GRAS_SC_EXTENT_WINDOW_TL_Y(uint32_t val)
 
 #define REG_A4XX_HLSQ_DEBUG_ECO_CONTROL	0x00000e04
 
+#define REG_A4XX_HLSQ_MODE_CONTROL	0x00000e05
+
 #define REG_A4XX_HLSQ_PERF_PIPE_MASK	0x00000e0e
 
 #define REG_A4XX_HLSQ_CONTROL_0_REG	0x000023c0
@@ -2158,6 +2367,8 @@ static inline uint32_t A4XX_HLSQ_CONTROL_3_REG_REGID(uint32_t val)
 	return ((val) << A4XX_HLSQ_CONTROL_3_REG_REGID__SHIFT) & A4XX_HLSQ_CONTROL_3_REG_REGID__MASK;
 }
 
+#define REG_A4XX_HLSQ_CONTROL_4_REG	0x000023c4
+
 #define REG_A4XX_HLSQ_VS_CONTROL_REG	0x000023c5
 #define A4XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH__MASK	0x000000ff
 #define A4XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH__SHIFT	0
@@ -2293,6 +2504,36 @@ static inline uint32_t A4XX_HLSQ_GS_CONTROL_REG_INSTRLENGTH(uint32_t val)
 	return ((val) << A4XX_HLSQ_GS_CONTROL_REG_INSTRLENGTH__SHIFT) & A4XX_HLSQ_GS_CONTROL_REG_INSTRLENGTH__MASK;
 }
 
+#define REG_A4XX_HLSQ_CS_CONTROL	0x000023ca
+
+#define REG_A4XX_HLSQ_CL_NDRANGE_0	0x000023cd
+
+#define REG_A4XX_HLSQ_CL_NDRANGE_1	0x000023ce
+
+#define REG_A4XX_HLSQ_CL_NDRANGE_2	0x000023cf
+
+#define REG_A4XX_HLSQ_CL_NDRANGE_3	0x000023d0
+
+#define REG_A4XX_HLSQ_CL_NDRANGE_4	0x000023d1
+
+#define REG_A4XX_HLSQ_CL_NDRANGE_5	0x000023d2
+
+#define REG_A4XX_HLSQ_CL_NDRANGE_6	0x000023d3
+
+#define REG_A4XX_HLSQ_CL_CONTROL_0	0x000023d4
+
+#define REG_A4XX_HLSQ_CL_CONTROL_1	0x000023d5
+
+#define REG_A4XX_HLSQ_CL_KERNEL_CONST	0x000023d6
+
+#define REG_A4XX_HLSQ_CL_KERNEL_GROUP_X	0x000023d7
+
+#define REG_A4XX_HLSQ_CL_KERNEL_GROUP_Y	0x000023d8
+
+#define REG_A4XX_HLSQ_CL_KERNEL_GROUP_Z	0x000023d9
+
+#define REG_A4XX_HLSQ_CL_WG_OFFSET	0x000023da
+
 #define REG_A4XX_HLSQ_UPDATE_CONTROL	0x000023db
 
 #define REG_A4XX_PC_BINNING_COMMAND	0x00000d00
@@ -2389,16 +2630,10 @@ static inline uint32_t A4XX_PC_HS_PARAM_PRIMTYPE(enum adreno_pa_su_sc_draw val)
 
 #define REG_A4XX_UNKNOWN_0D01	0x00000d01
 
-#define REG_A4XX_UNKNOWN_0E05	0x00000e05
-
 #define REG_A4XX_UNKNOWN_0E42	0x00000e42
 
 #define REG_A4XX_UNKNOWN_0EC2	0x00000ec2
 
-#define REG_A4XX_UNKNOWN_0EC3	0x00000ec3
-
-#define REG_A4XX_UNKNOWN_0F03	0x00000f03
-
 #define REG_A4XX_UNKNOWN_2001	0x00002001
 
 #define REG_A4XX_UNKNOWN_209B	0x0000209b
@@ -2439,6 +2674,8 @@ static inline uint32_t A4XX_UNKNOWN_20F7(float val)
 
 #define REG_A4XX_UNKNOWN_22D7	0x000022d7
 
+#define REG_A4XX_UNKNOWN_2352	0x00002352
+
 #define REG_A4XX_TEX_SAMP_0	0x00000000
 #define A4XX_TEX_SAMP_0_MIPFILTER_LINEAR_NEAR	0x00000001
 #define A4XX_TEX_SAMP_0_XY_MAG__MASK	0x00000006
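These generated headers all follow the same rnndb pattern: every register field gets a __MASK/__SHIFT pair plus an inline packing helper, and multi-field register values are built by OR-ing helpers together (note also that address and pitch fields pre-shift their argument, as in the "(val >> 5)" of A4XX_RB_STENCIL_PITCH above, so callers pass plain byte values). A small self-contained illustration of how driver code consumes the pattern; the RT0 values match the A4XX_RB_RENDER_COMPONENTS field shown earlier, while RT1 is filled in by analogy:

	#include <stdint.h>

	#define RB_RENDER_COMPONENTS_RT0__MASK  0x0000000f
	#define RB_RENDER_COMPONENTS_RT0__SHIFT 0
	static inline uint32_t RB_RENDER_COMPONENTS_RT0(uint32_t val)
	{
		return ((val) << RB_RENDER_COMPONENTS_RT0__SHIFT) &
				RB_RENDER_COMPONENTS_RT0__MASK;
	}
	#define RB_RENDER_COMPONENTS_RT1__MASK  0x000000f0  /* assumed */
	#define RB_RENDER_COMPONENTS_RT1__SHIFT 4           /* assumed */
	static inline uint32_t RB_RENDER_COMPONENTS_RT1(uint32_t val)
	{
		return ((val) << RB_RENDER_COMPONENTS_RT1__SHIFT) &
				RB_RENDER_COMPONENTS_RT1__MASK;
	}

	/* fields compose by OR; out-of-range values are silently masked: */
	static uint32_t pack_components(uint32_t rt0, uint32_t rt1)
	{
		return RB_RENDER_COMPONENTS_RT0(rt0) |
		       RB_RENDER_COMPONENTS_RT1(rt1);
	}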
independent blend state"); - return NULL; - } - so = CALLOC_STRUCT(fd4_blend_stateobj); if (!so) return NULL; @@ -96,7 +91,12 @@ fd4_blend_state_create(struct pipe_context *pctx, so->base = *cso; for (i = 0; i < ARRAY_SIZE(so->rb_mrt); i++) { - const struct pipe_rt_blend_state *rt = &cso->rt[i]; + const struct pipe_rt_blend_state *rt; + + if (cso->independent_blend_enable) + rt = &cso->rt[i]; + else + rt = &cso->rt[0]; so->rb_mrt[i].blend_control = A4XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR(fd_blend_factor(rt->rgb_src_factor)) | @@ -115,7 +115,7 @@ fd4_blend_state_create(struct pipe_context *pctx, A4XX_RB_MRT_CONTROL_READ_DEST_ENABLE | A4XX_RB_MRT_CONTROL_BLEND | A4XX_RB_MRT_CONTROL_BLEND2; - so->rb_fs_output |= A4XX_RB_FS_OUTPUT_ENABLE_BLEND(1); + mrt_blend |= (1 << i); } if (reads_dest) @@ -125,5 +125,7 @@ fd4_blend_state_create(struct pipe_context *pctx, so->rb_mrt[i].buf_info |= A4XX_RB_MRT_BUF_INFO_DITHER_MODE(DITHER_ALWAYS); } + so->rb_fs_output = A4XX_RB_FS_OUTPUT_ENABLE_BLEND(mrt_blend); + return so; } diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_blend.h b/src/gallium/drivers/freedreno/a4xx/fd4_blend.h index 33641da5e2c..7620d00a625 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_blend.h +++ b/src/gallium/drivers/freedreno/a4xx/fd4_blend.h @@ -32,17 +32,19 @@ #include "pipe/p_state.h" #include "pipe/p_context.h" +#include "freedreno_util.h" + struct fd4_blend_stateobj { struct pipe_blend_state base; struct { uint32_t control; uint32_t buf_info; uint32_t blend_control; - } rb_mrt[8]; + } rb_mrt[A4XX_MAX_RENDER_TARGETS]; uint32_t rb_fs_output; }; -static INLINE struct fd4_blend_stateobj * +static inline struct fd4_blend_stateobj * fd4_blend_stateobj(struct pipe_blend_state *blend) { return (struct fd4_blend_stateobj *)blend; diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_context.c b/src/gallium/drivers/freedreno/a4xx/fd4_context.c index 2321876dd48..e172d350517 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_context.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_context.c @@ -86,7 +86,7 @@ create_blit_texcoord_vertexbuf(struct pipe_context *pctx) } static const uint8_t primtypes[PIPE_PRIM_MAX] = { - [PIPE_PRIM_POINTS] = DI_PT_POINTLIST_A3XX, + [PIPE_PRIM_POINTS] = DI_PT_POINTLIST, [PIPE_PRIM_LINES] = DI_PT_LINELIST, [PIPE_PRIM_LINE_STRIP] = DI_PT_LINESTRIP, [PIPE_PRIM_LINE_LOOP] = DI_PT_LINELOOP, @@ -119,6 +119,7 @@ fd4_context_create(struct pipe_screen *pscreen, void *priv) fd4_gmem_init(pctx); fd4_texture_init(pctx); fd4_prog_init(pctx); + fd4_emit_init(pctx); pctx = fd_context_init(&fd4_ctx->base, pscreen, primtypes, priv); if (!pctx) diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_context.h b/src/gallium/drivers/freedreno/a4xx/fd4_context.h index 53e1bf6a2e6..0b749916841 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_context.h +++ b/src/gallium/drivers/freedreno/a4xx/fd4_context.h @@ -90,7 +90,7 @@ struct fd4_context { struct ir3_shader_key last_key; }; -static INLINE struct fd4_context * +static inline struct fd4_context * fd4_context(struct fd_context *ctx) { return (struct fd4_context *)ctx; diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c index de5a306af60..2bd2ca23d54 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c @@ -48,6 +48,9 @@ draw_impl(struct fd_context *ctx, struct fd_ringbuffer *ring, { const struct pipe_draw_info *info = emit->info; + if (!(fd4_emit_get_vp(emit) && fd4_emit_get_fp(emit))) + return; + fd4_emit_state(ctx, 
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
index de5a306af60..2bd2ca23d54 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
@@ -48,6 +48,9 @@ draw_impl(struct fd_context *ctx, struct fd_ringbuffer *ring,
 {
 	const struct pipe_draw_info *info = emit->info;
 
+	if (!(fd4_emit_get_vp(emit) && fd4_emit_get_fp(emit)))
+		return;
+
 	fd4_emit_state(ctx, ring, emit);
 
 	if (emit->dirty & (FD_DIRTY_VTXBUF | FD_DIRTY_VTXSTATE))
@@ -108,7 +111,6 @@ static void
 fd4_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info)
 {
 	struct fd4_context *fd4_ctx = fd4_context(ctx);
-	struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
 	struct fd4_emit emit = {
 		.vtx = &ctx->vtx,
 		.prog = &ctx->prog,
@@ -129,8 +131,9 @@ fd4_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info)
 			.fsaturate_t = fd4_ctx->fsaturate_t,
 			.fsaturate_r = fd4_ctx->fsaturate_r,
 		},
-		.format = fd4_emit_format(pfb->cbufs[0]),
-		.pformat = pipe_surface_format(pfb->cbufs[0]),
+		.rasterflat = ctx->rasterizer && ctx->rasterizer->flatshade,
+		.sprite_coord_enable = ctx->rasterizer ? ctx->rasterizer->sprite_coord_enable : false,
+		.sprite_coord_mode = ctx->rasterizer ? ctx->rasterizer->sprite_coord_mode : false,
 	};
 	unsigned dirty;
 
@@ -170,20 +173,16 @@ fd4_clear(struct fd_context *ctx, unsigned buffers,
 	struct fd4_context *fd4_ctx = fd4_context(ctx);
 	struct fd_ringbuffer *ring = ctx->ring;
 	struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
+	unsigned char mrt_comp[A4XX_MAX_RENDER_TARGETS] = {0};
 	unsigned dirty = ctx->dirty;
-	unsigned ce, i;
+	unsigned i;
 	struct fd4_emit emit = {
 		.vtx = &fd4_ctx->solid_vbuf_state,
 		.prog = &ctx->solid_prog,
 		.key = {
-			.half_precision = true,
+			.half_precision = fd_half_precision(pfb),
 		},
-		.format = fd4_emit_format(pfb->cbufs[0]),
 	};
-	uint32_t colr = 0;
-
-	if ((buffers & PIPE_CLEAR_COLOR) && pfb->nr_cbufs)
-		colr = pack_rgba(pfb->cbufs[0]->format, color->f);
 
 	dirty &= FD_DIRTY_FRAMEBUFFER | FD_DIRTY_SCISSOR;
 	dirty |= FD_DIRTY_PROG;
@@ -257,16 +256,15 @@ fd4_clear(struct fd_context *ctx, unsigned buffers,
 	if (buffers & PIPE_CLEAR_COLOR) {
 		OUT_PKT0(ring, REG_A4XX_RB_ALPHA_CONTROL, 1);
 		OUT_RING(ring, A4XX_RB_ALPHA_CONTROL_ALPHA_TEST_FUNC(FUNC_NEVER));
-		ce = 0xf;
-	} else {
-		ce = 0x0;
 	}
 
-	for (i = 0; i < 8; i++) {
+	for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) {
+		mrt_comp[i] = (buffers & (PIPE_CLEAR_COLOR0 << i)) ? 0xf : 0x0;
+
 		OUT_PKT0(ring, REG_A4XX_RB_MRT_CONTROL(i), 1);
 		OUT_RING(ring, A4XX_RB_MRT_CONTROL_FASTCLEAR |
 				A4XX_RB_MRT_CONTROL_B11 |
-				A4XX_RB_MRT_CONTROL_COMPONENT_ENABLE(ce));
+				A4XX_RB_MRT_CONTROL_COMPONENT_ENABLE(0xf));
 
 		OUT_PKT0(ring, REG_A4XX_RB_MRT_BLEND_CONTROL(i), 1);
 		OUT_RING(ring, A4XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR(FACTOR_ONE) |
@@ -277,6 +275,16 @@ fd4_clear(struct fd_context *ctx, unsigned buffers,
 				A4XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR(FACTOR_ZERO));
 	}
 
+	OUT_PKT0(ring, REG_A4XX_RB_RENDER_COMPONENTS, 1);
+	OUT_RING(ring, A4XX_RB_RENDER_COMPONENTS_RT0(mrt_comp[0]) |
+			A4XX_RB_RENDER_COMPONENTS_RT1(mrt_comp[1]) |
+			A4XX_RB_RENDER_COMPONENTS_RT2(mrt_comp[2]) |
+			A4XX_RB_RENDER_COMPONENTS_RT3(mrt_comp[3]) |
+			A4XX_RB_RENDER_COMPONENTS_RT4(mrt_comp[4]) |
+			A4XX_RB_RENDER_COMPONENTS_RT5(mrt_comp[5]) |
+			A4XX_RB_RENDER_COMPONENTS_RT6(mrt_comp[6]) |
+			A4XX_RB_RENDER_COMPONENTS_RT7(mrt_comp[7]));
+
 	fd4_emit_vertex_bufs(ring, &emit);
 
 	OUT_PKT0(ring, REG_A4XX_GRAS_ALPHA_CONTROL, 1);
@@ -285,14 +293,8 @@ fd4_clear(struct fd_context *ctx, unsigned buffers,
 	OUT_PKT0(ring, REG_A4XX_GRAS_CLEAR_CNTL, 1);
 	OUT_RING(ring, 0x00000000);
 
-	OUT_PKT0(ring, REG_A4XX_RB_CLEAR_COLOR_DW0, 4);
-	OUT_RING(ring, colr);  /* RB_CLEAR_COLOR_DW0 */
-	OUT_RING(ring, colr);  /* RB_CLEAR_COLOR_DW1 */
-	OUT_RING(ring, colr);  /* RB_CLEAR_COLOR_DW2 */
-	OUT_RING(ring, colr);  /* RB_CLEAR_COLOR_DW3 */
-
 	/* until fastclear works: */
-	fd4_emit_constant(ring, SB_FRAG_SHADER, 0, 0, 4, color->ui, NULL);
+	fd4_emit_const(ring, SHADER_FRAGMENT, 0, 0, 4, color->ui, NULL);
 
 	OUT_PKT0(ring, REG_A4XX_VFD_INDEX_OFFSET, 2);
 	OUT_RING(ring, 0);            /* VFD_INDEX_OFFSET */
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_draw.h b/src/gallium/drivers/freedreno/a4xx/fd4_draw.h
index 1bd376ca6ec..b89a30a7c4b 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_draw.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_draw.h
@@ -106,6 +106,7 @@ fd4_draw_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
 {
 	struct pipe_index_buffer *idx = &ctx->indexbuf;
 	struct fd_bo *idx_bo = NULL;
+	enum pc_di_primtype primtype = ctx->primtypes[info->mode];
 	enum a4xx_index_size idx_type;
 	enum pc_di_src_sel src_sel;
 	uint32_t idx_size, idx_offset;
@@ -126,7 +127,12 @@ fd4_draw_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
 		src_sel = DI_SRC_SEL_AUTO_INDEX;
 	}
 
-	fd4_draw(ctx, ring, ctx->primtypes[info->mode], vismode, src_sel,
+	/* points + psize -> spritelist: */
+	if (ctx->rasterizer && ctx->rasterizer->point_size_per_vertex &&
+			(info->mode == PIPE_PRIM_POINTS))
+		primtype = DI_PT_POINTLIST_PSIZE;
+
+	fd4_draw(ctx, ring, primtype, vismode, src_sel,
 			info->count, info->instance_count,
 			idx_type, idx_size, idx_offset, idx_bo);
 }
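The mrt_comp[] change above is the interesting part of the new fd4 clear path: instead of one component-enable value shared by every render target, the clear derives a per-RT RGBA write mask from the PIPE_CLEAR_COLORn request bits and programs it through RB_RENDER_COMPONENTS, so only the buffers actually being cleared are written. The derivation in isolation (the PIPE_CLEAR_COLOR0 bit value is an assumption for the sketch; gallium reserves the low bits for depth/stencil):

	#include <stdint.h>

	#define CLEAR_COLOR0 (1u << 2)   /* assumed gallium bit layout */
	#define MAX_RTS 8

	/* one 4-bit RGBA mask per render target, 0xf only where a
	 * PIPE_CLEAR_COLORn bit was set, as fd4_clear() now does: */
	static void
	clear_write_masks(unsigned buffers, uint8_t comp[MAX_RTS])
	{
		for (unsigned i = 0; i < MAX_RTS; i++)
			comp[i] = (buffers & (CLEAR_COLOR0 << i)) ? 0xf : 0x0;
	}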
enum adreno_state_src src; + debug_assert((regid % 4) == 0); + debug_assert((sizedwords % 4) == 0); + if (prsc) { sz = 0; src = 0x2; // TODO ?? @@ -67,7 +74,7 @@ fd4_emit_constant(struct fd_ringbuffer *ring, OUT_PKT3(ring, CP_LOAD_STATE, 2 + sz); OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(regid/4) | CP_LOAD_STATE_0_STATE_SRC(src) | - CP_LOAD_STATE_0_STATE_BLOCK(sb) | + CP_LOAD_STATE_0_STATE_BLOCK(sb[type]) | CP_LOAD_STATE_0_NUM_UNIT(sizedwords/4)); if (prsc) { struct fd_bo *bo = fd_resource(prsc)->bo; @@ -84,89 +91,31 @@ fd4_emit_constant(struct fd_ringbuffer *ring, } static void -emit_constants(struct fd_ringbuffer *ring, - enum adreno_state_block sb, - struct fd_constbuf_stateobj *constbuf, - struct ir3_shader_variant *shader, - bool emit_immediates) +fd4_emit_const_bo(struct fd_ringbuffer *ring, enum shader_t type, boolean write, + uint32_t regid, uint32_t num, struct fd_bo **bos, uint32_t *offsets) { - uint32_t enabled_mask = constbuf->enabled_mask; - uint32_t max_const; - int i; - - // XXX TODO only emit dirty consts.. but we need to keep track if - // they are clobbered by a clear, gmem2mem, or mem2gmem.. - constbuf->dirty_mask = enabled_mask; - - /* in particular, with binning shader we may end up with unused - * consts, ie. we could end up w/ constlen that is smaller - * than first_immediate. In that case truncate the user consts - * early to avoid HLSQ lockup caused by writing too many consts - */ - max_const = MIN2(shader->first_driver_param, shader->constlen); - - /* emit user constants: */ - if (enabled_mask & 1) { - const unsigned index = 0; - struct pipe_constant_buffer *cb = &constbuf->cb[index]; - unsigned size = align(cb->buffer_size, 4) / 4; /* size in dwords */ - - // I expect that size should be a multiple of vec4's: - assert(size == align(size, 4)); - - /* and even if the start of the const buffer is before - * first_immediate, the end may not be: - */ - size = MIN2(size, 4 * max_const); - - if (size && (constbuf->dirty_mask & (1 << index))) { - fd4_emit_constant(ring, sb, 0, - cb->buffer_offset, size, - cb->user_buffer, cb->buffer); - constbuf->dirty_mask &= ~(1 << index); - } - - enabled_mask &= ~(1 << index); - } - - /* emit ubos: */ - if (shader->constlen > shader->first_driver_param) { - uint32_t params = MIN2(4, shader->constlen - shader->first_driver_param); - OUT_PKT3(ring, CP_LOAD_STATE, 2 + params * 4); - OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(shader->first_driver_param) | - CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) | - CP_LOAD_STATE_0_STATE_BLOCK(sb) | - CP_LOAD_STATE_0_NUM_UNIT(params)); - OUT_RING(ring, CP_LOAD_STATE_1_EXT_SRC_ADDR(0) | - CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS)); - - for (i = 1; i <= params * 4; i++) { - struct pipe_constant_buffer *cb = &constbuf->cb[i]; - assert(!cb->user_buffer); - if ((enabled_mask & (1 << i)) && cb->buffer) - OUT_RELOC(ring, fd_resource(cb->buffer)->bo, cb->buffer_offset, 0, 0); - else - OUT_RING(ring, 0xbad00000 | ((i - 1) << 16)); - } - } - - /* emit shader immediates: */ - if (shader && emit_immediates) { - int size = shader->immediates_count; - uint32_t base = shader->first_immediate; - - /* truncate size to avoid writing constants that shader - * does not use: - */ - size = MIN2(size + base, shader->constlen) - base; + uint32_t i; - /* convert out of vec4: */ - base *= 4; - size *= 4; + debug_assert((regid % 4) == 0); + debug_assert((num % 4) == 0); - if (size > 0) { - fd4_emit_constant(ring, sb, base, - 0, size, shader->immediates[0].val, NULL); + OUT_PKT3(ring, CP_LOAD_STATE, 2 + num); + OUT_RING(ring, 
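
The renamed fd4_emit_const keeps a dual calling convention: pass a CPU-side dword array for a direct (SS_DIRECT) upload, or a BO-backed pipe_resource for an indirect load (the src = 0x2 case above, still marked TODO in the diff). Usage sketch; the first call is taken verbatim from the clear path earlier in this diff, the second is a hypothetical indirect upload:

    /* direct: four dwords of clear color into fragment consts: */
    fd4_emit_const(ring, SHADER_FRAGMENT, 0, 0, 4, color->ui, NULL);

    /* indirect: consts sourced from a BO-backed resource at 'offset',
     * no CPU-side copy of the data needed: */
    fd4_emit_const(ring, SHADER_VERTEX, regid, offset, sizedwords, NULL, prsc);
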
CP_LOAD_STATE_0_DST_OFF(regid/4) | + CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) | + CP_LOAD_STATE_0_STATE_BLOCK(sb[type]) | + CP_LOAD_STATE_0_NUM_UNIT(num/4)); + OUT_RING(ring, CP_LOAD_STATE_1_EXT_SRC_ADDR(0) | + CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS)); + + for (i = 0; i < num; i++) { + if (bos[i]) { + if (write) { + OUT_RELOCW(ring, bos[i], offsets[i], 0, 0); + } else { + OUT_RELOC(ring, bos[i], offsets[i], 0, 0); + } + } else { + OUT_RING(ring, 0xbad00000 | (i << 16)); } } } @@ -223,15 +172,19 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring, const struct fd4_pipe_sampler_view *view = tex->textures[i] ? fd4_pipe_sampler_view(tex->textures[i]) : &dummy_view; - struct fd_resource *rsc = fd_resource(view->base.texture); - unsigned start = view->base.u.tex.first_level; - uint32_t offset = fd_resource_offset(rsc, start, 0); + unsigned start = fd_sampler_first_level(&view->base); OUT_RING(ring, view->texconst0); OUT_RING(ring, view->texconst1); OUT_RING(ring, view->texconst2); OUT_RING(ring, view->texconst3); - OUT_RELOC(ring, rsc->bo, offset, view->textconst4, 0); + if (view->base.texture) { + struct fd_resource *rsc = fd_resource(view->base.texture); + uint32_t offset = fd_resource_offset(rsc, start, 0); + OUT_RELOC(ring, rsc->bo, offset, view->textconst4, 0); + } else { + OUT_RING(ring, 0x00000000); + } OUT_RING(ring, 0x00000000); OUT_RING(ring, 0x00000000); OUT_RING(ring, 0x00000000); @@ -244,51 +197,110 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring, * special cases.. */ void -fd4_emit_gmem_restore_tex(struct fd_ringbuffer *ring, struct pipe_surface *psurf) +fd4_emit_gmem_restore_tex(struct fd_ringbuffer *ring, unsigned nr_bufs, + struct pipe_surface **bufs) { - struct fd_resource *rsc = fd_resource(psurf->texture); - unsigned lvl = psurf->u.tex.level; - struct fd_resource_slice *slice = fd_resource_slice(rsc, lvl); - uint32_t offset = fd_resource_offset(rsc, lvl, psurf->u.tex.first_layer); - enum pipe_format format = fd4_gmem_restore_format(psurf->format); + unsigned char mrt_comp[A4XX_MAX_RENDER_TARGETS]; + int i; - debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer); + for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) { + mrt_comp[i] = (i < nr_bufs) ? 
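
Note the poison value in fd4_emit_const_bo above: a UBO slot with no buffer bound still has to occupy its dword in the CP_LOAD_STATE payload, so it is filled with an obviously-invalid address that folds in the slot index, making a GPU fault traceable back to the offending binding:

    /* unbound slot i -> 0xbad00000 | (i << 16), e.g. slot 2 becomes
     * 0xbad20000; a fault on such an address identifies the slot: */
    OUT_RING(ring, 0xbad00000 | (i << 16));
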
0xf : 0; + } /* output sampler state: */ - OUT_PKT3(ring, CP_LOAD_STATE, 4); + OUT_PKT3(ring, CP_LOAD_STATE, 2 + (2 * nr_bufs)); OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(0) | CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) | CP_LOAD_STATE_0_STATE_BLOCK(SB_FRAG_TEX) | - CP_LOAD_STATE_0_NUM_UNIT(1)); + CP_LOAD_STATE_0_NUM_UNIT(nr_bufs)); OUT_RING(ring, CP_LOAD_STATE_1_STATE_TYPE(ST_SHADER) | CP_LOAD_STATE_1_EXT_SRC_ADDR(0)); - OUT_RING(ring, A4XX_TEX_SAMP_0_XY_MAG(A4XX_TEX_NEAREST) | - A4XX_TEX_SAMP_0_XY_MIN(A4XX_TEX_NEAREST) | - A4XX_TEX_SAMP_0_WRAP_S(A4XX_TEX_CLAMP_TO_EDGE) | - A4XX_TEX_SAMP_0_WRAP_T(A4XX_TEX_CLAMP_TO_EDGE) | - A4XX_TEX_SAMP_0_WRAP_R(A4XX_TEX_REPEAT)); - OUT_RING(ring, 0x00000000); + for (i = 0; i < nr_bufs; i++) { + OUT_RING(ring, A4XX_TEX_SAMP_0_XY_MAG(A4XX_TEX_NEAREST) | + A4XX_TEX_SAMP_0_XY_MIN(A4XX_TEX_NEAREST) | + A4XX_TEX_SAMP_0_WRAP_S(A4XX_TEX_CLAMP_TO_EDGE) | + A4XX_TEX_SAMP_0_WRAP_T(A4XX_TEX_CLAMP_TO_EDGE) | + A4XX_TEX_SAMP_0_WRAP_R(A4XX_TEX_REPEAT)); + OUT_RING(ring, 0x00000000); + } /* emit texture state: */ - OUT_PKT3(ring, CP_LOAD_STATE, 10); + OUT_PKT3(ring, CP_LOAD_STATE, 2 + (8 * nr_bufs)); OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(0) | CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) | CP_LOAD_STATE_0_STATE_BLOCK(SB_FRAG_TEX) | - CP_LOAD_STATE_0_NUM_UNIT(1)); + CP_LOAD_STATE_0_NUM_UNIT(nr_bufs)); OUT_RING(ring, CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS) | CP_LOAD_STATE_1_EXT_SRC_ADDR(0)); - OUT_RING(ring, A4XX_TEX_CONST_0_FMT(fd4_pipe2tex(format)) | - A4XX_TEX_CONST_0_TYPE(A4XX_TEX_2D) | - fd4_tex_swiz(format, PIPE_SWIZZLE_RED, PIPE_SWIZZLE_GREEN, - PIPE_SWIZZLE_BLUE, PIPE_SWIZZLE_ALPHA)); - OUT_RING(ring, A4XX_TEX_CONST_1_WIDTH(psurf->width) | - A4XX_TEX_CONST_1_HEIGHT(psurf->height)); - OUT_RING(ring, A4XX_TEX_CONST_2_PITCH(slice->pitch * rsc->cpp)); - OUT_RING(ring, 0x00000000); - OUT_RELOC(ring, rsc->bo, offset, 0, 0); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); + for (i = 0; i < nr_bufs; i++) { + if (bufs[i]) { + struct fd_resource *rsc = fd_resource(bufs[i]->texture); + /* note: PIPE_BUFFER disallowed for surfaces */ + unsigned lvl = bufs[i]->u.tex.level; + struct fd_resource_slice *slice = fd_resource_slice(rsc, lvl); + uint32_t offset = fd_resource_offset(rsc, lvl, bufs[i]->u.tex.first_layer); + enum pipe_format format = fd4_gmem_restore_format(bufs[i]->format); + + /* The restore blit_zs shader expects stencil in sampler 0, + * and depth in sampler 1 + */ + if (rsc->stencil && (i == 0)) { + rsc = rsc->stencil; + format = fd4_gmem_restore_format(rsc->base.b.format); + } + + /* z32 restore is accomplished using depth write. If there is + * no stencil component (ie. 
PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) + * then no render target: + * + * (The same applies for z32_s8x24, since for stencil sampler + * state the above 'if' will replace 'format' with s8) + */ + if ((format == PIPE_FORMAT_Z32_FLOAT) || + (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)) + mrt_comp[i] = 0; + + debug_assert(bufs[i]->u.tex.first_layer == bufs[i]->u.tex.last_layer); + + OUT_RING(ring, A4XX_TEX_CONST_0_FMT(fd4_pipe2tex(format)) | + A4XX_TEX_CONST_0_TYPE(A4XX_TEX_2D) | + fd4_tex_swiz(format, PIPE_SWIZZLE_RED, PIPE_SWIZZLE_GREEN, + PIPE_SWIZZLE_BLUE, PIPE_SWIZZLE_ALPHA)); + OUT_RING(ring, A4XX_TEX_CONST_1_WIDTH(bufs[i]->width) | + A4XX_TEX_CONST_1_HEIGHT(bufs[i]->height)); + OUT_RING(ring, A4XX_TEX_CONST_2_PITCH(slice->pitch * rsc->cpp)); + OUT_RING(ring, 0x00000000); + OUT_RELOC(ring, rsc->bo, offset, 0, 0); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + } else { + OUT_RING(ring, A4XX_TEX_CONST_0_FMT(0) | + A4XX_TEX_CONST_0_TYPE(A4XX_TEX_2D) | + A4XX_TEX_CONST_0_SWIZ_X(A4XX_TEX_ONE) | + A4XX_TEX_CONST_0_SWIZ_Y(A4XX_TEX_ONE) | + A4XX_TEX_CONST_0_SWIZ_Z(A4XX_TEX_ONE) | + A4XX_TEX_CONST_0_SWIZ_W(A4XX_TEX_ONE)); + OUT_RING(ring, A4XX_TEX_CONST_1_WIDTH(0) | + A4XX_TEX_CONST_1_HEIGHT(0)); + OUT_RING(ring, A4XX_TEX_CONST_2_PITCH(0)); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + } + } + + OUT_PKT0(ring, REG_A4XX_RB_RENDER_COMPONENTS, 1); + OUT_RING(ring, A4XX_RB_RENDER_COMPONENTS_RT0(mrt_comp[0]) | + A4XX_RB_RENDER_COMPONENTS_RT1(mrt_comp[1]) | + A4XX_RB_RENDER_COMPONENTS_RT2(mrt_comp[2]) | + A4XX_RB_RENDER_COMPONENTS_RT3(mrt_comp[3]) | + A4XX_RB_RENDER_COMPONENTS_RT4(mrt_comp[4]) | + A4XX_RB_RENDER_COMPONENTS_RT5(mrt_comp[5]) | + A4XX_RB_RENDER_COMPONENTS_RT6(mrt_comp[6]) | + A4XX_RB_RENDER_COMPONENTS_RT7(mrt_comp[7])); } void @@ -298,7 +310,9 @@ fd4_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd4_emit *emit) uint32_t total_in = 0; const struct fd_vertex_state *vtx = emit->vtx; struct ir3_shader_variant *vp = fd4_emit_get_vp(emit); - unsigned vertex_regid = regid(63, 0), instance_regid = regid(63, 0); + unsigned vertex_regid = regid(63, 0); + unsigned instance_regid = regid(63, 0); + unsigned vtxcnt_regid = regid(63, 0); for (i = 0; i < vp->inputs_count; i++) { uint8_t semantic = sem2name(vp->inputs[i].semantic); @@ -306,6 +320,8 @@ fd4_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd4_emit *emit) vertex_regid = vp->inputs[i].regid; else if (semantic == TGSI_SEMANTIC_INSTANCEID) instance_regid = vp->inputs[i].regid; + else if (semantic == IR3_SEMANTIC_VTXCNT) + vtxcnt_regid = vp->inputs[i].regid; else if ((i < vtx->vtx->num_elements) && vp->inputs[i].compmask) last = i; } @@ -313,7 +329,8 @@ fd4_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd4_emit *emit) /* hw doesn't like to be configured for zero vbo's, it seems: */ if ((vtx->vtx->num_elements == 0) && (vertex_regid == regid(63, 0)) && - (instance_regid == regid(63, 0))) + (instance_regid == regid(63, 0)) && + (vtxcnt_regid == regid(63, 0))) return; for (i = 0, j = 0; i <= last; i++) { @@ -327,7 +344,8 @@ fd4_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd4_emit *emit) enum a4xx_vtx_fmt fmt = fd4_pipe2vtx(pfmt); bool switchnext = (i != last) || (vertex_regid != regid(63, 0)) || - (instance_regid != regid(63, 0)); + (instance_regid != regid(63, 0)) || + (vtxcnt_regid != regid(63, 0)); bool isint = util_format_is_pure_integer(pfmt); uint32_t fs = 
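
The vtxcnt plumbing added to fd4_emit_vertex_bufs above feeds stream-out: regid(63, 0) is the ir3 "not used" sentinel, and VFD_CONTROL_3, a few lines further on, is now programmed with the real register whenever the vertex shader consumes IR3_SEMANTIC_VTXCNT. A sketch of the sentinel convention, assuming ir3's (register << 2) | component regid encoding:

    /* 63.x, i.e. regid(63, 0), means "no register assigned": */
    bool uses_sysvals = (vertex_regid   != regid(63, 0)) ||
                        (instance_regid != regid(63, 0)) ||
                        (vtxcnt_regid   != regid(63, 0));
    /* vertex-buffer setup is skipped only when there are no vertex
     * elements and none of these system values are consumed. */
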
util_format_get_blocksize(pfmt); uint32_t off = vb->buffer_offset + elem->src_offset; @@ -368,7 +386,7 @@ fd4_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd4_emit *emit) A4XX_VFD_CONTROL_1_REGID4VTX(vertex_regid) | A4XX_VFD_CONTROL_1_REGID4INST(instance_regid)); OUT_RING(ring, 0x00000000); /* XXX VFD_CONTROL_2 */ - OUT_RING(ring, A4XX_VFD_CONTROL_3_REGID_VTXCNT(regid(63, 0))); + OUT_RING(ring, A4XX_VFD_CONTROL_3_REGID_VTXCNT(vtxcnt_regid)); OUT_RING(ring, 0x00000000); /* XXX VFD_CONTROL_4 */ /* cache invalidate, otherwise vertex fetch could see @@ -389,6 +407,25 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, emit_marker(ring, 5); + if ((dirty & FD_DIRTY_FRAMEBUFFER) && !emit->key.binning_pass) { + struct pipe_framebuffer_state *pfb = &ctx->framebuffer; + unsigned char mrt_comp[A4XX_MAX_RENDER_TARGETS] = {0}; + + for (unsigned i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) { + mrt_comp[i] = ((i < pfb->nr_cbufs) && pfb->cbufs[i]) ? 0xf : 0; + } + + OUT_PKT0(ring, REG_A4XX_RB_RENDER_COMPONENTS, 1); + OUT_RING(ring, A4XX_RB_RENDER_COMPONENTS_RT0(mrt_comp[0]) | + A4XX_RB_RENDER_COMPONENTS_RT1(mrt_comp[1]) | + A4XX_RB_RENDER_COMPONENTS_RT2(mrt_comp[2]) | + A4XX_RB_RENDER_COMPONENTS_RT3(mrt_comp[3]) | + A4XX_RB_RENDER_COMPONENTS_RT4(mrt_comp[4]) | + A4XX_RB_RENDER_COMPONENTS_RT5(mrt_comp[5]) | + A4XX_RB_RENDER_COMPONENTS_RT6(mrt_comp[6]) | + A4XX_RB_RENDER_COMPONENTS_RT7(mrt_comp[7])); + } + if ((dirty & (FD_DIRTY_ZSA | FD_DIRTY_PROG)) && !emit->key.binning_pass) { uint32_t val = fd4_zsa_stateobj(ctx->zsa)->rb_render_control; @@ -513,43 +550,24 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, OUT_RING(ring, A4XX_GRAS_CL_VPORT_ZSCALE_0(ctx->viewport.scale[2])); } - if (dirty & FD_DIRTY_PROG) - fd4_program_emit(ring, emit); - - if ((dirty & (FD_DIRTY_PROG | FD_DIRTY_CONSTBUF)) && - /* evil hack to deal sanely with clear path: */ - (emit->prog == &ctx->prog)) { - fd_wfi(ctx, ring); - emit_constants(ring, SB_VERT_SHADER, - &ctx->constbuf[PIPE_SHADER_VERTEX], - vp, emit->prog->dirty & FD_SHADER_DIRTY_VP); - if (!emit->key.binning_pass) { - emit_constants(ring, SB_FRAG_SHADER, - &ctx->constbuf[PIPE_SHADER_FRAGMENT], - fp, emit->prog->dirty & FD_SHADER_DIRTY_FP); - } + if (dirty & FD_DIRTY_PROG) { + struct pipe_framebuffer_state *pfb = &ctx->framebuffer; + fd4_program_emit(ring, emit, pfb->nr_cbufs, pfb->cbufs); } - /* emit driver params every time */ - if (emit->info && emit->prog == &ctx->prog) { - uint32_t vertex_params[4] = { - emit->info->indexed ? 
emit->info->index_bias : emit->info->start, - 0, - 0, - 0 - }; - if (vp->constlen >= vp->first_driver_param + 4) { - fd4_emit_constant(ring, SB_VERT_SHADER, - (vp->first_driver_param + 4) * 4, - 0, 4, vertex_params, NULL); - } + if (emit->prog == &ctx->prog) { /* evil hack to deal sanely with clear path */ + ir3_emit_consts(vp, ring, emit->info, dirty); + if (!emit->key.binning_pass) + ir3_emit_consts(fp, ring, emit->info, dirty); + /* mark clean after emitting consts: */ + ctx->prog.dirty = 0; } if ((dirty & FD_DIRTY_BLEND) && ctx->blend) { struct fd4_blend_stateobj *blend = fd4_blend_stateobj(ctx->blend); uint32_t i; - for (i = 0; i < 8; i++) { + for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) { OUT_PKT0(ring, REG_A4XX_RB_MRT_CONTROL(i), 1); OUT_RING(ring, blend->rb_mrt[i].control); @@ -607,10 +625,10 @@ fd4_emit_restore(struct fd_context *ctx) OUT_PKT0(ring, REG_A4XX_GRAS_DEBUG_ECO_CONTROL, 1); OUT_RING(ring, 0x00000000); - OUT_PKT0(ring, REG_A4XX_UNKNOWN_0EC3, 1); + OUT_PKT0(ring, REG_A4XX_SP_MODE_CONTROL, 1); OUT_RING(ring, 0x00000006); - OUT_PKT0(ring, REG_A4XX_UNKNOWN_0F03, 1); + OUT_PKT0(ring, REG_A4XX_TPL1_TP_MODE_CONTROL, 1); OUT_RING(ring, 0x0000003a); OUT_PKT0(ring, REG_A4XX_UNKNOWN_0D01, 1); @@ -629,7 +647,7 @@ fd4_emit_restore(struct fd_context *ctx) OUT_RING(ring, 0x00000000); OUT_RING(ring, 0x00000012); - OUT_PKT0(ring, REG_A4XX_UNKNOWN_0E05, 1); + OUT_PKT0(ring, REG_A4XX_HLSQ_MODE_CONTROL, 1); OUT_RING(ring, 0x00000000); OUT_PKT0(ring, REG_A4XX_UNKNOWN_0CC5, 1); @@ -752,9 +770,6 @@ fd4_emit_restore(struct fd_context *ctx) OUT_PKT0(ring, REG_A4XX_RB_FS_OUTPUT, 1); OUT_RING(ring, A4XX_RB_FS_OUTPUT_SAMPLE_MASK(0xffff)); - OUT_PKT0(ring, REG_A4XX_RB_RENDER_COMPONENTS, 1); - OUT_RING(ring, A4XX_RB_RENDER_COMPONENTS_RT0(0xf)); - OUT_PKT0(ring, REG_A4XX_GRAS_CLEAR_CNTL, 1); OUT_RING(ring, A4XX_GRAS_CLEAR_CNTL_NOT_FASTCLEAR); @@ -763,3 +778,11 @@ fd4_emit_restore(struct fd_context *ctx) ctx->needs_rb_fbd = true; } + +void +fd4_emit_init(struct pipe_context *pctx) +{ + struct fd_context *ctx = fd_context(pctx); + ctx->emit_const = fd4_emit_const; + ctx->emit_const_bo = fd4_emit_const_bo; +} diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.h b/src/gallium/drivers/freedreno/a4xx/fd4_emit.h index 7d059f8e532..ab7850e50b0 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.h +++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.h @@ -37,15 +37,13 @@ #include "ir3_shader.h" struct fd_ringbuffer; -enum adreno_state_block; -void fd4_emit_constant(struct fd_ringbuffer *ring, - enum adreno_state_block sb, +void fd4_emit_const(struct fd_ringbuffer *ring, enum shader_t type, uint32_t regid, uint32_t offset, uint32_t sizedwords, const uint32_t *dwords, struct pipe_resource *prsc); void fd4_emit_gmem_restore_tex(struct fd_ringbuffer *ring, - struct pipe_surface *psurf); + unsigned nr_bufs, struct pipe_surface **bufs); /* grouped together emit-state for prog/vertex/state emit: */ struct fd4_emit { @@ -53,10 +51,12 @@ struct fd4_emit { const struct fd_program_stateobj *prog; const struct pipe_draw_info *info; struct ir3_shader_key key; - enum a4xx_color_fmt format; - enum pipe_format pformat; uint32_t dirty; + uint32_t sprite_coord_enable; /* bitmask */ + bool sprite_coord_mode; + bool rasterflat; + /* cached to avoid repeated lookups of same variants: */ struct ir3_shader_variant *vp, *fp; /* TODO: other shader stages.. 
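
fd4_emit_init, added at the end of the fd4_emit.c hunk above, hooks the a4xx const emitters into function pointers on fd_context; that indirection is what lets the const-emission logic move out of the per-generation backend into shared ir3 code (the ir3_emit_consts calls above). A hypothetical call site in generation-independent code might look like:

    /* sketch: shared code uploads consts without knowing the GPU
     * generation ('type', 'base', 'dwords' etc. are assumed names): */
    ctx->emit_const(ring, type, base, 0, sizedwords, dwords, NULL);
    ctx->emit_const_bo(ring, type, false /* read-only */,
            regid, num, bos, offsets);
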
*/ @@ -96,4 +96,6 @@ void fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, void fd4_emit_restore(struct fd_context *ctx); +void fd4_emit_init(struct pipe_context *pctx); + #endif /* FD4_EMIT_H */ diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_format.c b/src/gallium/drivers/freedreno/a4xx/fd4_format.c index 29abe0b0cc3..3e0045449eb 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_format.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_format.c @@ -89,6 +89,14 @@ static struct fd4_format formats[PIPE_FORMAT_COUNT] = { _T(L8_UNORM, 8_UNORM, R8_UNORM, WZYX), _T(I8_UNORM, 8_UNORM, NONE, WZYX), + /* NOTE: should be TFMT_8_UINT (which then gets remapped to + * TFMT_8_UNORM for mem2gmem in _gmem_restore_format()), but + * we don't know TFMT_8_UINT yet.. so just use TFMT_8_UNORM + * for now.. sampling from stencil as a texture might not + * work right, but at least should be fine for zsbuf.. + */ + _T(S8_UINT, 8_UNORM, R8_UNORM, WZYX), + /* 16-bit */ V_(R16_UNORM, 16_UNORM, NONE, WZYX), V_(R16_SNORM, 16_SNORM, NONE, WZYX), @@ -96,7 +104,7 @@ static struct fd4_format formats[PIPE_FORMAT_COUNT] = { VT(R16_SINT, 16_SINT, R16_SINT, WZYX), V_(R16_USCALED, 16_UINT, NONE, WZYX), V_(R16_SSCALED, 16_UINT, NONE, WZYX), - VT(R16_FLOAT, 16_FLOAT, NONE, WZYX), + VT(R16_FLOAT, 16_FLOAT, R16_FLOAT,WZYX), _T(A16_UINT, 16_UINT, NONE, WZYX), _T(A16_SINT, 16_SINT, NONE, WZYX), @@ -132,7 +140,7 @@ static struct fd4_format formats[PIPE_FORMAT_COUNT] = { VT(R32_SINT, 32_SINT, R32_SINT, WZYX), V_(R32_USCALED, 32_UINT, NONE, WZYX), V_(R32_SSCALED, 32_UINT, NONE, WZYX), - VT(R32_FLOAT, 32_FLOAT, NONE, WZYX), + VT(R32_FLOAT, 32_FLOAT, R32_FLOAT,WZYX), V_(R32_FIXED, 32_FIXED, NONE, WZYX), _T(A32_UINT, 32_UINT, NONE, WZYX), @@ -148,7 +156,7 @@ static struct fd4_format formats[PIPE_FORMAT_COUNT] = { VT(R16G16_SINT, 16_16_SINT, R16G16_SINT, WZYX), V_(R16G16_USCALED, 16_16_UINT, NONE, WZYX), V_(R16G16_SSCALED, 16_16_SINT, NONE, WZYX), - VT(R16G16_FLOAT, 16_16_FLOAT, NONE, WZYX), + VT(R16G16_FLOAT, 16_16_FLOAT, R16G16_FLOAT,WZYX), _T(L16A16_UINT, 16_16_UINT, NONE, WZYX), _T(L16A16_SINT, 16_16_SINT, NONE, WZYX), @@ -191,7 +199,8 @@ static struct fd4_format formats[PIPE_FORMAT_COUNT] = { _T(Z24X8_UNORM, X8Z24_UNORM, R8G8B8A8_UNORM, WZYX), _T(Z24_UNORM_S8_UINT, X8Z24_UNORM, R8G8B8A8_UNORM, WZYX), - /*_T(Z32_FLOAT, Z32_FLOAT, R8G8B8A8_UNORM, WZYX),*/ + _T(Z32_FLOAT, 32_FLOAT, R8G8B8A8_UNORM, WZYX), + _T(Z32_FLOAT_S8X24_UINT, 32_FLOAT,R8G8B8A8_UNORM, WZYX), /* 48-bit */ V_(R16G16B16_UNORM, 16_16_16_UNORM, NONE, WZYX), @@ -218,7 +227,7 @@ static struct fd4_format formats[PIPE_FORMAT_COUNT] = { VT(R32G32_SINT, 32_32_SINT, R32G32_SINT, WZYX), V_(R32G32_USCALED, 32_32_UINT, NONE, WZYX), V_(R32G32_SSCALED, 32_32_SINT, NONE, WZYX), - VT(R32G32_FLOAT, 32_32_FLOAT, NONE, WZYX), + VT(R32G32_FLOAT, 32_32_FLOAT, R32G32_FLOAT,WZYX), V_(R32G32_FIXED, 32_32_FIXED, NONE, WZYX), _T(L32A32_UINT, 32_32_UINT, NONE, WZYX), @@ -282,6 +291,9 @@ fd4_pipe2swap(enum pipe_format format) enum a4xx_tex_fetchsize fd4_pipe2fetchsize(enum pipe_format format) { + if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) + format = PIPE_FORMAT_Z32_FLOAT; + switch (util_format_get_blocksizebits(format)) { case 8: return TFETCH4_1_BYTE; case 16: return TFETCH4_2_BYTE; @@ -312,6 +324,8 @@ fd4_gmem_restore_format(enum pipe_format format) return PIPE_FORMAT_R8G8B8A8_UNORM; case PIPE_FORMAT_Z16_UNORM: return PIPE_FORMAT_R8G8_UNORM; + case PIPE_FORMAT_S8_UINT: + return PIPE_FORMAT_R8_UNORM; default: return format; } @@ -328,6 +342,9 @@ fd4_pipe2depth(enum 
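
The format-table changes above are what make separate stencil and 32-bit float depth workable: S8_UINT becomes restorable by mapping to R8_UNORM for the mem2gmem blit, the float formats gain render-target entries, and Z32_FLOAT_S8X24_UINT takes its fetch size from the depth plane alone, because with separate stencil the "interleaved" pipe format is actually stored as two planes. The special case, as in fd4_pipe2fetchsize above:

    /* stencil lives in its own resource, so size texture fetches for
     * the 32bpp depth plane (-> TFETCH4_4_BYTE), not the 64bpp
     * combined format: */
    if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
        format = PIPE_FORMAT_Z32_FLOAT;
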
pipe_format format) case PIPE_FORMAT_X8Z24_UNORM: case PIPE_FORMAT_S8_UINT_Z24_UNORM: return DEPTH4_24_8; + case PIPE_FORMAT_Z32_FLOAT: + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + return DEPTH4_32; default: return ~0; } diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c index 9a905062071..81c37f72565 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c @@ -44,12 +44,6 @@ #include "fd4_format.h" #include "fd4_zsa.h" -static const struct ir3_shader_key key = { - // XXX should set this based on render target format! We don't - // want half_precision if float32 render target!!! - .half_precision = true, -}; - static void emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs, struct pipe_surface **bufs, uint32_t *bases, uint32_t bin_w) @@ -63,7 +57,7 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs, tile_mode = TILE4_LINEAR; } - for (i = 0; i < 8; i++) { + for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) { enum a4xx_color_fmt format = 0; enum a3xx_color_swap swap = WZYX; struct fd_resource *rsc = NULL; @@ -74,11 +68,23 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs, if ((i < nr_bufs) && bufs[i]) { struct pipe_surface *psurf = bufs[i]; + enum pipe_format pformat = 0; rsc = fd_resource(psurf->texture); + pformat = psurf->format; + + /* In case we're drawing to Z32F_S8, the "color" actually goes to + * the stencil + */ + if (rsc->stencil) { + rsc = rsc->stencil; + pformat = rsc->base.b.format; + bases++; + } + slice = fd_resource_slice(rsc, psurf->u.tex.level); - format = fd4_pipe2color(psurf->format); - swap = fd4_pipe2swap(psurf->format); + format = fd4_pipe2color(pformat); + swap = fd4_pipe2swap(pformat); debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer); @@ -94,6 +100,8 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs, } else { stride = slice->pitch * rsc->cpp; } + } else if ((i < nr_bufs) && bases) { + base = bases[i]; } OUT_PKT0(ring, REG_A4XX_RB_MRT_BUF_INFO(i), 3); @@ -101,7 +109,7 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs, A4XX_RB_MRT_BUF_INFO_COLOR_TILE_MODE(tile_mode) | A4XX_RB_MRT_BUF_INFO_COLOR_BUF_PITCH(stride) | A4XX_RB_MRT_BUF_INFO_COLOR_SWAP(swap)); - if (bin_w || (i >= nr_bufs)) { + if (bin_w || (i >= nr_bufs) || !bufs[i]) { OUT_RING(ring, base); OUT_RING(ring, A4XX_RB_MRT_CONTROL3_STRIDE(stride)); } else { @@ -115,30 +123,26 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs, } } -static uint32_t -depth_base(struct fd_context *ctx) -{ - struct fd_gmem_stateobj *gmem = &ctx->gmem; - struct pipe_framebuffer_state *pfb = &ctx->framebuffer; - uint32_t cpp = 4; - if (pfb->cbufs[0]) { - struct fd_resource *rsc = - fd_resource(pfb->cbufs[0]->texture); - cpp = rsc->cpp; - } - return align(gmem->bin_w * gmem->bin_h * cpp, 0x4000); -} - /* transfer from gmem to system memory (ie. 
normal RAM) */ static void -emit_gmem2mem_surf(struct fd_context *ctx, +emit_gmem2mem_surf(struct fd_context *ctx, bool stencil, uint32_t base, struct pipe_surface *psurf) { struct fd_ringbuffer *ring = ctx->ring; struct fd_resource *rsc = fd_resource(psurf->texture); - struct fd_resource_slice *slice = &rsc->slices[psurf->u.tex.level]; - uint32_t offset = fd_resource_offset(rsc, psurf->u.tex.level, + enum pipe_format pformat = psurf->format; + struct fd_resource_slice *slice; + uint32_t offset; + + if (stencil) { + debug_assert(rsc->stencil); + rsc = rsc->stencil; + pformat = rsc->base.b.format; + } + + slice = &rsc->slices[psurf->u.tex.level]; + offset = fd_resource_offset(rsc, psurf->u.tex.level, psurf->u.tex.first_layer); debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer); @@ -150,10 +154,10 @@ emit_gmem2mem_surf(struct fd_context *ctx, OUT_RELOCW(ring, rsc->bo, offset, 0, 0); /* RB_COPY_DEST_BASE */ OUT_RING(ring, A4XX_RB_COPY_DEST_PITCH_PITCH(slice->pitch * rsc->cpp)); OUT_RING(ring, A4XX_RB_COPY_DEST_INFO_TILE(TILE4_LINEAR) | - A4XX_RB_COPY_DEST_INFO_FORMAT(fd4_pipe2color(psurf->format)) | + A4XX_RB_COPY_DEST_INFO_FORMAT(fd4_pipe2color(pformat)) | A4XX_RB_COPY_DEST_INFO_COMPONENT_ENABLE(0xf) | A4XX_RB_COPY_DEST_INFO_ENDIAN(ENDIAN_NONE) | - A4XX_RB_COPY_DEST_INFO_SWAP(fd4_pipe2swap(psurf->format))); + A4XX_RB_COPY_DEST_INFO_SWAP(fd4_pipe2swap(pformat))); fd4_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, DI_SRC_SEL_AUTO_INDEX, 2, 1, INDEX_SIZE_IGN, 0, 0, NULL); @@ -163,13 +167,15 @@ static void fd4_emit_tile_gmem2mem(struct fd_context *ctx, struct fd_tile *tile) { struct fd4_context *fd4_ctx = fd4_context(ctx); + struct fd_gmem_stateobj *gmem = &ctx->gmem; struct fd_ringbuffer *ring = ctx->ring; struct pipe_framebuffer_state *pfb = &ctx->framebuffer; struct fd4_emit emit = { .vtx = &fd4_ctx->solid_vbuf_state, .prog = &ctx->solid_prog, - .key = key, - .format = fd4_emit_format(pfb->cbufs[0]), + .key = { + .half_precision = true, + }, }; OUT_PKT0(ring, REG_A4XX_RB_DEPTH_CONTROL, 1); @@ -238,16 +244,26 @@ fd4_emit_tile_gmem2mem(struct fd_context *ctx, struct fd_tile *tile) OUT_RING(ring, 0); /* VFD_INDEX_OFFSET */ OUT_RING(ring, 0); /* ??? 
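
Removing depth_base() (above) fixes a real limitation: it derived the depth tile offset from cbuf0's cpp alone, which cannot work with multiple color buffers or separate stencil. The per-buffer offsets are instead precomputed in the gmem state object; a sketch of the fields the hunks just below rely on (the actual declarations live in the shared freedreno headers, not in this diff):

    struct fd_gmem_stateobj {
        /* ... */
        uint32_t cbuf_base[MAX_RENDER_TARGETS]; /* per-MRT tile offset */
        uint32_t zsbuf_base[2];                 /* [0] depth, [1] stencil */
    };
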
UNKNOWN_2209 */ - fd4_program_emit(ring, &emit); + fd4_program_emit(ring, &emit, 0, NULL); fd4_emit_vertex_bufs(ring, &emit); if (ctx->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) { - uint32_t base = depth_base(ctx); - emit_gmem2mem_surf(ctx, base, pfb->zsbuf); + struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); + if (!rsc->stencil || (ctx->resolve & FD_BUFFER_DEPTH)) + emit_gmem2mem_surf(ctx, false, ctx->gmem.zsbuf_base[0], pfb->zsbuf); + if (rsc->stencil && (ctx->resolve & FD_BUFFER_STENCIL)) + emit_gmem2mem_surf(ctx, true, ctx->gmem.zsbuf_base[1], pfb->zsbuf); } if (ctx->resolve & FD_BUFFER_COLOR) { - emit_gmem2mem_surf(ctx, 0, pfb->cbufs[0]); + unsigned i; + for (i = 0; i < pfb->nr_cbufs; i++) { + if (!pfb->cbufs[i]) + continue; + if (!(ctx->resolve & (PIPE_CLEAR_COLOR0 << i))) + continue; + emit_gmem2mem_surf(ctx, false, gmem->cbuf_base[i], pfb->cbufs[i]); + } } OUT_PKT0(ring, REG_A4XX_GRAS_SC_CONTROL, 1); @@ -260,14 +276,25 @@ fd4_emit_tile_gmem2mem(struct fd_context *ctx, struct fd_tile *tile) /* transfer from system memory to gmem */ static void -emit_mem2gmem_surf(struct fd_context *ctx, uint32_t base, - struct pipe_surface *psurf, uint32_t bin_w) +emit_mem2gmem_surf(struct fd_context *ctx, uint32_t *bases, + struct pipe_surface **bufs, uint32_t nr_bufs, uint32_t bin_w) { struct fd_ringbuffer *ring = ctx->ring; + struct pipe_surface *zsbufs[2]; + + emit_mrt(ring, nr_bufs, bufs, bases, bin_w); + + if (bufs[0] && (bufs[0]->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)) { + /* The gmem_restore_tex logic will put the first buffer's stencil + * as color. Supply it with the proper information to make that + * happen. + */ + zsbufs[0] = zsbufs[1] = bufs[0]; + bufs = zsbufs; + nr_bufs = 2; + } - emit_mrt(ring, 1, &psurf, &base, bin_w); - - fd4_emit_gmem_restore_tex(ring, psurf); + fd4_emit_gmem_restore_tex(ring, nr_bufs, bufs); fd4_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, DI_SRC_SEL_AUTO_INDEX, 2, 1, INDEX_SIZE_IGN, 0, 0, NULL); @@ -282,10 +309,14 @@ fd4_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile) struct pipe_framebuffer_state *pfb = &ctx->framebuffer; struct fd4_emit emit = { .vtx = &fd4_ctx->blit_vbuf_state, + .sprite_coord_enable = 1, + /* NOTE: They all use the same VP, this is for vtx bufs. */ .prog = &ctx->blit_prog[0], - .key = key, - .format = fd4_emit_format(pfb->cbufs[0]), + .key = { + .half_precision = fd_half_precision(pfb), + }, }; + unsigned char mrt_comp[A4XX_MAX_RENDER_TARGETS] = {0}; float x0, y0, x1, y1; unsigned bin_w = tile->bin_w; unsigned bin_h = tile->bin_h; @@ -304,7 +335,9 @@ fd4_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile) OUT_RING(ring, fui(x1)); OUT_RING(ring, fui(y1)); - for (i = 0; i < 8; i++) { + for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) { + mrt_comp[i] = ((i < pfb->nr_cbufs) && pfb->cbufs[i]) ? 
0xf : 0; + OUT_PKT0(ring, REG_A4XX_RB_MRT_CONTROL(i), 1); OUT_RING(ring, A4XX_RB_MRT_CONTROL_FASTCLEAR | A4XX_RB_MRT_CONTROL_B11 | @@ -319,6 +352,16 @@ fd4_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile) A4XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR(FACTOR_ZERO)); } + OUT_PKT0(ring, REG_A4XX_RB_RENDER_COMPONENTS, 1); + OUT_RING(ring, A4XX_RB_RENDER_COMPONENTS_RT0(mrt_comp[0]) | + A4XX_RB_RENDER_COMPONENTS_RT1(mrt_comp[1]) | + A4XX_RB_RENDER_COMPONENTS_RT2(mrt_comp[2]) | + A4XX_RB_RENDER_COMPONENTS_RT3(mrt_comp[3]) | + A4XX_RB_RENDER_COMPONENTS_RT4(mrt_comp[4]) | + A4XX_RB_RENDER_COMPONENTS_RT5(mrt_comp[5]) | + A4XX_RB_RENDER_COMPONENTS_RT6(mrt_comp[6]) | + A4XX_RB_RENDER_COMPONENTS_RT7(mrt_comp[7])); + OUT_PKT0(ring, REG_A4XX_RB_RENDER_CONTROL, 1); OUT_RING(ring, 0x8); /* XXX RB_RENDER_CONTROL */ @@ -381,7 +424,6 @@ fd4_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile) OUT_RING(ring, 0); /* VFD_INDEX_OFFSET */ OUT_RING(ring, 0); /* ??? UNKNOWN_2209 */ - fd4_program_emit(ring, &emit); fd4_emit_vertex_bufs(ring, &emit); /* for gmem pitch/base calculations, we need to use the non- @@ -390,11 +432,46 @@ fd4_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile) bin_w = gmem->bin_w; bin_h = gmem->bin_h; - if (fd_gmem_needs_restore(ctx, tile, FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) - emit_mem2gmem_surf(ctx, depth_base(ctx), pfb->zsbuf, bin_w); + if (fd_gmem_needs_restore(ctx, tile, FD_BUFFER_COLOR)) { + emit.prog = &ctx->blit_prog[pfb->nr_cbufs - 1]; + emit.fp = NULL; /* frag shader changed so clear cache */ + fd4_program_emit(ring, &emit, pfb->nr_cbufs, pfb->cbufs); + emit_mem2gmem_surf(ctx, gmem->cbuf_base, pfb->cbufs, pfb->nr_cbufs, bin_w); + } - if (fd_gmem_needs_restore(ctx, tile, FD_BUFFER_COLOR)) - emit_mem2gmem_surf(ctx, 0, pfb->cbufs[0], bin_w); + if (fd_gmem_needs_restore(ctx, tile, FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) { + switch (pfb->zsbuf->format) { + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + case PIPE_FORMAT_Z32_FLOAT: + emit.prog = (pfb->zsbuf->format == PIPE_FORMAT_Z32_FLOAT) ? + &ctx->blit_z : &ctx->blit_zs; + emit.key.half_precision = false; + + OUT_PKT0(ring, REG_A4XX_RB_DEPTH_CONTROL, 1); + OUT_RING(ring, A4XX_RB_DEPTH_CONTROL_Z_ENABLE | + A4XX_RB_DEPTH_CONTROL_Z_WRITE_ENABLE | + A4XX_RB_DEPTH_CONTROL_ZFUNC(FUNC_ALWAYS) | + A4XX_RB_DEPTH_CONTROL_EARLY_Z_DISABLE); + + OUT_PKT0(ring, REG_A4XX_GRAS_ALPHA_CONTROL, 1); + OUT_RING(ring, A4XX_GRAS_ALPHA_CONTROL_ALPHA_TEST_ENABLE); + + OUT_PKT0(ring, REG_A4XX_GRAS_CL_CLIP_CNTL, 1); + OUT_RING(ring, 0x80000); /* GRAS_CL_CLIP_CNTL */ + + break; + default: + /* Non-float can use a regular color write. It's split over 8-bit + * components, so half precision is always sufficient. 
+ */ + emit.prog = &ctx->blit_prog[0]; + emit.key.half_precision = true; + break; + } + emit.fp = NULL; /* frag shader changed so clear cache */ + fd4_program_emit(ring, &emit, 1, &pfb->zsbuf); + emit_mem2gmem_surf(ctx, gmem->zsbuf_base, &pfb->zsbuf, 1, bin_w); + } OUT_PKT0(ring, REG_A4XX_GRAS_SC_CONTROL, 1); OUT_RING(ring, A4XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | @@ -534,21 +611,35 @@ fd4_emit_tile_prep(struct fd_context *ctx, struct fd_tile *tile) struct fd_ringbuffer *ring = ctx->ring; struct pipe_framebuffer_state *pfb = &ctx->framebuffer; struct fd_gmem_stateobj *gmem = &ctx->gmem; - uint32_t reg; - OUT_PKT0(ring, REG_A4XX_RB_DEPTH_INFO, 3); - reg = A4XX_RB_DEPTH_INFO_DEPTH_BASE(depth_base(ctx)); if (pfb->zsbuf) { - reg |= A4XX_RB_DEPTH_INFO_DEPTH_FORMAT(fd4_pipe2depth(pfb->zsbuf->format)); - } - OUT_RING(ring, reg); - if (pfb->zsbuf) { - uint32_t cpp = util_format_get_blocksize(pfb->zsbuf->format); + struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); + uint32_t cpp = rsc->cpp; + + OUT_PKT0(ring, REG_A4XX_RB_DEPTH_INFO, 3); + OUT_RING(ring, A4XX_RB_DEPTH_INFO_DEPTH_BASE(gmem->zsbuf_base[0]) | + A4XX_RB_DEPTH_INFO_DEPTH_FORMAT(fd4_pipe2depth(pfb->zsbuf->format))); OUT_RING(ring, A4XX_RB_DEPTH_PITCH(cpp * gmem->bin_w)); OUT_RING(ring, A4XX_RB_DEPTH_PITCH2(cpp * gmem->bin_w)); + + OUT_PKT0(ring, REG_A4XX_RB_STENCIL_INFO, 2); + if (rsc->stencil) { + OUT_RING(ring, A4XX_RB_STENCIL_INFO_SEPARATE_STENCIL | + A4XX_RB_STENCIL_INFO_STENCIL_BASE(gmem->zsbuf_base[1])); + OUT_RING(ring, A4XX_RB_STENCIL_PITCH(rsc->stencil->cpp * gmem->bin_w)); + } else { + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + } } else { + OUT_PKT0(ring, REG_A4XX_RB_DEPTH_INFO, 3); OUT_RING(ring, 0x00000000); OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + + OUT_PKT0(ring, REG_A4XX_RB_STENCIL_INFO, 2); + OUT_RING(ring, 0); /* RB_STENCIL_INFO */ + OUT_RING(ring, 0); /* RB_STENCIL_PITCH */ } OUT_PKT0(ring, REG_A4XX_GRAS_DEPTH_CONTROL, 1); @@ -586,7 +677,7 @@ fd4_emit_tile_renderprep(struct fd_context *ctx, struct fd_tile *tile) OUT_RING(ring, CP_SET_BIN_1_X1(x1) | CP_SET_BIN_1_Y1(y1)); OUT_RING(ring, CP_SET_BIN_2_X2(x2) | CP_SET_BIN_2_Y2(y2)); - emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, NULL, gmem->bin_w); + emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, gmem->cbuf_base, gmem->bin_w); /* setup scissor/offset for current tile: */ OUT_PKT0(ring, REG_A4XX_RB_BIN_OFFSET, 1); diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_program.c b/src/gallium/drivers/freedreno/a4xx/fd4_program.c index e8f5837f7ce..1a6d0142132 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_program.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.c @@ -31,8 +31,6 @@ #include "util/u_memory.h" #include "util/u_inlines.h" #include "util/u_format.h" -#include "tgsi/tgsi_dump.h" -#include "tgsi/tgsi_parse.h" #include "freedreno_program.h" @@ -53,7 +51,7 @@ create_shader_stateobj(struct pipe_context *pctx, const struct pipe_shader_state enum shader_t type) { struct fd4_shader_stateobj *so = CALLOC_STRUCT(fd4_shader_stateobj); - so->shader = ir3_shader_create(pctx, cso->tokens, type); + so->shader = ir3_shader_create(pctx, cso, type); return so; } @@ -213,14 +211,17 @@ setup_stages(struct fd4_emit *emit, struct stage *s) } void -fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit) +fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit, + int nr, struct pipe_surface **bufs) { struct stage s[MAX_STAGES]; - uint32_t pos_regid, posz_regid, psize_regid, color_regid; + uint32_t pos_regid, 
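
The depth/stencil restore path above cannot reuse the color blit: z32 restore is done with an actual depth write, so the blit_z / blit_zs programs output gl_FragDepth, the depth test is forced to ALWAYS with writes enabled, and half precision must stay off because a mediump output would truncate a float32 depth value. The program selection, condensed from the hunk above:

    emit.prog = (pfb->zsbuf->format == PIPE_FORMAT_Z32_FLOAT)
            ? &ctx->blit_z      /* depth only */
            : &ctx->blit_zs;    /* depth + stencil */
    emit.key.half_precision = false;
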
posz_regid, psize_regid, color_regid[8]; uint32_t face_regid, coord_regid, zwcoord_regid; int constmode; int i, j, k; + debug_assert(nr <= ARRAY_SIZE(color_regid)); + setup_stages(emit, s); /* blob seems to always use constmode currently: */ @@ -232,11 +233,30 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit) ir3_semantic_name(TGSI_SEMANTIC_POSITION, 0)); psize_regid = ir3_find_output_regid(s[VS].v, ir3_semantic_name(TGSI_SEMANTIC_PSIZE, 0)); - color_regid = ir3_find_output_regid(s[FS].v, - ir3_semantic_name(TGSI_SEMANTIC_COLOR, 0)); + if (s[FS].v->color0_mrt) { + color_regid[0] = color_regid[1] = color_regid[2] = color_regid[3] = + color_regid[4] = color_regid[5] = color_regid[6] = color_regid[7] = + ir3_find_output_regid(s[FS].v, ir3_semantic_name(TGSI_SEMANTIC_COLOR, 0)); + } else { + const struct ir3_shader_variant *fp = s[FS].v; + memset(color_regid, 0, sizeof(color_regid)); + for (i = 0; i < fp->outputs_count; i++) { + ir3_semantic sem = fp->outputs[i].semantic; + unsigned idx = sem2idx(sem); + if (sem2name(sem) != TGSI_SEMANTIC_COLOR) + continue; + debug_assert(idx < ARRAY_SIZE(color_regid)); + color_regid[idx] = fp->outputs[i].regid; + } + } + + /* adjust regids for alpha output formats. there is no alpha render + * format, so it's just treated like red + */ + for (i = 0; i < nr; i++) + if (util_format_is_alpha(pipe_surface_format(bufs[i]))) + color_regid[i] += 3; - if (util_format_is_alpha(emit->pformat)) - color_regid += 3; /* TODO get these dynamically: */ face_regid = s[FS].v->frag_face ? regid(0,0) : regid(63,0); @@ -419,29 +439,24 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit) A4XX_RB_RENDER_CONTROL2_WCOORD)); OUT_PKT0(ring, REG_A4XX_RB_FS_OUTPUT_REG, 1); - OUT_RING(ring, A4XX_RB_FS_OUTPUT_REG_MRT(1) | + OUT_RING(ring, A4XX_RB_FS_OUTPUT_REG_MRT(MAX2(1, nr)) | COND(s[FS].v->writes_pos, A4XX_RB_FS_OUTPUT_REG_FRAG_WRITES_Z)); OUT_PKT0(ring, REG_A4XX_SP_FS_OUTPUT_REG, 1); - if (s[FS].v->writes_pos) { - OUT_RING(ring, 0x00000001 | - A4XX_SP_FS_OUTPUT_REG_DEPTH_ENABLE | - A4XX_SP_FS_OUTPUT_REG_DEPTH_REGID(posz_regid)); - } else { - OUT_RING(ring, 0x00000001); - } + OUT_RING(ring, A4XX_SP_FS_OUTPUT_REG_MRT(MAX2(1, nr)) | + COND(s[FS].v->writes_pos, A4XX_SP_FS_OUTPUT_REG_DEPTH_ENABLE) | + A4XX_SP_FS_OUTPUT_REG_DEPTH_REGID(posz_regid)); OUT_PKT0(ring, REG_A4XX_SP_FS_MRT_REG(0), 8); - OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(color_regid) | - A4XX_SP_FS_MRT_REG_MRTFORMAT(emit->format) | - COND(emit->key.half_precision, A4XX_SP_FS_MRT_REG_HALF_PRECISION)); - OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(0)); - OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(0)); - OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(0)); - OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(0)); - OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(0)); - OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(0)); - OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(0)); + for (i = 0; i < 8; i++) { + enum a4xx_color_fmt format = 0; + if (i < nr) + format = fd4_emit_format(bufs[i]); + OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(color_regid[i]) | + A4XX_SP_FS_MRT_REG_MRTFORMAT(format) | + COND(emit->key.half_precision, + A4XX_SP_FS_MRT_REG_HALF_PRECISION)); + } if (emit->key.binning_pass) { OUT_PKT0(ring, REG_A4XX_VPC_ATTR, 2); @@ -450,10 +465,10 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit) COND(s[VS].v->writes_psize, A4XX_VPC_ATTR_PSIZE)); OUT_RING(ring, 0x00000000); } else { - uint32_t vinterp[8], flatshade[2]; + uint32_t vinterp[8], vpsrepl[8]; memset(vinterp, 0, sizeof(vinterp)); - memset(flatshade, 0, 
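
The alpha-format adjustment above deserves a note: ir3 regids encode (register << 2) | component, so color_regid[i] += 3 moves the MRT write from the shader output's .x to its .w. There is no dedicated alpha render format; A8 and friends are laid out like "red", so the value written as red must be the shader's alpha component:

    for (i = 0; i < nr; i++)
        if (util_format_is_alpha(pipe_surface_format(bufs[i])))
            color_regid[i] += 3;   /* .x -> .w of the vec4 output */
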
sizeof(flatshade)); + memset(vpsrepl, 0, sizeof(vpsrepl)); /* looks like we need to do int varyings in the frag * shader on a4xx (no flatshad reg? or a420.0 bug?): @@ -470,29 +485,40 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit) * something like the code below instead of workaround * in the shader: */ -#if 0 - /* figure out VARYING_INTERP / FLAT_SHAD register values: */ + /* figure out VARYING_INTERP / VARYING_PS_REPL register values: */ for (j = -1; (j = ir3_next_varying(s[FS].v, j)) < (int)s[FS].v->inputs_count; ) { uint32_t interp = s[FS].v->inputs[j].interpolate; + + /* TODO might be cleaner to just +8 in SP_VS_VPC_DST_REG + * instead.. rather than -8 everywhere else.. + */ + uint32_t inloc = s[FS].v->inputs[j].inloc - 8; + + /* currently assuming varyings aligned to 4 (not + * packed): + */ + debug_assert((inloc % 4) == 0); + if ((interp == TGSI_INTERPOLATE_CONSTANT) || ((interp == TGSI_INTERPOLATE_COLOR) && emit->rasterflat)) { - /* TODO might be cleaner to just +8 in SP_VS_VPC_DST_REG - * instead.. rather than -8 everywhere else.. - */ - uint32_t loc = s[FS].v->inputs[j].inloc - 8; - - /* currently assuming varyings aligned to 4 (not - * packed): - */ - debug_assert((loc % 4) == 0); + uint32_t loc = inloc; for (i = 0; i < 4; i++, loc++) { vinterp[loc / 16] |= 1 << ((loc % 16) * 2); - flatshade[loc / 32] |= 1 << (loc % 32); + //flatshade[loc / 32] |= 1 << (loc % 32); } } + + /* Replace the .xy coordinates with S/T from the point sprite. Set + * interpolation bits for .zw such that they become .01 + */ + if (emit->sprite_coord_enable & (1 << sem2idx(s[FS].v->inputs[j].semantic))) { + vpsrepl[inloc / 16] |= (emit->sprite_coord_mode ? 0x0d : 0x09) + << ((inloc % 16) * 2); + vinterp[(inloc + 2) / 16] |= 2 << (((inloc + 2) % 16) * 2); + vinterp[(inloc + 3) / 16] |= 3 << (((inloc + 3) % 16) * 2); + } } -#endif OUT_PKT0(ring, REG_A4XX_VPC_ATTR, 2); OUT_RING(ring, A4XX_VPC_ATTR_TOTALATTR(s[FS].v->total_in) | @@ -509,7 +535,7 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit) OUT_PKT0(ring, REG_A4XX_VPC_VARYING_PS_REPL_MODE(0), 8); for (i = 0; i < 8; i++) - OUT_RING(ring, s[FS].v->shader->vpsrepl[i]); /* VPC_VARYING_PS_REPL[i] */ + OUT_RING(ring, vpsrepl[i]); /* VPC_VARYING_PS_REPL[i] */ } if (s[VS].instrlen) @@ -520,19 +546,6 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit) emit_shader(ring, s[FS].v); } -/* hack.. until we figure out how to deal w/ vpsrepl properly.. 
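
The vpsrepl computation above replaces the fixed table from fix_blit_fp (removed just below). VARYING_PS_REPL and VARYING_INTERP hold one 2-bit selector per scalar varying component, sixteen components per register, so the 4-bit constant 0x09 / 0x0d covers the .x and .y selectors of a texcoord in one shift (the two encodings appear to differ only in T orientation, per sprite_coord_mode), while .zw are forced to constant 0 and 1 through vinterp:

    /* inloc is the scalar location of the varying's .x component: */
    vpsrepl[inloc / 16] |= (emit->sprite_coord_mode ? 0x0d : 0x09)
            << ((inloc % 16) * 2);                /* .xy <- sprite S,T */
    vinterp[(inloc + 2) / 16] |= 2 << (((inloc + 2) % 16) * 2); /* .z <- 0 */
    vinterp[(inloc + 3) / 16] |= 3 << (((inloc + 3) % 16) * 2); /* .w <- 1 */
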
*/ -static void -fix_blit_fp(struct pipe_context *pctx) -{ - struct fd_context *ctx = fd_context(pctx); - struct fd4_shader_stateobj *so = ctx->blit_prog[0].fp; - - so->shader->vpsrepl[0] = 0x99999999; - so->shader->vpsrepl[1] = 0x99999999; - so->shader->vpsrepl[2] = 0x99999999; - so->shader->vpsrepl[3] = 0x99999999; -} - void fd4_prog_init(struct pipe_context *pctx) { @@ -543,6 +556,4 @@ fd4_prog_init(struct pipe_context *pctx) pctx->delete_vs_state = fd4_vp_state_delete; fd_prog_init(pctx); - - fix_blit_fp(pctx); } diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_program.h b/src/gallium/drivers/freedreno/a4xx/fd4_program.h index 52306a4c60d..8dfccaf9d74 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_program.h +++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.h @@ -39,7 +39,8 @@ struct fd4_shader_stateobj { struct fd4_emit; -void fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit); +void fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit, + int nr, struct pipe_surface **bufs); void fd4_prog_init(struct pipe_context *pctx); diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_query.c b/src/gallium/drivers/freedreno/a4xx/fd4_query.c index 6db1c11b94b..4f69e0c1694 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_query.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_query.c @@ -31,9 +31,93 @@ #include "freedreno_util.h" #include "fd4_query.h" +#include "fd4_draw.h" #include "fd4_format.h" + +struct fd_rb_samp_ctrs { + uint64_t ctr[16]; +}; + +/* + * Occlusion Query: + * + * OCCLUSION_COUNTER and OCCLUSION_PREDICATE differ only in how they + * interpret results + */ + +static struct fd_hw_sample * +occlusion_get_sample(struct fd_context *ctx, struct fd_ringbuffer *ring) +{ + struct fd_hw_sample *samp = + fd_hw_sample_init(ctx, sizeof(struct fd_rb_samp_ctrs)); + + /* low bits of sample addr should be zero (since they are control + * flags in RB_SAMPLE_COUNT_CONTROL): + */ + debug_assert((samp->offset & 0x3) == 0); + + /* Set RB_SAMPLE_COUNT_ADDR to samp->offset plus value of + * HW_QUERY_BASE_REG register: + */ + OUT_PKT3(ring, CP_SET_CONSTANT, 3); + OUT_RING(ring, CP_REG(REG_A4XX_RB_SAMPLE_COUNT_CONTROL) | 0x80000000); + OUT_RING(ring, HW_QUERY_BASE_REG); + OUT_RING(ring, A4XX_RB_SAMPLE_COUNT_CONTROL_COPY | + samp->offset); + + OUT_PKT3(ring, CP_DRAW_INDX_OFFSET, 3); + OUT_RING(ring, DRAW4(DI_PT_POINTLIST_PSIZE, DI_SRC_SEL_AUTO_INDEX, + INDEX4_SIZE_32_BIT, USE_VISIBILITY)); + OUT_RING(ring, 1); /* NumInstances */ + OUT_RING(ring, 0); /* NumIndices */ + + fd_event_write(ctx, ring, ZPASS_DONE); + + return samp; +} + +static uint64_t +count_samples(const struct fd_rb_samp_ctrs *start, + const struct fd_rb_samp_ctrs *end) +{ + return end->ctr[0] - start->ctr[0]; +} + +static void +occlusion_counter_accumulate_result(struct fd_context *ctx, + const void *start, const void *end, + union pipe_query_result *result) +{ + uint64_t n = count_samples(start, end); + result->u64 += n; +} + +static void +occlusion_predicate_accumulate_result(struct fd_context *ctx, + const void *start, const void *end, + union pipe_query_result *result) +{ + uint64_t n = count_samples(start, end); + result->b |= (n > 0); +} + +static const struct fd_hw_sample_provider occlusion_counter = { + .query_type = PIPE_QUERY_OCCLUSION_COUNTER, + .active = FD_STAGE_DRAW, + .get_sample = occlusion_get_sample, + .accumulate_result = occlusion_counter_accumulate_result, +}; + +static const struct fd_hw_sample_provider occlusion_predicate = { + .query_type = PIPE_QUERY_OCCLUSION_PREDICATE, + 
.active = FD_STAGE_DRAW, + .get_sample = occlusion_get_sample, + .accumulate_result = occlusion_predicate_accumulate_result, +}; + void fd4_query_context_init(struct pipe_context *pctx) { - /* TODO */ + fd_hw_query_register_provider(pctx, &occlusion_counter); + fd_hw_query_register_provider(pctx, &occlusion_predicate); } diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.c b/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.c index e54b606a285..dc7e98b149d 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.c @@ -50,7 +50,7 @@ fd4_rasterizer_state_create(struct pipe_context *pctx, if (cso->point_size_per_vertex) { psize_min = util_get_min_point_size(cso); - psize_max = 8192; + psize_max = 4092; } else { /* Force the point size to be as if the vertex output was disabled. */ psize_min = cso->point_size; @@ -67,9 +67,9 @@ fd4_rasterizer_state_create(struct pipe_context *pctx, */ so->gras_cl_clip_cntl = 0x80000; /* ??? */ so->gras_su_point_minmax = - A4XX_GRAS_SU_POINT_MINMAX_MIN(psize_min/2) | - A4XX_GRAS_SU_POINT_MINMAX_MAX(psize_max/2); - so->gras_su_point_size = A4XX_GRAS_SU_POINT_SIZE(cso->point_size/2); + A4XX_GRAS_SU_POINT_MINMAX_MIN(psize_min) | + A4XX_GRAS_SU_POINT_MINMAX_MAX(psize_max); + so->gras_su_point_size = A4XX_GRAS_SU_POINT_SIZE(cso->point_size); so->gras_su_poly_offset_scale = A4XX_GRAS_SU_POLY_OFFSET_SCALE(cso->offset_scale); so->gras_su_poly_offset_offset = diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.h b/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.h index 06c728f2f1f..64e81a9983b 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.h +++ b/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.h @@ -44,7 +44,7 @@ struct fd4_rasterizer_stateobj { uint32_t pc_prim_vtx_cntl; }; -static INLINE struct fd4_rasterizer_stateobj * +static inline struct fd4_rasterizer_stateobj * fd4_rasterizer_stateobj(struct pipe_rasterizer_state *rast) { return (struct fd4_rasterizer_stateobj *)rast; diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_screen.c b/src/gallium/drivers/freedreno/a4xx/fd4_screen.c index e8cbb2d201a..d8ea414f300 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_screen.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_screen.c @@ -102,7 +102,7 @@ void fd4_screen_init(struct pipe_screen *pscreen) { struct fd_screen *screen = fd_screen(pscreen); - screen->max_rts = 1; + screen->max_rts = A4XX_MAX_RENDER_TARGETS; screen->compiler = ir3_compiler_create(screen->gpu_id); pscreen->context_create = fd4_context_create; pscreen->is_format_supported = fd4_screen_is_format_supported; diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_texture.c b/src/gallium/drivers/freedreno/a4xx/fd4_texture.c index 6ba25d0816d..d2bc5fee6c0 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_texture.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_texture.c @@ -150,8 +150,8 @@ fd4_sampler_view_create(struct pipe_context *pctx, struct pipe_resource *prsc, { struct fd4_pipe_sampler_view *so = CALLOC_STRUCT(fd4_pipe_sampler_view); struct fd_resource *rsc = fd_resource(prsc); - unsigned lvl = cso->u.tex.first_level; - unsigned miplevels = cso->u.tex.last_level - lvl; + unsigned lvl = fd_sampler_first_level(cso); + unsigned miplevels = fd_sampler_last_level(cso) - lvl; if (!so) return NULL; diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_texture.h b/src/gallium/drivers/freedreno/a4xx/fd4_texture.h index 579ed87f14b..84ee7ecb50c 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_texture.h +++ 
b/src/gallium/drivers/freedreno/a4xx/fd4_texture.h @@ -42,7 +42,7 @@ struct fd4_sampler_stateobj { uint32_t texsamp0, texsamp1; }; -static INLINE struct fd4_sampler_stateobj * +static inline struct fd4_sampler_stateobj * fd4_sampler_stateobj(struct pipe_sampler_state *samp) { return (struct fd4_sampler_stateobj *)samp; @@ -53,7 +53,7 @@ struct fd4_pipe_sampler_view { uint32_t texconst0, texconst1, texconst2, texconst3, textconst4; }; -static INLINE struct fd4_pipe_sampler_view * +static inline struct fd4_pipe_sampler_view * fd4_pipe_sampler_view(struct pipe_sampler_view *pview) { return (struct fd4_pipe_sampler_view *)pview; diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_zsa.h b/src/gallium/drivers/freedreno/a4xx/fd4_zsa.h index 033317cf620..6a92a9b6785 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_zsa.h +++ b/src/gallium/drivers/freedreno/a4xx/fd4_zsa.h @@ -47,7 +47,7 @@ struct fd4_zsa_stateobj { uint32_t rb_stencilrefmask_bf; }; -static INLINE struct fd4_zsa_stateobj * +static inline struct fd4_zsa_stateobj * fd4_zsa_stateobj(struct pipe_depth_stencil_alpha_state *zsa) { return (struct fd4_zsa_stateobj *)zsa; diff --git a/src/gallium/drivers/freedreno/adreno_common.xml.h b/src/gallium/drivers/freedreno/adreno_common.xml.h index b23aa830770..00b6acba065 100644 --- a/src/gallium/drivers/freedreno/adreno_common.xml.h +++ b/src/gallium/drivers/freedreno/adreno_common.xml.h @@ -8,15 +8,15 @@ http://github.com/freedreno/envytools/ git clone https://github.com/freedreno/envytools.git The rules-ng-ng source files this header was generated from are: -- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2013-11-30 14:47:15) -- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2013-03-31 16:51:27) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2014-06-02 15:21:30) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10551 bytes, from 2014-11-13 22:44:30) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14895 bytes, from 2015-04-19 15:23:28) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 66709 bytes, from 2015-04-12 18:16:35) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 59314 bytes, from 2015-04-19 16:21:40) - -Copyright (C) 2013-2014 by the following authors: +- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2015-05-20 20:03:07) +- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2015-05-20 20:03:07) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10551 bytes, from 2015-05-20 20:03:14) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 66709 bytes, from 2015-05-20 20:03:14) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63728 bytes, from 2015-08-05 18:07:28) + +Copyright (C) 2013-2015 by the following authors: - Rob Clark <[email protected]> (robclark) Permission is hereby granted, free of charge, to any person obtaining diff --git a/src/gallium/drivers/freedreno/adreno_pm4.xml.h b/src/gallium/drivers/freedreno/adreno_pm4.xml.h index 2b24c5b4e78..98a90e26679 100644 --- a/src/gallium/drivers/freedreno/adreno_pm4.xml.h +++ b/src/gallium/drivers/freedreno/adreno_pm4.xml.h @@ 
-8,13 +8,13 @@ http://github.com/freedreno/envytools/ git clone https://github.com/freedreno/envytools.git The rules-ng-ng source files this header was generated from are: -- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2013-11-30 14:47:15) -- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2013-03-31 16:51:27) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2014-06-02 15:21:30) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10551 bytes, from 2014-11-13 22:44:30) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14895 bytes, from 2015-04-19 15:23:28) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 66709 bytes, from 2015-04-12 18:16:35) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 59314 bytes, from 2015-04-19 16:21:40) +- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2015-05-20 20:03:07) +- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2015-05-20 20:03:07) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10551 bytes, from 2015-05-20 20:03:14) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 66709 bytes, from 2015-05-20 20:03:14) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63728 bytes, from 2015-08-05 18:07:28) Copyright (C) 2013-2015 by the following authors: - Rob Clark <[email protected]> (robclark) @@ -67,7 +67,7 @@ enum vgt_event_type { enum pc_di_primtype { DI_PT_NONE = 0, - DI_PT_POINTLIST_A2XX = 1, + DI_PT_POINTLIST_PSIZE = 1, DI_PT_LINELIST = 2, DI_PT_LINESTRIP = 3, DI_PT_TRILIST = 4, @@ -75,7 +75,7 @@ enum pc_di_primtype { DI_PT_TRISTRIP = 6, DI_PT_LINELOOP = 7, DI_PT_RECTLIST = 8, - DI_PT_POINTLIST_A3XX = 9, + DI_PT_POINTLIST = 9, DI_PT_LINE_ADJ = 10, DI_PT_LINESTRIP_ADJ = 11, DI_PT_TRI_ADJ = 12, diff --git a/src/gallium/drivers/freedreno/freedreno_context.c b/src/gallium/drivers/freedreno/freedreno_context.c index 668ef3629bf..8e6d43150ce 100644 --- a/src/gallium/drivers/freedreno/freedreno_context.c +++ b/src/gallium/drivers/freedreno/freedreno_context.c @@ -94,9 +94,7 @@ void fd_context_render(struct pipe_context *pctx) { struct fd_context *ctx = fd_context(pctx); - struct pipe_framebuffer_state *pfb = &ctx->framebuffer; struct fd_resource *rsc, *rsc_tmp; - int i; DBG("needs_flush: %d", ctx->needs_flush); @@ -118,20 +116,11 @@ fd_context_render(struct pipe_context *pctx) ctx->gmem_reason = 0; ctx->num_draws = 0; - for (i = 0; i < pfb->nr_cbufs; i++) - if (pfb->cbufs[i]) - fd_resource(pfb->cbufs[i]->texture)->dirty = false; - if (pfb->zsbuf) { - rsc = fd_resource(pfb->zsbuf->texture); - rsc->dirty = false; - if (rsc->stencil) - rsc->stencil->dirty = false; - } - /* go through all the used resources and clear their reading flag */ LIST_FOR_EACH_ENTRY_SAFE(rsc, rsc_tmp, &ctx->used_resources, list) { - assert(rsc->reading); - rsc->reading = false; + debug_assert(rsc->status != 0); + rsc->status = 0; + rsc->pending_ctx = NULL; list_delinit(&rsc->list); } @@ -144,8 +133,10 @@ fd_context_flush(struct pipe_context *pctx, struct pipe_fence_handle **fence, { fd_context_render(pctx); - if (fence) + if (fence) { + fd_screen_fence_ref(pctx->screen, fence, 
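
The pc_di_primtype renames above (DI_PT_POINTLIST_A2XX -> DI_PT_POINTLIST_PSIZE, DI_PT_POINTLIST_A3XX -> DI_PT_POINTLIST) reflect that the distinction is the per-vertex point size, not the GPU generation, and they are exactly what the fd4_draw.h hunk near the top of this diff keys off:

    /* points + per-vertex point size -> spritelist primtype: */
    if (ctx->rasterizer && ctx->rasterizer->point_size_per_vertex &&
        (info->mode == PIPE_PRIM_POINTS))
        primtype = DI_PT_POINTLIST_PSIZE;
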
NULL); *fence = fd_fence_create(pctx); + } } void diff --git a/src/gallium/drivers/freedreno/freedreno_context.h b/src/gallium/drivers/freedreno/freedreno_context.h index e420f1e5bd9..509a90fdf23 100644 --- a/src/gallium/drivers/freedreno/freedreno_context.h +++ b/src/gallium/drivers/freedreno/freedreno_context.h @@ -82,6 +82,20 @@ struct fd_vertex_stateobj { unsigned num_elements; }; +struct fd_streamout_stateobj { + struct pipe_stream_output_target *targets[PIPE_MAX_SO_BUFFERS]; + unsigned num_targets; + /* Track offset from vtxcnt for streamout data. This counter + * is just incremented by # of vertices on each draw until + * reset or new streamout buffer bound. + * + * When we eventually have GS, the CPU won't actually know the + * number of vertices per draw, so I think we'll have to do + * something more clever. + */ + unsigned offsets[PIPE_MAX_SO_BUFFERS]; +}; + /* group together the vertex and vertexbuf state.. for ease of passing * around, and because various internal operations (gmem<->mem, etc) * need their own vertex state: @@ -179,7 +193,7 @@ struct fd_context { struct fd_program_stateobj solid_prog; // TODO move to screen? /* shaders used by mem->gmem blits: */ - struct fd_program_stateobj blit_prog[8]; // TODO move to screen? + struct fd_program_stateobj blit_prog[MAX_RENDER_TARGETS]; // TODO move to screen? struct fd_program_stateobj blit_z, blit_zs; /* do we need to mem2gmem before rendering. We don't, if for example, @@ -319,6 +333,7 @@ struct fd_context { FD_DIRTY_VTXBUF = (1 << 15), FD_DIRTY_INDEXBUF = (1 << 16), FD_DIRTY_SCISSOR = (1 << 17), + FD_DIRTY_STREAMOUT = (1 << 18), } dirty; struct pipe_blend_state *blend; @@ -339,6 +354,7 @@ struct fd_context { struct pipe_viewport_state viewport; struct fd_constbuf_stateobj constbuf[PIPE_SHADER_TYPES]; struct pipe_index_buffer indexbuf; + struct fd_streamout_stateobj streamout; /* GMEM/tile handling fxns: */ void (*emit_tile_init)(struct fd_context *ctx); @@ -351,18 +367,25 @@ struct fd_context { void (*emit_sysmem_prep)(struct fd_context *ctx); /* draw: */ - void (*draw_vbo)(struct fd_context *pctx, const struct pipe_draw_info *info); + void (*draw_vbo)(struct fd_context *ctx, const struct pipe_draw_info *info); void (*clear)(struct fd_context *ctx, unsigned buffers, const union pipe_color_union *color, double depth, unsigned stencil); + + /* constant emit: (note currently not used/needed for a2xx) */ + void (*emit_const)(struct fd_ringbuffer *ring, enum shader_t type, + uint32_t regid, uint32_t offset, uint32_t sizedwords, + const uint32_t *dwords, struct pipe_resource *prsc); + void (*emit_const_bo)(struct fd_ringbuffer *ring, enum shader_t type, boolean write, + uint32_t regid, uint32_t num, struct fd_bo **bos, uint32_t *offsets); }; -static INLINE struct fd_context * +static inline struct fd_context * fd_context(struct pipe_context *pctx) { return (struct fd_context *)pctx; } -static INLINE struct pipe_scissor_state * +static inline struct pipe_scissor_state * fd_context_get_scissor(struct fd_context *ctx) { if (ctx->rasterizer && ctx->rasterizer->scissor) @@ -370,13 +393,13 @@ fd_context_get_scissor(struct fd_context *ctx) return &ctx->disabled_scissor; } -static INLINE bool +static inline bool fd_supported_prim(struct fd_context *ctx, unsigned prim) { return (1 << prim) & ctx->primtype_mask; } -static INLINE void +static inline void fd_reset_wfi(struct fd_context *ctx) { ctx->needs_wfi = true; diff --git a/src/gallium/drivers/freedreno/freedreno_draw.c b/src/gallium/drivers/freedreno/freedreno_draw.c index 
c9e317c7dc9..6831a58749c 100644 --- a/src/gallium/drivers/freedreno/freedreno_draw.c +++ b/src/gallium/drivers/freedreno/freedreno_draw.c @@ -40,7 +40,8 @@ #include "freedreno_util.h" static void -resource_reading(struct fd_context *ctx, struct pipe_resource *prsc) +resource_used(struct fd_context *ctx, struct pipe_resource *prsc, + enum fd_resource_status status) { struct fd_resource *rsc; @@ -48,9 +49,29 @@ resource_reading(struct fd_context *ctx, struct pipe_resource *prsc) return; rsc = fd_resource(prsc); - rsc->reading = true; + rsc->status |= status; + if (rsc->stencil) + rsc->stencil->status |= status; + + /* TODO resources can actually be shared across contexts, + * so I'm not sure a single list-head will do the trick? + */ + debug_assert((rsc->pending_ctx == ctx) || !rsc->pending_ctx); list_delinit(&rsc->list); list_addtail(&rsc->list, &ctx->used_resources); + rsc->pending_ctx = ctx; +} + +static void +resource_read(struct fd_context *ctx, struct pipe_resource *prsc) +{ + resource_used(ctx, prsc, FD_PENDING_READ); +} + +static void +resource_written(struct fd_context *ctx, struct pipe_resource *prsc) +{ + resource_used(ctx, prsc, FD_PENDING_WRITE); } static void @@ -59,7 +80,7 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) struct fd_context *ctx = fd_context(pctx); struct pipe_framebuffer_state *pfb = &ctx->framebuffer; struct pipe_scissor_state *scissor = fd_context_get_scissor(ctx); - unsigned i, buffers = 0; + unsigned i, prims, buffers = 0; /* if we supported transform feedback, we'd have to disable this: */ if (((scissor->maxx - scissor->minx) * @@ -69,6 +90,8 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) /* emulate unsupported primitives: */ if (!fd_supported_prim(ctx, info->mode)) { + if (ctx->streamout.num_targets > 0) + debug_error("stream-out with emulated prims"); util_primconvert_save_index_buffer(ctx->primconvert, &ctx->indexbuf); util_primconvert_save_rasterizer_state(ctx->primconvert, ctx->rasterizer); util_primconvert_draw_vbo(ctx->primconvert, info); @@ -83,17 +106,13 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) if (fd_depth_enabled(ctx)) { buffers |= FD_BUFFER_DEPTH; - fd_resource(pfb->zsbuf->texture)->dirty = true; + resource_written(ctx, pfb->zsbuf->texture); ctx->gmem_reason |= FD_GMEM_DEPTH_ENABLED; } if (fd_stencil_enabled(ctx)) { - struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); buffers |= FD_BUFFER_STENCIL; - if (rsc->stencil) - rsc->stencil->dirty = true; - else - rsc->dirty = true; + resource_written(ctx, pfb->zsbuf->texture); ctx->gmem_reason |= FD_GMEM_STENCIL_ENABLED; } @@ -108,7 +127,7 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) surf = pfb->cbufs[i]->texture; - fd_resource(surf)->dirty = true; + resource_written(ctx, surf); buffers |= PIPE_CLEAR_COLOR0 << i; if (surf->nr_samples > 1) @@ -120,32 +139,38 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) /* Skip over buffer 0, that is sent along with the command stream */ for (i = 1; i < PIPE_MAX_CONSTANT_BUFFERS; i++) { - resource_reading(ctx, ctx->constbuf[PIPE_SHADER_VERTEX].cb[i].buffer); - resource_reading(ctx, ctx->constbuf[PIPE_SHADER_FRAGMENT].cb[i].buffer); + resource_read(ctx, ctx->constbuf[PIPE_SHADER_VERTEX].cb[i].buffer); + resource_read(ctx, ctx->constbuf[PIPE_SHADER_FRAGMENT].cb[i].buffer); } /* Mark VBOs as being read */ for (i = 0; i < ctx->vtx.vertexbuf.count; i++) { assert(!ctx->vtx.vertexbuf.vb[i].user_buffer); - 
resource_reading(ctx, ctx->vtx.vertexbuf.vb[i].buffer); + resource_read(ctx, ctx->vtx.vertexbuf.vb[i].buffer); } /* Mark index buffer as being read */ - resource_reading(ctx, ctx->indexbuf.buffer); + resource_read(ctx, ctx->indexbuf.buffer); /* Mark textures as being read */ for (i = 0; i < ctx->verttex.num_textures; i++) if (ctx->verttex.textures[i]) - resource_reading(ctx, ctx->verttex.textures[i]->texture); + resource_read(ctx, ctx->verttex.textures[i]->texture); for (i = 0; i < ctx->fragtex.num_textures; i++) if (ctx->fragtex.textures[i]) - resource_reading(ctx, ctx->fragtex.textures[i]->texture); + resource_read(ctx, ctx->fragtex.textures[i]->texture); + + /* Mark streamout buffers as being written.. */ + for (i = 0; i < ctx->streamout.num_targets; i++) + if (ctx->streamout.targets[i]) + resource_written(ctx, ctx->streamout.targets[i]->buffer); ctx->num_draws++; + prims = u_reduced_prims_for_vertices(info->mode, info->count); + ctx->stats.draw_calls++; - ctx->stats.prims_emitted += - u_reduced_prims_for_vertices(info->mode, info->count); + ctx->stats.prims_emitted += prims; /* any buffers that haven't been cleared yet, we need to restore: */ ctx->restore |= buffers & (FD_BUFFER_ALL & ~ctx->cleared); @@ -159,6 +184,9 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) fd_hw_query_set_stage(ctx, ctx->ring, FD_STAGE_DRAW); ctx->draw_vbo(ctx, info); + for (i = 0; i < ctx->streamout.num_targets; i++) + ctx->streamout.offsets[i] += prims; + /* if an app (or, well, piglit test) does many thousands of draws * without flush (or anything which implicitly flushes, like * changing render targets), we can exceed the ringbuffer size. @@ -216,15 +244,10 @@ fd_clear(struct pipe_context *pctx, unsigned buffers, if (buffers & PIPE_CLEAR_COLOR) for (i = 0; i < pfb->nr_cbufs; i++) if (buffers & (PIPE_CLEAR_COLOR0 << i)) - fd_resource(pfb->cbufs[i]->texture)->dirty = true; + resource_written(ctx, pfb->cbufs[i]->texture); if (buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) { - struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); - if (rsc->stencil && buffers & PIPE_CLEAR_STENCIL) - rsc->stencil->dirty = true; - if (!rsc->stencil || buffers & PIPE_CLEAR_DEPTH) - rsc->dirty = true; - + resource_written(ctx, pfb->zsbuf->texture); ctx->gmem_reason |= FD_GMEM_CLEARS_DEPTH_STENCIL; } @@ -242,7 +265,8 @@ fd_clear(struct pipe_context *pctx, unsigned buffers, FD_DIRTY_SAMPLE_MASK | FD_DIRTY_PROG | FD_DIRTY_CONSTBUF | - FD_DIRTY_BLEND; + FD_DIRTY_BLEND | + FD_DIRTY_FRAMEBUFFER; if (fd_mesa_debug & FD_DBG_DCLEAR) ctx->dirty = 0xffffffff; diff --git a/src/gallium/drivers/freedreno/freedreno_fence.c b/src/gallium/drivers/freedreno/freedreno_fence.c index 375e58f7022..04a9feacd58 100644 --- a/src/gallium/drivers/freedreno/freedreno_fence.c +++ b/src/gallium/drivers/freedreno/freedreno_fence.c @@ -69,6 +69,9 @@ boolean fd_screen_fence_finish(struct pipe_screen *screen, struct pipe_fence_handle *fence, uint64_t timeout) { + if (!timeout) + return fd_screen_fence_signalled(screen, fence); + if (fd_pipe_wait(fence->screen->pipe, fence->timestamp)) return false; diff --git a/src/gallium/drivers/freedreno/freedreno_gmem.c b/src/gallium/drivers/freedreno/freedreno_gmem.c index c105378ec4e..648db9baee5 100644 --- a/src/gallium/drivers/freedreno/freedreno_gmem.c +++ b/src/gallium/drivers/freedreno/freedreno_gmem.c @@ -82,7 +82,7 @@ total_size(uint8_t cbuf_cpp[], uint8_t zsbuf_cpp[2], { uint32_t total = 0, i; - for (i = 0; i < 4; i++) { + for (i = 0; i < MAX_RENDER_TARGETS; i++) { if 
(cbuf_cpp[i]) { gmem->cbuf_base[i] = align(total, 0x4000); total = gmem->cbuf_base[i] + cbuf_cpp[i] * bin_w * bin_h; @@ -113,7 +113,7 @@ calculate_tiles(struct fd_context *ctx) uint32_t nbins_x = 1, nbins_y = 1; uint32_t bin_w, bin_h; uint32_t max_width = bin_width(ctx); - uint8_t cbuf_cpp[4] = {0}, zsbuf_cpp[2] = {0}; + uint8_t cbuf_cpp[MAX_RENDER_TARGETS] = {0}, zsbuf_cpp[2] = {0}; uint32_t i, j, t, xoff, yoff; uint32_t tpp_x, tpp_y; bool has_zs = !!(ctx->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)); @@ -162,12 +162,17 @@ calculate_tiles(struct fd_context *ctx) bin_w = align(width / nbins_x, 32); } + if (fd_mesa_debug & FD_DBG_MSGS) { + debug_printf("binning input: cbuf cpp:"); + for (i = 0; i < pfb->nr_cbufs; i++) + debug_printf(" %d", cbuf_cpp[i]); + debug_printf(", zsbuf cpp: %d; %dx%d\n", + zsbuf_cpp[0], width, height); + } + /* then find a bin width/height that satisfies the memory * constraints: */ - DBG("binning input: cbuf cpp: %d %d %d %d, zsbuf cpp: %d; %dx%d", - cbuf_cpp[0], cbuf_cpp[1], cbuf_cpp[2], cbuf_cpp[3], zsbuf_cpp[0], - width, height); while (total_size(cbuf_cpp, zsbuf_cpp, bin_w, bin_h, gmem) > gmem_size) { if (bin_w > bin_h) { nbins_x++; diff --git a/src/gallium/drivers/freedreno/freedreno_gmem.h b/src/gallium/drivers/freedreno/freedreno_gmem.h index 5867235db90..38b557eb077 100644 --- a/src/gallium/drivers/freedreno/freedreno_gmem.h +++ b/src/gallium/drivers/freedreno/freedreno_gmem.h @@ -31,6 +31,8 @@ #include "pipe/p_context.h" +#include "freedreno_util.h" + /* per-pipe configuration for hw binning: */ struct fd_vsc_pipe { struct fd_bo *bo; @@ -47,9 +49,9 @@ struct fd_tile { struct fd_gmem_stateobj { struct pipe_scissor_state scissor; - uint32_t cbuf_base[4]; + uint32_t cbuf_base[MAX_RENDER_TARGETS]; uint32_t zsbuf_base[2]; - uint8_t cbuf_cpp[4]; + uint8_t cbuf_cpp[MAX_RENDER_TARGETS]; uint8_t zsbuf_cpp[2]; uint16_t bin_h, nbins_y; uint16_t bin_w, nbins_x; diff --git a/src/gallium/drivers/freedreno/freedreno_program.c b/src/gallium/drivers/freedreno/freedreno_program.c index 5e344e69146..e6a647852a3 100644 --- a/src/gallium/drivers/freedreno/freedreno_program.c +++ b/src/gallium/drivers/freedreno/freedreno_program.c @@ -96,7 +96,11 @@ fd_prog_blit(struct pipe_context *pctx, int rts, bool depth) { int i; struct ureg_src tc; - struct ureg_program *ureg = ureg_create(TGSI_PROCESSOR_FRAGMENT); + struct ureg_program *ureg; + + debug_assert(rts <= MAX_RENDER_TARGETS); + + ureg = ureg_create(TGSI_PROCESSOR_FRAGMENT); if (!ureg) return NULL; diff --git a/src/gallium/drivers/freedreno/freedreno_resource.c b/src/gallium/drivers/freedreno/freedreno_resource.c index 95f79df565e..709ad4eb55b 100644 --- a/src/gallium/drivers/freedreno/freedreno_resource.c +++ b/src/gallium/drivers/freedreno/freedreno_resource.c @@ -42,6 +42,14 @@ #include <errno.h> + +static bool +pending(struct fd_resource *rsc, enum fd_resource_status status) +{ + return (rsc->status & status) || + (rsc->stencil && (rsc->stencil->status & status)); +} + static void fd_invalidate_resource(struct fd_context *ctx, struct pipe_resource *prsc) { @@ -72,11 +80,11 @@ fd_invalidate_resource(struct fd_context *ctx, struct pipe_resource *prsc) /* Textures */ for (i = 0; i < ctx->verttex.num_textures && !(ctx->dirty & FD_DIRTY_VERTTEX); i++) { - if (ctx->verttex.textures[i]->texture == prsc) + if (ctx->verttex.textures[i] && (ctx->verttex.textures[i]->texture == prsc)) ctx->dirty |= FD_DIRTY_VERTTEX; } for (i = 0; i < ctx->fragtex.num_textures && !(ctx->dirty & FD_DIRTY_FRAGTEX); i++) { - if 
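
Note: the calculate_tiles() hunk above keeps the existing bin-fitting strategy while sizing its arrays by MAX_RENDER_TARGETS. As a standalone illustration of that strategy (not driver code; the single-cpp footprint is a simplification of the real per-target, 0x4000-aligned total_size() math, and all names here are made up):

#include <stdint.h>

static uint32_t align32(uint32_t v) { return (v + 31) & ~31u; }

/* grow the bin grid along the larger axis until one bin's
 * footprint fits in gmem_size bytes:
 */
static void fit_bins(uint32_t width, uint32_t height, uint32_t cpp,
		uint32_t gmem_size, uint32_t *bin_w, uint32_t *bin_h)
{
	uint32_t nbins_x = 1, nbins_y = 1;
	uint32_t bw = align32(width), bh = align32(height);

	while ((uint64_t)bw * bh * cpp > gmem_size) {
		if (bw > bh) {
			nbins_x++;
			bw = align32(width / nbins_x);
		} else {
			nbins_y++;
			bh = align32(height / nbins_y);
		}
	}
	*bin_w = bw;
	*bin_h = bh;
}
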
(ctx->fragtex.textures[i]->texture == prsc) + if (ctx->fragtex.textures[i] && (ctx->fragtex.textures[i]->texture == prsc)) ctx->dirty |= FD_DIRTY_FRAGTEX; } } @@ -97,7 +105,8 @@ realloc_bo(struct fd_resource *rsc, uint32_t size) rsc->bo = fd_bo_new(screen->dev, size, flags); rsc->timestamp = 0; - rsc->dirty = rsc->reading = false; + rsc->status = 0; + rsc->pending_ctx = NULL; list_delinit(&rsc->list); util_range_set_empty(&rsc->valid_buffer_range); } @@ -238,8 +247,9 @@ fd_resource_transfer_map(struct pipe_context *pctx, /* If the GPU is writing to the resource, or if it is reading from the * resource and we're trying to write to it, flush the renders. */ - if (rsc->dirty || (rsc->stencil && rsc->stencil->dirty) || - ((ptrans->usage & PIPE_TRANSFER_WRITE) && rsc->reading)) + if (((ptrans->usage & PIPE_TRANSFER_WRITE) && + pending(rsc, FD_PENDING_READ | FD_PENDING_WRITE)) || + pending(rsc, FD_PENDING_WRITE)) fd_context_render(pctx); /* The GPU keeps track of how the various bo's are being used, and @@ -646,6 +656,8 @@ fd_blitter_pipe_begin(struct fd_context *ctx) util_blitter_save_vertex_buffer_slot(ctx->blitter, ctx->vtx.vertexbuf.vb); util_blitter_save_vertex_elements(ctx->blitter, ctx->vtx.vtx); util_blitter_save_vertex_shader(ctx->blitter, ctx->prog.vp); + util_blitter_save_so_targets(ctx->blitter, ctx->streamout.num_targets, + ctx->streamout.targets); util_blitter_save_rasterizer(ctx->blitter, ctx->rasterizer); util_blitter_save_viewport(ctx->blitter, &ctx->viewport); util_blitter_save_scissor(ctx->blitter, &ctx->scissor); @@ -675,7 +687,7 @@ fd_flush_resource(struct pipe_context *pctx, struct pipe_resource *prsc) { struct fd_resource *rsc = fd_resource(prsc); - if (rsc->dirty || (rsc->stencil && rsc->stencil->dirty)) + if (pending(rsc, FD_PENDING_WRITE | FD_PENDING_READ)) fd_context_render(pctx); } diff --git a/src/gallium/drivers/freedreno/freedreno_resource.h b/src/gallium/drivers/freedreno/freedreno_resource.h index 0634923fcb2..7549becaa1f 100644 --- a/src/gallium/drivers/freedreno/freedreno_resource.h +++ b/src/gallium/drivers/freedreno/freedreno_resource.h @@ -60,6 +60,15 @@ struct fd_resource_slice { uint32_t size0; /* size of first layer in slice */ }; +/* status of queued up but not flushed reads and write operations. + * In _transfer_map() we need to know if queued up rendering needs + * to be flushed to preserve the order of cpu and gpu access. + */ +enum fd_resource_status { + FD_PENDING_WRITE = 0x01, + FD_PENDING_READ = 0x02, +}; + struct fd_resource { struct u_resource base; struct fd_bo *bo; @@ -68,17 +77,23 @@ struct fd_resource { uint32_t layer_size; struct fd_resource_slice slices[MAX_MIP_LEVELS]; uint32_t timestamp; - bool dirty, reading; /* buffer range that has been initialized */ struct util_range valid_buffer_range; /* reference to the resource holding stencil data for a z32_s8 texture */ + /* TODO rename to secondary or auxiliary? */ struct fd_resource *stencil; + /* pending read/write state: */ + enum fd_resource_status status; + /* resources accessed by queued but not flushed draws are tracked + * in the used_resources list. 
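
Note: the fd_resource_transfer_map() hunk above encodes the standard ordering rule for the new status bits: a CPU write conflicts with any queued GPU access, while a CPU read only conflicts with queued GPU writes. A minimal sketch of that decision, with illustrative names rather than the driver's:

#include <stdbool.h>

enum status_bits { PENDING_WRITE = 0x01, PENDING_READ = 0x02 };

/* returns true if queued rendering must be flushed before the
 * CPU touches the buffer:
 */
static bool needs_flush(unsigned status, bool cpu_write)
{
	if (cpu_write)
		return status & (PENDING_READ | PENDING_WRITE);
	return status & PENDING_WRITE;
}
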
+ */ struct list_head list; + struct fd_context *pending_ctx; }; -static INLINE struct fd_resource * +static inline struct fd_resource * fd_resource(struct pipe_resource *ptex) { return (struct fd_resource *)ptex; @@ -89,13 +104,13 @@ struct fd_transfer { void *staging; }; -static INLINE struct fd_transfer * +static inline struct fd_transfer * fd_transfer(struct pipe_transfer *ptrans) { return (struct fd_transfer *)ptrans; } -static INLINE struct fd_resource_slice * +static inline struct fd_resource_slice * fd_resource_slice(struct fd_resource *rsc, unsigned level) { assert(level <= rsc->base.b.last_level); @@ -103,7 +118,7 @@ fd_resource_slice(struct fd_resource *rsc, unsigned level) } /* get offset for specified mipmap level and texture/array layer */ -static INLINE uint32_t +static inline uint32_t fd_resource_offset(struct fd_resource *rsc, unsigned level, unsigned layer) { struct fd_resource_slice *slice = fd_resource_slice(rsc, level); diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c index b3b5462b437..b55f5b36ca9 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.c +++ b/src/gallium/drivers/freedreno/freedreno_screen.c @@ -68,7 +68,8 @@ static const struct debug_named_value debug_options[] = { {"fraghalf", FD_DBG_FRAGHALF, "Use half-precision in fragment shader"}, {"nobin", FD_DBG_NOBIN, "Disable hw binning"}, {"optmsgs", FD_DBG_OPTMSGS,"Enable optimizer debug messages"}, - {"glsl120", FD_DBG_GLSL120,"Temporary flag to force GLSL 120 (rather than 130) on a3xx+"}, + {"glsl120", FD_DBG_GLSL120,"Temporary flag to force GLSL 1.20 (rather than 1.30) on a3xx+"}, + {"shaderdb", FD_DBG_SHADERDB, "Enable shaderdb output"}, DEBUG_NAMED_VALUE_END }; @@ -163,9 +164,6 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_TEXTURE_BARRIER: case PIPE_CAP_TEXTURE_MIRROR_CLAMP: case PIPE_CAP_CUBE_MAP_ARRAY: - case PIPE_CAP_TEXTURE_BUFFER_OBJECTS: - case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT: - case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE: case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS: case PIPE_CAP_START_INSTANCE: case PIPE_CAP_COMPUTE: @@ -175,10 +173,23 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_PRIMITIVE_RESTART: case PIPE_CAP_TGSI_INSTANCEID: case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR: - return is_a3xx(screen) || is_a4xx(screen); - case PIPE_CAP_INDEP_BLEND_ENABLE: case PIPE_CAP_INDEP_BLEND_FUNC: + case PIPE_CAP_TEXTURE_BUFFER_OBJECTS: + return is_a3xx(screen) || is_a4xx(screen); + + case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT: + /* ignoring first/last_element.. but I guess that should be + * easy to add.. + */ + return 0; + case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE: + /* I think 32k on a4xx.. and we could possibly emulate more + * by pretending 2d/rect textures and splitting high bits + * of index into 2nd dimension.. + */ + return 16383; + case PIPE_CAP_DEPTH_CLIP_DISABLE: return is_a3xx(screen); @@ -188,7 +199,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_GLSL_FEATURE_LEVEL: if (glsl120) return 120; - return (is_a3xx(screen) || is_a4xx(screen)) ? 130 : 120; + return is_ir3(screen) ? 130 : 120; /* Unsupported features. 
*/ case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: @@ -218,6 +229,10 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: + case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: + case PIPE_CAP_TEXTURE_FLOAT_LINEAR: + case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: + case PIPE_CAP_DEPTH_BOUNDS_TEST: return 0; case PIPE_CAP_MAX_VIEWPORTS: @@ -225,9 +240,17 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) /* Stream output. */ case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: + if (is_ir3(screen)) + return PIPE_MAX_SO_BUFFERS; + return 0; case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME: + if (is_ir3(screen)) + return 1; + return 0; case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS: case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS: + if (is_ir3(screen)) + return 16 * 4; /* should only be shader out limit? */ return 0; /* Geometry shader output, unsupported. */ @@ -258,9 +281,6 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_QUERY_TIMESTAMP: return 0; case PIPE_CAP_OCCLUSION_QUERY: - /* TODO still missing on a4xx, but we lie to get gl2.. - * it's not a feature, it's a bug! - */ return is_a3xx(screen) || is_a4xx(screen); case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET: @@ -357,7 +377,7 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, */ return ((is_a3xx(screen) || is_a4xx(screen)) ? 4096 : 64) * sizeof(float[4]); case PIPE_SHADER_CAP_MAX_CONST_BUFFERS: - return (is_a3xx(screen) || is_a4xx(screen)) ? 16 : 1; + return is_ir3(screen) ? 16 : 1; case PIPE_SHADER_CAP_MAX_PREDS: return 0; /* nothing uses this */ case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED: @@ -379,7 +399,7 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, case PIPE_SHADER_CAP_INTEGERS: if (glsl120) return 0; - return (is_a3xx(screen) || is_a4xx(screen)) ? 1 : 0; + return is_ir3(screen) ? 1 : 0; case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS: case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS: return 16; @@ -546,7 +566,6 @@ fd_screen_create(struct fd_device *dev) pscreen->get_timestamp = fd_screen_get_timestamp; pscreen->fence_reference = fd_screen_fence_ref; - pscreen->fence_signalled = fd_screen_fence_signalled; pscreen->fence_finish = fd_screen_fence_finish; util_format_s3tc_init(); diff --git a/src/gallium/drivers/freedreno/freedreno_screen.h b/src/gallium/drivers/freedreno/freedreno_screen.h index dbc2808262a..4e5c3a61958 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.h +++ b/src/gallium/drivers/freedreno/freedreno_screen.h @@ -56,7 +56,7 @@ struct fd_screen { int64_t cpu_gpu_time_delta; }; -static INLINE struct fd_screen * +static inline struct fd_screen * fd_screen(struct pipe_screen *pscreen) { return (struct fd_screen *)pscreen; @@ -73,6 +73,7 @@ struct fd_bo * fd_screen_bo_from_handle(struct pipe_screen *pscreen, struct pipe_screen * fd_screen_create(struct fd_device *dev); /* is a3xx patch revision 0? */ +/* TODO a306.0 probably doesn't need this.. be more clever?? */ static inline boolean is_a3xx_p0(struct fd_screen *screen) { @@ -91,4 +92,11 @@ is_a4xx(struct fd_screen *screen) return (screen->gpu_id >= 400) && (screen->gpu_id < 500); } +/* is it using the ir3 compiler (shader isa introduced with a3xx)? 
*/ +static inline boolean +is_ir3(struct fd_screen *screen) +{ + return is_a3xx(screen) || is_a4xx(screen); +} + #endif /* FREEDRENO_SCREEN_H_ */ diff --git a/src/gallium/drivers/freedreno/freedreno_state.c b/src/gallium/drivers/freedreno/freedreno_state.c index 77aa4f21d3b..7bf8bdb4507 100644 --- a/src/gallium/drivers/freedreno/freedreno_state.c +++ b/src/gallium/drivers/freedreno/freedreno_state.c @@ -300,6 +300,67 @@ fd_vertex_state_bind(struct pipe_context *pctx, void *hwcso) ctx->dirty |= FD_DIRTY_VTXSTATE; } +static struct pipe_stream_output_target * +fd_create_stream_output_target(struct pipe_context *pctx, + struct pipe_resource *prsc, unsigned buffer_offset, + unsigned buffer_size) +{ + struct pipe_stream_output_target *target; + + target = CALLOC_STRUCT(pipe_stream_output_target); + if (!target) + return NULL; + + pipe_reference_init(&target->reference, 1); + pipe_resource_reference(&target->buffer, prsc); + + target->context = pctx; + target->buffer_offset = buffer_offset; + target->buffer_size = buffer_size; + + return target; +} + +static void +fd_stream_output_target_destroy(struct pipe_context *pctx, + struct pipe_stream_output_target *target) +{ + pipe_resource_reference(&target->buffer, NULL); + FREE(target); +} + +static void +fd_set_stream_output_targets(struct pipe_context *pctx, + unsigned num_targets, struct pipe_stream_output_target **targets, + const unsigned *offsets) +{ + struct fd_context *ctx = fd_context(pctx); + struct fd_streamout_stateobj *so = &ctx->streamout; + unsigned i; + + debug_assert(num_targets <= ARRAY_SIZE(so->targets)); + + for (i = 0; i < num_targets; i++) { + boolean changed = targets[i] != so->targets[i]; + boolean append = (offsets[i] == (unsigned)-1); + + if (!changed && append) + continue; + + so->offsets[i] = 0; + + pipe_so_target_reference(&so->targets[i], targets[i]); + } + + for (; i < so->num_targets; i++) { + pipe_so_target_reference(&so->targets[i], NULL); + } + + so->num_targets = num_targets; + + ctx->dirty |= FD_DIRTY_STREAMOUT; +} + void fd_state_init(struct pipe_context *pctx) { @@ -328,4 +389,8 @@ fd_state_init(struct pipe_context *pctx) pctx->create_vertex_elements_state = fd_vertex_state_create; pctx->delete_vertex_elements_state = fd_vertex_state_delete; pctx->bind_vertex_elements_state = fd_vertex_state_bind; + + pctx->create_stream_output_target = fd_create_stream_output_target; + pctx->stream_output_target_destroy = fd_stream_output_target_destroy; + pctx->set_stream_output_targets = fd_set_stream_output_targets; } diff --git a/src/gallium/drivers/freedreno/freedreno_surface.c b/src/gallium/drivers/freedreno/freedreno_surface.c index 250fe4bc0f5..70c44eb79c3 100644 --- a/src/gallium/drivers/freedreno/freedreno_surface.c +++ b/src/gallium/drivers/freedreno/freedreno_surface.c @@ -41,7 +41,8 @@ fd_create_surface(struct pipe_context *pctx, // struct fd_resource* tex = fd_resource(ptex); struct fd_surface* surface = CALLOC_STRUCT(fd_surface); - assert(surf_tmpl->u.tex.first_layer == surf_tmpl->u.tex.last_layer); + debug_assert(ptex->target != PIPE_BUFFER); + debug_assert(surf_tmpl->u.tex.first_layer == surf_tmpl->u.tex.last_layer); if (surface) { struct pipe_surface *psurf = &surface->base; diff --git a/src/gallium/drivers/freedreno/freedreno_surface.h b/src/gallium/drivers/freedreno/freedreno_surface.h index 3293f33dd84..2de37cee2dd 100644 --- a/src/gallium/drivers/freedreno/freedreno_surface.h +++ b/src/gallium/drivers/freedreno/freedreno_surface.h @@ -40,7 +40,7 @@ struct fd_surface { uint16_t depth; }; -static INLINE 
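
Note: fd_set_stream_output_targets() above resets the software write offset whenever a target changes or an explicit offset is supplied, and keeps accumulating when asked to append (offset == -1); fd_draw_vbo() then advances every bound target per draw. A compact standalone sketch of that bookkeeping (types simplified, no reference counting; not driver code):

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

#define MAX_SO_BUFFERS 4

struct so_state {
	void *targets[MAX_SO_BUFFERS];
	unsigned num_targets;
	unsigned offsets[MAX_SO_BUFFERS];
};

static void so_set_targets(struct so_state *so, unsigned n,
		void * const *targets, const unsigned *offsets)
{
	assert(n <= MAX_SO_BUFFERS);

	for (unsigned i = 0; i < n; i++) {
		bool changed = targets[i] != so->targets[i];
		bool append = (offsets[i] == (unsigned)-1);

		/* keep the running offset only when appending to an
		 * unchanged target:
		 */
		if (!changed && append)
			continue;

		so->offsets[i] = 0;
		so->targets[i] = targets[i];
	}
	for (unsigned i = n; i < so->num_targets; i++)
		so->targets[i] = NULL;
	so->num_targets = n;
}

/* after each draw, advance every bound target: */
static void so_post_draw(struct so_state *so, unsigned prims)
{
	for (unsigned i = 0; i < so->num_targets; i++)
		so->offsets[i] += prims;
}
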
struct fd_surface * +static inline struct fd_surface * fd_surface(struct pipe_surface *psurf) { return (struct fd_surface *)psurf; diff --git a/src/gallium/drivers/freedreno/freedreno_util.h b/src/gallium/drivers/freedreno/freedreno_util.h index deb0e602ce2..7129a1bddd1 100644 --- a/src/gallium/drivers/freedreno/freedreno_util.h +++ b/src/gallium/drivers/freedreno/freedreno_util.h @@ -40,6 +40,7 @@ #include "util/u_dynarray.h" #include "util/u_pack_color.h" +#include "disasm.h" #include "adreno_common.xml.h" #include "adreno_pm4.xml.h" @@ -53,6 +54,12 @@ enum adreno_stencil_op fd_stencil_op(unsigned op); /* TBD if it is same on a2xx, but for now: */ #define MAX_MIP_LEVELS A3XX_MAX_MIP_LEVELS +#define A2XX_MAX_RENDER_TARGETS 1 +#define A3XX_MAX_RENDER_TARGETS 4 +#define A4XX_MAX_RENDER_TARGETS 8 + +#define MAX_RENDER_TARGETS A4XX_MAX_RENDER_TARGETS + #define FD_DBG_MSGS 0x0001 #define FD_DBG_DISASM 0x0002 #define FD_DBG_DCLEAR 0x0004 @@ -64,6 +71,7 @@ enum adreno_stencil_op fd_stencil_op(unsigned op); #define FD_DBG_NOBIN 0x0100 #define FD_DBG_OPTMSGS 0x0200 #define FD_DBG_GLSL120 0x0400 +#define FD_DBG_SHADERDB 0x0800 extern int fd_mesa_debug; extern bool fd_binning_enabled; @@ -108,6 +116,58 @@ pipe_surface_format(struct pipe_surface *psurf) return psurf->format; } +static inline bool +fd_surface_half_precision(const struct pipe_surface *psurf) +{ + enum pipe_format format; + + if (!psurf) + return true; + + format = psurf->format; + + /* colors are provided in consts, which go through cov.f32f16, which will + * break these values + */ + if (util_format_is_pure_integer(format)) + return false; + + /* avoid losing precision on 32-bit float formats */ + if (util_format_is_float(format) && + util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, 0) == 32) + return false; + + return true; +} + +static inline unsigned +fd_sampler_first_level(const struct pipe_sampler_view *view) +{ + if (view->target == PIPE_BUFFER) + return 0; + return view->u.tex.first_level; +} + +static inline unsigned +fd_sampler_last_level(const struct pipe_sampler_view *view) +{ + if (view->target == PIPE_BUFFER) + return 0; + return view->u.tex.last_level; +} + +static inline bool +fd_half_precision(struct pipe_framebuffer_state *pfb) +{ + unsigned i; + + for (i = 0; i < pfb->nr_cbufs; i++) + if (!fd_surface_half_precision(pfb->cbufs[i])) + return false; + + return true; +} + #define LOG_DWORDS 0 static inline void emit_marker(struct fd_ringbuffer *ring, int scratch_idx); diff --git a/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c index 48ae7c71b9f..83ed5ffdca0 100644 --- a/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c +++ b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c @@ -103,7 +103,7 @@ static void print_reg(reg_t reg, bool full, bool r, bool c, bool im, } else if ((reg.num == REG_P0) && !c) { printf("p0.%c", component[reg.comp]); } else { - printf("%s%c%d.%c", full ? "" : "h", type, reg.num, component[reg.comp]); + printf("%s%c%d.%c", full ? 
"" : "h", type, reg.num & 0x3f, component[reg.comp]); } } @@ -122,6 +122,32 @@ static void print_reg_src(reg_t reg, bool full, bool r, bool c, bool im, print_reg(reg, full, r, c, im, neg, abs, addr_rel); } +/* TODO switch to using reginfo struct everywhere, since more readable + * than passing a bunch of bools to print_reg_src + */ + +struct reginfo { + reg_t reg; + bool full; + bool r; + bool c; + bool im; + bool neg; + bool abs; + bool addr_rel; +}; + +static void print_src(struct reginfo *info) +{ + print_reg_src(info->reg, info->full, info->r, info->c, info->im, + info->neg, info->abs, info->addr_rel); +} + +//static void print_dst(struct reginfo *info) +//{ +// print_reg_dst(info->reg, info->full, info->addr_rel); +//} + static void print_instr_cat0(instr_t *instr) { instr_cat0_t *cat0 = &instr->cat0; @@ -454,10 +480,70 @@ static void print_instr_cat6(instr_t *instr) { instr_cat6_t *cat6 = &instr->cat6; char sd = 0, ss = 0; /* dst/src address space */ - bool full = type_size(cat6->type) == 32; bool nodst = false; + struct reginfo dst, src1, src2; + int src1off = 0, dstoff = 0; - printf(".%s ", type[cat6->type]); + memset(&dst, 0, sizeof(dst)); + memset(&src1, 0, sizeof(src1)); + memset(&src2, 0, sizeof(src2)); + + switch (cat6->opc) { + case OPC_RESINFO: + case OPC_RESFMT: + dst.full = type_size(cat6->type) == 32; + src1.full = type_size(cat6->type) == 32; + src2.full = type_size(cat6->type) == 32; + break; + case OPC_L2G: + case OPC_G2L: + dst.full = true; + src1.full = true; + src2.full = true; + break; + case OPC_STG: + case OPC_STL: + case OPC_STP: + case OPC_STI: + case OPC_STLW: + case OPC_STGB_4D_4: + case OPC_STIB: + dst.full = true; + src1.full = type_size(cat6->type) == 32; + src2.full = type_size(cat6->type) == 32; + break; + default: + dst.full = type_size(cat6->type) == 32; + src1.full = true; + src2.full = true; + break; + } + + switch (cat6->opc) { + case OPC_PREFETCH: + case OPC_RESINFO: + break; + case OPC_ATOMIC_ADD: + case OPC_ATOMIC_SUB: + case OPC_ATOMIC_XCHG: + case OPC_ATOMIC_INC: + case OPC_ATOMIC_DEC: + case OPC_ATOMIC_CMPXCHG: + case OPC_ATOMIC_MIN: + case OPC_ATOMIC_MAX: + case OPC_ATOMIC_AND: + case OPC_ATOMIC_OR: + case OPC_ATOMIC_XOR: + ss = cat6->g ? 'g' : 'l'; + printf(".%c", ss); + printf(".%s", type[cat6->type]); + break; + default: + dst.im = cat6->g && !cat6->dst_off; + printf(".%s", type[cat6->type]); + break; + } + printf(" "); switch (cat6->opc) { case OPC_STG: @@ -499,68 +585,65 @@ static void print_instr_cat6(instr_t *instr) break; case OPC_STI: - full = false; // XXX or inverts?? + dst.full = false; // XXX or inverts?? 
break; } - if (cat6->has_off) { - if (!nodst) { - if (sd) - printf("%c[", sd); - print_reg_dst((reg_t)(cat6->a.dst), full, false); - if (sd) - printf("]"); - printf(", "); - } - if (ss) - printf("%c[", ss); - print_reg_src((reg_t)(cat6->a.src1), true, - false, false, cat6->a.src1_im, false, false, false); - if (cat6->a.off) - printf("%+d", cat6->a.off); - if (ss) - printf("]"); - printf(", "); - print_reg_src((reg_t)(cat6->a.src2), full, - false, false, cat6->a.src2_im, false, false, false); + if (cat6->dst_off) { + dst.reg = (reg_t)(cat6->c.dst); + dstoff = cat6->c.off; } else { - if (!nodst) { - if (sd) - printf("%c[", sd); - print_reg_dst((reg_t)(cat6->b.dst), full, false); - if (sd) - printf("]"); - printf(", "); - } - if (ss) - printf("%c[", ss); - print_reg_src((reg_t)(cat6->b.src1), true, - false, false, cat6->b.src1_im, false, false, false); - if (ss) + dst.reg = (reg_t)(cat6->d.dst); + } + + if (cat6->src_off) { + src1.reg = (reg_t)(cat6->a.src1); + src1.im = cat6->a.src1_im; + src2.reg = (reg_t)(cat6->a.src2); + src2.im = cat6->a.src2_im; + src1off = cat6->a.off; + } else { + src1.reg = (reg_t)(cat6->b.src1); + src1.im = cat6->b.src1_im; + src2.reg = (reg_t)(cat6->b.src2); + src2.im = cat6->b.src2_im; + } + + if (!nodst) { + if (sd) + printf("%c[", sd); + /* note: dst might actually be a src (ie. address to store to) */ + print_src(&dst); + if (dstoff) + printf("%+d", dstoff); + if (sd) printf("]"); printf(", "); - print_reg_src((reg_t)(cat6->b.src2), full, - false, false, cat6->b.src2_im, false, false, false); } - if (debug & PRINT_VERBOSE) { - switch (cat6->opc) { - case OPC_LDG: - case OPC_LDP: - /* load instructions: */ - if (cat6->a.dummy2|cat6->a.dummy3) - printf("\t{6: %x,%x}", cat6->a.dummy2, cat6->a.dummy3); - break; - case OPC_STG: - case OPC_STP: - case OPC_STI: - /* store instructions: */ - if (cat6->b.dummy2|cat6->b.dummy2) - printf("\t{6: %x,%x}", cat6->b.dummy2, cat6->b.dummy3); - if (cat6->b.ignore0) - printf("\t{?? 
%x}", cat6->b.ignore0); - break; - } + if (ss) + printf("%c[", ss); + + /* can have a larger than normal immed, so hack: */ + if (src1.im) { + printf("%u", src1.reg.dummy13); + } else { + print_src(&src1); + } + + if (src1off) + printf("%+d", src1off); + if (ss) + printf("]"); + + switch (cat6->opc) { + case OPC_RESINFO: + case OPC_RESFMT: + break; + default: + printf(", "); + print_src(&src2); + break; } } @@ -711,19 +794,19 @@ struct opc_info { OPC(6, OPC_LDLW, ldlw), OPC(6, OPC_STLW, stlw), OPC(6, OPC_RESFMT, resfmt), - OPC(6, OPC_RESINFO, resinf), - OPC(6, OPC_ATOMIC_ADD_L, atomic.add.l), - OPC(6, OPC_ATOMIC_SUB_L, atomic.sub.l), - OPC(6, OPC_ATOMIC_XCHG_L, atomic.xchg.l), - OPC(6, OPC_ATOMIC_INC_L, atomic.inc.l), - OPC(6, OPC_ATOMIC_DEC_L, atomic.dec.l), - OPC(6, OPC_ATOMIC_CMPXCHG_L, atomic.cmpxchg.l), - OPC(6, OPC_ATOMIC_MIN_L, atomic.min.l), - OPC(6, OPC_ATOMIC_MAX_L, atomic.max.l), - OPC(6, OPC_ATOMIC_AND_L, atomic.and.l), - OPC(6, OPC_ATOMIC_OR_L, atomic.or.l), - OPC(6, OPC_ATOMIC_XOR_L, atomic.xor.l), - OPC(6, OPC_LDGB_TYPED_4D, ldgb.typed.4d), + OPC(6, OPC_RESINFO, resinfo), + OPC(6, OPC_ATOMIC_ADD, atomic.add), + OPC(6, OPC_ATOMIC_SUB, atomic.sub), + OPC(6, OPC_ATOMIC_XCHG, atomic.xchg), + OPC(6, OPC_ATOMIC_INC, atomic.inc), + OPC(6, OPC_ATOMIC_DEC, atomic.dec), + OPC(6, OPC_ATOMIC_CMPXCHG, atomic.cmpxchg), + OPC(6, OPC_ATOMIC_MIN, atomic.min), + OPC(6, OPC_ATOMIC_MAX, atomic.max), + OPC(6, OPC_ATOMIC_AND, atomic.and), + OPC(6, OPC_ATOMIC_OR, atomic.or), + OPC(6, OPC_ATOMIC_XOR, atomic.xor), + OPC(6, OPC_LDGB_TYPED_4D, ldgb.typed.3d), OPC(6, OPC_STGB_4D_4, stgb.4d.4), OPC(6, OPC_STIB, stib), OPC(6, OPC_LDC_4, ldc.4), diff --git a/src/gallium/drivers/freedreno/ir3/instr-a3xx.h b/src/gallium/drivers/freedreno/ir3/instr-a3xx.h index efb07ea479e..c3fb68d511c 100644 --- a/src/gallium/drivers/freedreno/ir3/instr-a3xx.h +++ b/src/gallium/drivers/freedreno/ir3/instr-a3xx.h @@ -173,17 +173,17 @@ typedef enum { OPC_STLW = 11, OPC_RESFMT = 14, OPC_RESINFO = 15, - OPC_ATOMIC_ADD_L = 16, - OPC_ATOMIC_SUB_L = 17, - OPC_ATOMIC_XCHG_L = 18, - OPC_ATOMIC_INC_L = 19, - OPC_ATOMIC_DEC_L = 20, - OPC_ATOMIC_CMPXCHG_L = 21, - OPC_ATOMIC_MIN_L = 22, - OPC_ATOMIC_MAX_L = 23, - OPC_ATOMIC_AND_L = 24, - OPC_ATOMIC_OR_L = 25, - OPC_ATOMIC_XOR_L = 26, + OPC_ATOMIC_ADD = 16, + OPC_ATOMIC_SUB = 17, + OPC_ATOMIC_XCHG = 18, + OPC_ATOMIC_INC = 19, + OPC_ATOMIC_DEC = 20, + OPC_ATOMIC_CMPXCHG = 21, + OPC_ATOMIC_MIN = 22, + OPC_ATOMIC_MAX = 23, + OPC_ATOMIC_AND = 24, + OPC_ATOMIC_OR = 25, + OPC_ATOMIC_XOR = 26, OPC_LDGB_TYPED_4D = 27, OPC_STGB_4D_4 = 28, OPC_STIB = 29, @@ -575,7 +575,7 @@ typedef struct PACKED { uint32_t opc_cat : 3; } instr_cat5_t; -/* [src1 + off], src2: */ +/* dword0 encoding for src_off: [src1 + off], src2: */ typedef struct PACKED { /* dword0: */ uint32_t mustbe1 : 1; @@ -586,37 +586,50 @@ typedef struct PACKED { uint32_t src2 : 8; /* dword1: */ - uint32_t dst : 8; - uint32_t dummy2 : 9; - uint32_t type : 3; - uint32_t dummy3 : 2; - uint32_t opc : 5; - uint32_t jmp_tgt : 1; - uint32_t sync : 1; - uint32_t opc_cat : 3; + uint32_t dword1; } instr_cat6a_t; -/* [src1], src2: */ +/* dword0 encoding for !src_off: [src1], src2 */ typedef struct PACKED { /* dword0: */ uint32_t mustbe0 : 1; - uint32_t src1 : 8; - uint32_t ignore0 : 13; + uint32_t src1 : 13; + uint32_t ignore0 : 8; uint32_t src1_im : 1; uint32_t src2_im : 1; uint32_t src2 : 8; /* dword1: */ - uint32_t dst : 8; - uint32_t dummy2 : 9; - uint32_t type : 3; - uint32_t dummy3 : 2; - uint32_t opc : 5; - uint32_t jmp_tgt : 1; - uint32_t 
sync : 1; - uint32_t opc_cat : 3; + uint32_t dword1; } instr_cat6b_t; +/* dword1 encoding for dst_off: */ +typedef struct PACKED { + /* dword0: */ + uint32_t dword0; + + /* note: there is some weird stuff going on where sometimes + * cat6->a.off is involved.. but that seems like a bug in + * the blob, since it is used even if !cat6->src_off + * It would make sense for there to be some more bits to + * bring us to 11 bits worth of offset, but not sure.. + */ + int32_t off : 8; + uint32_t mustbe1 : 1; + uint32_t dst : 8; + uint32_t pad1 : 15; +} instr_cat6c_t; + +/* dword1 encoding for !dst_off: */ +typedef struct PACKED { + /* dword0: */ + uint32_t dword0; + + uint32_t dst : 8; + uint32_t mustbe0 : 1; + uint32_t pad0 : 23; +} instr_cat6d_t; + /* I think some of the other cat6 instructions use additional * sub-encodings.. */ @@ -624,16 +637,20 @@ typedef struct PACKED { typedef union PACKED { instr_cat6a_t a; instr_cat6b_t b; + instr_cat6c_t c; + instr_cat6d_t d; struct PACKED { /* dword0: */ - uint32_t has_off : 1; + uint32_t src_off : 1; uint32_t pad1 : 31; /* dword1: */ - uint32_t dst : 8; - uint32_t dummy2 : 9; + uint32_t pad2 : 8; + uint32_t dst_off : 1; + uint32_t pad3 : 8; uint32_t type : 3; - uint32_t dummy3 : 2; + uint32_t g : 1; /* or in some cases it means dst immed */ + uint32_t pad4 : 1; uint32_t opc : 5; uint32_t jmp_tgt : 1; uint32_t sync : 1; diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c index a166b67d7cf..b24825cff85 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3.c +++ b/src/gallium/drivers/freedreno/ir3/ir3.c @@ -499,32 +499,51 @@ static int emit_cat5(struct ir3_instruction *instr, void *ptr, static int emit_cat6(struct ir3_instruction *instr, void *ptr, struct ir3_info *info) { - struct ir3_register *dst = instr->regs[0]; - struct ir3_register *src1 = instr->regs[1]; - struct ir3_register *src2 = (instr->regs_count >= 3) ? instr->regs[2] : NULL; + struct ir3_register *dst, *src1, *src2; instr_cat6_t *cat6 = ptr; - iassert(instr->regs_count >= 2); + /* the "dst" for a store instruction is (from the perspective + * of data flow in the shader, ie. register use/def, etc) in + * fact a register that is read by the instruction, rather + * than written: + */ + if (is_store(instr)) { + iassert(instr->regs_count >= 3); - if (instr->cat6.offset || instr->opc == OPC_LDG) { + dst = instr->regs[1]; + src1 = instr->regs[2]; + src2 = (instr->regs_count >= 4) ? instr->regs[3] : NULL; + } else { + iassert(instr->regs_count >= 2); + + dst = instr->regs[0]; + src1 = instr->regs[1]; + src2 = (instr->regs_count >= 3) ? instr->regs[2] : NULL; + } + + + /* TODO we need a more comprehensive list about which instructions + * can be encoded which way. Or possibly use IR3_INSTR_0 flag to + * indicate to use the src_off encoding even if offset is zero + * (but then what to do about dst_off?) 
+ */ + if (instr->cat6.src_offset || (instr->opc == OPC_LDG)) { instr_cat6a_t *cat6a = ptr; - cat6->has_off = true; + cat6->src_off = true; - cat6a->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); cat6a->src1 = reg(src1, info, instr->repeat, IR3_REG_IMMED); cat6a->src1_im = !!(src1->flags & IR3_REG_IMMED); if (src2) { cat6a->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED); cat6a->src2_im = !!(src2->flags & IR3_REG_IMMED); } - cat6a->off = instr->cat6.offset; + cat6a->off = instr->cat6.src_offset; } else { instr_cat6b_t *cat6b = ptr; - cat6->has_off = false; + cat6->src_off = false; - cat6b->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); cat6b->src1 = reg(src1, info, instr->repeat, IR3_REG_IMMED); cat6b->src1_im = !!(src1->flags & IR3_REG_IMMED); if (src2) { @@ -533,10 +552,22 @@ static int emit_cat6(struct ir3_instruction *instr, void *ptr, } } + if (instr->cat6.dst_offset || (instr->opc == OPC_STG)) { + instr_cat6c_t *cat6c = ptr; + cat6->dst_off = true; + cat6c->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); + cat6c->off = instr->cat6.dst_offset; + } else { + instr_cat6d_t *cat6d = ptr; + cat6->dst_off = false; + cat6d->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); + } + cat6->type = instr->cat6.type; cat6->opc = instr->opc; cat6->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); cat6->sync = !!(instr->flags & IR3_INSTR_SY); + cat6->g = !!(instr->flags & IR3_INSTR_G); cat6->opc_cat = 6; return 0; @@ -669,7 +700,6 @@ struct ir3_instruction * ir3_instr_create(struct ir3_block *block, return ir3_instr_create2(block, category, opc, 4); } -/* only used by old compiler: */ struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr) { struct ir3_instruction *new_instr = instr_create(instr->block, @@ -708,6 +738,17 @@ struct ir3_register * ir3_reg_create(struct ir3_instruction *instr, } void +ir3_instr_set_address(struct ir3_instruction *instr, + struct ir3_instruction *addr) +{ + if (instr->address != addr) { + struct ir3 *ir = instr->block->shader; + instr->address = addr; + array_insert(ir->indirects, instr); + } +} + +void ir3_block_clear_mark(struct ir3_block *block) { list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) @@ -723,15 +764,16 @@ ir3_clear_mark(struct ir3 *ir) } /* note: this will destroy instr->depth, don't do it until after sched! */ -void +unsigned ir3_count_instructions(struct ir3 *ir) { - unsigned ip = 0; + unsigned cnt = 0; list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { - instr->ip = ip++; + instr->ip = cnt++; } block->start_ip = list_first_entry(&block->instr_list, struct ir3_instruction, node)->ip; block->end_ip = list_last_entry(&block->instr_list, struct ir3_instruction, node)->ip; } + return cnt; } diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h index 9c35a763d58..12f2ebe18db 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3.h +++ b/src/gallium/drivers/freedreno/ir3/ir3.h @@ -172,6 +172,7 @@ struct ir3_instruction { IR3_INSTR_P = 0x080, IR3_INSTR_S = 0x100, IR3_INSTR_S2EN = 0x200, + IR3_INSTR_G = 0x400, /* meta-flags, for intermediate stages of IR, ie. 
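
Note: with the reworked encodings above, the two flag bits shared by all cat6 variants select which sub-struct is live: src_off (dword0 bit 0) picks instr_cat6a over instr_cat6b, and dst_off (dword1 bit 8) picks instr_cat6c over instr_cat6d. A small decode sketch under that assumption (LSB-first bitfield packing, as these headers assume; this is not the actual disassembler):

#include <stdint.h>
#include <stdio.h>

static void cat6_which_encoding(uint32_t dword0, uint32_t dword1)
{
	int src_off = dword0 & 1;        /* instr_cat6a vs instr_cat6b */
	int dst_off = (dword1 >> 8) & 1; /* instr_cat6c vs instr_cat6d */

	printf("src: %s, dst: %s\n",
			src_off ? "cat6a [src1 + off], src2" : "cat6b [src1], src2",
			dst_off ? "cat6c [dst + off]" : "cat6d [dst]");
}
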
* before register assignment is done: */ @@ -209,7 +210,8 @@ struct ir3_instruction { } cat5; struct { type_t type; - int offset; + int src_offset; + int dst_offset; int iim_val; } cat6; /* for meta-instructions, just used to hold extra data @@ -285,6 +287,8 @@ struct ir3_instruction { /* an instruction can reference at most one address register amongst * it's src/dst registers. Beyond that, you need to insert mov's. + * + * NOTE: do not write this directly, use ir3_instr_set_address() */ struct ir3_instruction *address; @@ -365,6 +369,12 @@ struct ir3 { unsigned predicates_count, predicates_sz; struct ir3_instruction **predicates; + /* Track instructions which do not write a register but other- + * wise must not be discarded (such as kill, stg, etc) + */ + unsigned keeps_count, keeps_sz; + struct ir3_instruction **keeps; + /* List of blocks: */ struct list_head block_list; @@ -420,6 +430,9 @@ const char *ir3_instr_name(struct ir3_instruction *instr); struct ir3_register * ir3_reg_create(struct ir3_instruction *instr, int num, int flags); +void ir3_instr_set_address(struct ir3_instruction *instr, + struct ir3_instruction *addr); + static inline bool ir3_instr_check_mark(struct ir3_instruction *instr) { if (instr->flags & IR3_INSTR_MARK) @@ -431,7 +444,7 @@ static inline bool ir3_instr_check_mark(struct ir3_instruction *instr) void ir3_block_clear_mark(struct ir3_block *block); void ir3_clear_mark(struct ir3 *shader); -void ir3_count_instructions(struct ir3 *ir); +unsigned ir3_count_instructions(struct ir3 *ir); static inline int ir3_instr_regno(struct ir3_instruction *instr, struct ir3_register *reg) @@ -547,6 +560,26 @@ is_store(struct ir3_instruction *instr) return false; } +static inline bool is_load(struct ir3_instruction *instr) +{ + if (is_mem(instr)) { + switch (instr->opc) { + case OPC_LDG: + case OPC_LDL: + case OPC_LDP: + case OPC_L2G: + case OPC_LDLW: + case OPC_LDC_4: + case OPC_LDLV: + /* probably some others too.. */ + return true; + default: + break; + } + } + return false; +} + static inline bool is_input(struct ir3_instruction *instr) { /* in some cases, ldlv is used to fetch varying without @@ -1036,6 +1069,7 @@ ir3_SAM(struct ir3_block *block, opc_t opc, type_t type, /* cat6 instructions: */ INSTR2(6, LDLV) INSTR2(6, LDG) +INSTR3(6, STG) /* ************************************************************************* */ /* split this out or find some helper to use.. like main/bitset.h.. */ diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c index ad9d2719d59..ede29f445dc 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c @@ -43,127 +43,15 @@ #include "instr-a3xx.h" #include "ir3.h" -static void dump_reg(const char *name, uint32_t r) -{ - if (r != regid(63,0)) - debug_printf("; %s: r%d.%c\n", name, r >> 2, "xyzw"[r & 0x3]); -} - -static void dump_semantic(struct ir3_shader_variant *so, - unsigned sem, const char *name) -{ - uint32_t regid; - regid = ir3_find_output_regid(so, ir3_semantic_name(sem, 0)); - dump_reg(name, regid); -} - static void dump_info(struct ir3_shader_variant *so, const char *str) { uint32_t *bin; - const char *type = (so->type == SHADER_VERTEX) ? 
"VERT" : "FRAG"; - - // for debug, dump some before/after info: + const char *type = ir3_shader_stage(so->shader); // TODO make gpu_id configurable on cmdline bin = ir3_shader_assemble(so, 320); - if (fd_mesa_debug & FD_DBG_DISASM) { - struct ir3 *ir = so->ir; - struct ir3_register *reg; - uint8_t regid; - unsigned i; - - debug_printf("; %s: %s\n", type, str); - - for (i = 0; i < ir->ninputs; i++) { - if (!ir->inputs[i]) { - debug_printf("; in%d unused\n", i); - continue; - } - reg = ir->inputs[i]->regs[0]; - regid = reg->num; - debug_printf("@in(%sr%d.%c)\tin%d\n", - (reg->flags & IR3_REG_HALF) ? "h" : "", - (regid >> 2), "xyzw"[regid & 0x3], i); - } - - for (i = 0; i < ir->noutputs; i++) { - if (!ir->outputs[i]) { - debug_printf("; out%d unused\n", i); - continue; - } - /* kill shows up as a virtual output.. skip it! */ - if (is_kill(ir->outputs[i])) - continue; - reg = ir->outputs[i]->regs[0]; - regid = reg->num; - debug_printf("@out(%sr%d.%c)\tout%d\n", - (reg->flags & IR3_REG_HALF) ? "h" : "", - (regid >> 2), "xyzw"[regid & 0x3], i); - } - - for (i = 0; i < so->immediates_count; i++) { - debug_printf("@const(c%d.x)\t", so->first_immediate + i); - debug_printf("0x%08x, 0x%08x, 0x%08x, 0x%08x\n", - so->immediates[i].val[0], - so->immediates[i].val[1], - so->immediates[i].val[2], - so->immediates[i].val[3]); - } - - disasm_a3xx(bin, so->info.sizedwords, 0, so->type); - - debug_printf("; %s: outputs:", type); - for (i = 0; i < so->outputs_count; i++) { - uint8_t regid = so->outputs[i].regid; - ir3_semantic sem = so->outputs[i].semantic; - debug_printf(" r%d.%c (%u:%u)", - (regid >> 2), "xyzw"[regid & 0x3], - sem2name(sem), sem2idx(sem)); - } - debug_printf("\n"); - debug_printf("; %s: inputs:", type); - for (i = 0; i < so->inputs_count; i++) { - uint8_t regid = so->inputs[i].regid; - ir3_semantic sem = so->inputs[i].semantic; - debug_printf(" r%d.%c (%u:%u,cm=%x,il=%u,b=%u)", - (regid >> 2), "xyzw"[regid & 0x3], - sem2name(sem), sem2idx(sem), - so->inputs[i].compmask, - so->inputs[i].inloc, - so->inputs[i].bary); - } - debug_printf("\n"); - } - - /* print generic shader info: */ - debug_printf("; %s: %u instructions, %d half, %d full\n", type, - so->info.instrs_count, - so->info.max_half_reg + 1, - so->info.max_reg + 1); - - /* print shader type specific info: */ - switch (so->type) { - case SHADER_VERTEX: - dump_semantic(so, TGSI_SEMANTIC_POSITION, "pos"); - dump_semantic(so, TGSI_SEMANTIC_PSIZE, "psize"); - break; - case SHADER_FRAGMENT: - dump_reg("pos (bary)", so->pos_regid); - dump_semantic(so, TGSI_SEMANTIC_POSITION, "posz"); - dump_semantic(so, TGSI_SEMANTIC_COLOR, "color"); - /* these two are hard-coded since we don't know how to - * program them to anything but all 0's... 
- */ - if (so->frag_coord) - debug_printf("; fragcoord: r0.x\n"); - if (so->frag_face) - debug_printf("; fragface: hr0.x\n"); - break; - case SHADER_COMPUTE: - break; - } + debug_printf("; %s: %s\n", type, str); + ir3_shader_disasm(so, bin); free(bin); - - debug_printf("\n"); } @@ -205,8 +93,7 @@ static void print_usage(void) printf(" --saturate-s MASK - bitmask of samplers to saturate S coord\n"); printf(" --saturate-t MASK - bitmask of samplers to saturate T coord\n"); printf(" --saturate-r MASK - bitmask of samplers to saturate R coord\n"); - printf(" --nocp - disable copy propagation\n"); - printf(" --nir - use NIR compiler\n"); + printf(" --stream-out - enable stream-out (aka transform feedback)\n"); printf(" --help - show this message\n"); } @@ -218,6 +105,7 @@ int main(int argc, char **argv) struct tgsi_parse_context parse; struct ir3_compiler *compiler; struct ir3_shader_variant v; + struct ir3_shader s; struct ir3_shader_key key = {}; const char *info; void *ptr; @@ -225,6 +113,9 @@ int main(int argc, char **argv) fd_mesa_debug |= FD_DBG_DISASM; + memset(&s, 0, sizeof(s)); + memset(&v, 0, sizeof(v)); + /* cmdline args which impact shader variant get spit out in a * comment on the first line.. a quick/dirty way to preserve * that info so when ir3test recompiles the shader with a new @@ -281,6 +172,24 @@ int main(int argc, char **argv) continue; } + if (!strcmp(argv[n], "--stream-out")) { + struct pipe_stream_output_info *so = &s.stream_output; + debug_printf(" %s", argv[n]); + /* TODO more dynamic config based on number of outputs, etc + * rather than just hard-code for first output: + */ + so->num_outputs = 1; + so->stride[0] = 4; + so->output[0].register_index = 0; + so->output[0].start_component = 0; + so->output[0].num_components = 4; + so->output[0].output_buffer = 0; + so->output[0].dst_offset = 2; + so->output[0].stream = 0; + n++; + continue; + } + if (!strcmp(argv[n], "--help")) { print_usage(); return 0; @@ -292,9 +201,6 @@ int main(int argc, char **argv) filename = argv[n]; - memset(&v, 0, sizeof(v)); - v.key = key; - ret = read_file(filename, &ptr, &size); if (ret) { print_usage(); @@ -307,16 +213,21 @@ int main(int argc, char **argv) if (!tgsi_text_translate(ptr, toks, Elements(toks))) errx(1, "could not parse `%s'", filename); + s.tokens = toks; + + v.key = key; + v.shader = &s; + tgsi_parse_init(&parse, toks); switch (parse.FullHeader.Processor.Processor) { case TGSI_PROCESSOR_FRAGMENT: - v.type = SHADER_FRAGMENT; + s.type = v.type = SHADER_FRAGMENT; break; case TGSI_PROCESSOR_VERTEX: - v.type = SHADER_VERTEX; + s.type = v.type = SHADER_VERTEX; break; case TGSI_PROCESSOR_COMPUTE: - v.type = SHADER_COMPUTE; + s.type = v.type = SHADER_COMPUTE; break; } @@ -324,7 +235,7 @@ int main(int argc, char **argv) compiler = ir3_compiler_create(320); info = "NIR compiler"; - ret = ir3_compile_shader_nir(compiler, &v, toks, key); + ret = ir3_compile_shader_nir(compiler, &v); if (ret) { fprintf(stderr, "compiler failed!\n"); return ret; diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.h b/src/gallium/drivers/freedreno/ir3/ir3_compiler.h index 86b1161d9cb..697afeba61a 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_compiler.h +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler.h @@ -36,14 +36,13 @@ struct ir3_ra_reg_set; struct ir3_compiler { uint32_t gpu_id; struct ir3_ra_reg_set *set; + uint32_t shader_count; }; struct ir3_compiler * ir3_compiler_create(uint32_t gpu_id); void ir3_compiler_destroy(struct ir3_compiler *compiler); int ir3_compile_shader_nir(struct 
ir3_compiler *compiler, - struct ir3_shader_variant *so, - const struct tgsi_token *tokens, - struct ir3_shader_key key); + struct ir3_shader_variant *so); #endif /* IR3_COMPILER_H_ */ diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c index 48b1d8f3606..0ab33455ed1 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c @@ -117,10 +117,6 @@ struct ir3_compile { /* for looking up which system value is which */ unsigned sysval_semantics[8]; - /* list of kill instructions: */ - struct ir3_instruction *kill[16]; - unsigned int kill_count; - /* set if we encounter something we can't handle yet, so we * can bail cleanly and fallback to TGSI compiler f/e */ @@ -153,6 +149,7 @@ static struct nir_shader *to_nir(const struct tgsi_token *tokens) nir_opt_global_to_local(s); nir_convert_to_ssa(s); nir_lower_idiv(s); + nir_lower_load_const_to_scalar(s); do { progress = false; @@ -261,13 +258,29 @@ compile_init(struct ir3_compiler *compiler, so->first_driver_param = so->first_immediate = ctx->s->num_uniforms; - /* one (vec4) slot for vertex id base: */ - if (so->type == SHADER_VERTEX) - so->first_immediate++; + /* Layout of constant registers: + * + * num_uniform * vec4 - user consts + * 4 * vec4 - UBO addresses + * if (vertex shader) { + * 1 * vec4 - driver params (IR3_DP_*) + * 1 * vec4 - stream-out addresses + * } + * + * TODO this could be made more dynamic, to at least skip sections + * that we don't need.. + */ /* reserve 4 (vec4) slots for ubo base addresses: */ so->first_immediate += 4; + if (so->type == SHADER_VERTEX) { + /* one (vec4) slot for driver params (see ir3_driver_param): */ + so->first_immediate++; + /* one (vec4) slot for stream-output base addresses: */ + so->first_immediate++; + } + return ctx; } @@ -637,9 +650,8 @@ create_uniform_indirect(struct ir3_compile *ctx, unsigned n, mov->cat1.dst_type = TYPE_U32; ir3_reg_create(mov, 0, 0); ir3_reg_create(mov, n, IR3_REG_CONST | IR3_REG_RELATIV); - mov->address = address; - array_insert(ctx->ir->indirects, mov); + ir3_instr_set_address(mov, address); return mov; } @@ -677,9 +689,8 @@ create_indirect_load(struct ir3_compile *ctx, unsigned arrsz, unsigned n, src->instr = collect; src->size = arrsz; src->offset = n; - mov->address = address; - array_insert(ctx->ir->indirects, mov); + ir3_instr_set_address(mov, address); return mov; } @@ -700,25 +711,21 @@ create_indirect_store(struct ir3_compile *ctx, unsigned arrsz, unsigned n, dst->size = arrsz; dst->offset = n; ir3_reg_create(mov, 0, IR3_REG_SSA)->instr = src; - mov->address = address; mov->fanin = collect; - array_insert(ctx->ir->indirects, mov); + ir3_instr_set_address(mov, address); return mov; } static struct ir3_instruction * -create_input(struct ir3_block *block, struct ir3_instruction *instr, - unsigned n) +create_input(struct ir3_block *block, unsigned n) { struct ir3_instruction *in; in = ir3_instr_create(block, -1, OPC_META_INPUT); in->inout.block = block; ir3_reg_create(in, n, 0); - if (instr) - ir3_reg_create(in, 0, IR3_REG_SSA)->instr = instr; return in; } @@ -750,7 +757,7 @@ create_frag_coord(struct ir3_compile *ctx, unsigned comp) compile_assert(ctx, !ctx->frag_coord[comp]); - ctx->frag_coord[comp] = create_input(ctx->block, NULL, 0); + ctx->frag_coord[comp] = create_input(ctx->block, 0); switch (comp) { case 0: /* .x */ @@ -789,7 +796,7 @@ create_frag_face(struct ir3_compile *ctx, unsigned comp) case 0: /* .x */ compile_assert(ctx, 
!ctx->frag_face); - ctx->frag_face = create_input(block, NULL, 0); + ctx->frag_face = create_input(block, 0); ctx->frag_face->regs[0]->flags |= IR3_REG_HALF; /* for faceness, we always get -1 or 0 (int).. but TGSI expects @@ -817,6 +824,14 @@ create_frag_face(struct ir3_compile *ctx, unsigned comp) } } +static struct ir3_instruction * +create_driver_param(struct ir3_compile *ctx, enum ir3_driver_param dp) +{ + /* first four vec4 sysval's reserved for UBOs: */ + unsigned r = regid(ctx->so->first_driver_param + 4, dp); + return create_uniform(ctx, r); +} + /* helper for instructions that produce multiple consecutive scalar * outputs which need to have a split/fanout meta instruction inserted */ @@ -1218,7 +1233,7 @@ emit_intrinsic_load_ubo(struct ir3_compile *ctx, nir_intrinsic_instr *intr, struct ir3_instruction *load = ir3_LDG(b, addr, 0, create_immed(b, 1), 0); load->cat6.type = TYPE_U32; - load->cat6.offset = off + i * 4; /* byte offset */ + load->cat6.src_offset = off + i * 4; /* byte offset */ dst[i] = load; } } @@ -1307,7 +1322,7 @@ emit_intrinisic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr) * store_output_indirect? or move this into * create_indirect_store()? */ - for (int j = i; j < arr->length; j += 4) { + for (int j = i; j < arr->length; j += intr->num_components) { struct ir3_instruction *split; split = ir3_instr_create(ctx->block, -1, OPC_META_FO); @@ -1318,6 +1333,13 @@ emit_intrinisic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr) arr->arr[j] = split; } } + /* fixup fanout/split neighbors: */ + for (int i = 0; i < arr->length; i++) { + arr->arr[i]->cp.right = (i < (arr->length - 1)) ? + arr->arr[i+1] : NULL; + arr->arr[i]->cp.left = (i > 0) ? + arr->arr[i-1] : NULL; + } break; } default: @@ -1372,6 +1394,11 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr) dst[i] = create_uniform_indirect(ctx, n, get_addr(ctx, src[0])); } + /* NOTE: if relative addressing is used, we set constlen in + * the compiler (to worst-case value) since we don't know in + * the assembler what the max addr reg value can be: + */ + ctx->so->constlen = ctx->s->num_uniforms; break; case nir_intrinsic_load_ubo: case nir_intrinsic_load_ubo_indirect: @@ -1409,9 +1436,7 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr) break; case nir_intrinsic_load_base_vertex: if (!ctx->basevertex) { - /* first four vec4 sysval's reserved for UBOs: */ - unsigned r = regid(ctx->so->first_driver_param + 4, 0); - ctx->basevertex = create_uniform(ctx, r); + ctx->basevertex = create_driver_param(ctx, IR3_DP_VTXID_BASE); add_sysval_input(ctx, TGSI_SEMANTIC_BASEVERTEX, ctx->basevertex); } @@ -1419,7 +1444,7 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr) break; case nir_intrinsic_load_vertex_id_zero_base: if (!ctx->vertex_id) { - ctx->vertex_id = create_input(ctx->block, NULL, 0); + ctx->vertex_id = create_input(ctx->block, 0); add_sysval_input(ctx, TGSI_SEMANTIC_VERTEXID_NOBASE, ctx->vertex_id); } @@ -1427,7 +1452,7 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr) break; case nir_intrinsic_load_instance_id: if (!ctx->instance_id) { - ctx->instance_id = create_input(ctx->block, NULL, 0); + ctx->instance_id = create_input(ctx->block, 0); add_sysval_input(ctx, TGSI_SEMANTIC_INSTANCEID, ctx->instance_id); } @@ -1456,7 +1481,7 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr) kill = ir3_KILL(b, cond, 0); array_insert(ctx->ir->predicates, kill); - ctx->kill[ctx->kill_count++] = kill; + 
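
Note: the constant-register layout spelled out in compile_init() above makes the addressing in create_driver_param() mechanical. A sketch of the same arithmetic, assuming the layout comment holds (the helper name is hypothetical):

/* vec4 slots, in order: user consts, 4 UBO address slots, then for
 * vertex shaders one driver-param slot and one stream-out slot:
 */
static unsigned first_immediate(unsigned num_uniforms, int is_vertex)
{
	unsigned n = num_uniforms;
	n += 4;              /* UBO base addresses */
	if (is_vertex) {
		n += 1;      /* driver params (IR3_DP_*) */
		n += 1;      /* stream-out base addresses */
	}
	return n;
}
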
array_insert(ctx->ir->keeps, kill); ctx->so->has_kill = true; break; @@ -1950,6 +1975,115 @@ emit_cf_list(struct ir3_compile *ctx, struct exec_list *list) } } +/* emit stream-out code. At this point, the current block is the original + * (nir) end block, and nir ensures that all flow control paths terminate + * into the end block. We re-purpose the original end block to generate + * the 'if (vtxcnt < maxvtxcnt)' condition, then append the conditional + * block holding stream-out write instructions, followed by the new end + * block: + * + * blockOrigEnd { + * p0.x = (vtxcnt < maxvtxcnt) + * // succs: blockStreamOut, blockNewEnd + * } + * blockStreamOut { + * ... stream-out instructions ... + * // succs: blockNewEnd + * } + * blockNewEnd { + * } + */ +static void +emit_stream_out(struct ir3_compile *ctx) +{ + struct ir3_shader_variant *v = ctx->so; + struct ir3 *ir = ctx->ir; + struct pipe_stream_output_info *strmout = + &ctx->so->shader->stream_output; + struct ir3_block *orig_end_block, *stream_out_block, *new_end_block; + struct ir3_instruction *vtxcnt, *maxvtxcnt, *cond; + struct ir3_instruction *bases[PIPE_MAX_SO_BUFFERS]; + + /* create vtxcnt input in input block at top of shader, + * so that it is seen as live over the entire duration + * of the shader: + */ + vtxcnt = create_input(ctx->in_block, 0); + add_sysval_input(ctx, IR3_SEMANTIC_VTXCNT, vtxcnt); + + maxvtxcnt = create_driver_param(ctx, IR3_DP_VTXCNT_MAX); + + /* at this point, we are at the original 'end' block, + * re-purpose this block to stream-out condition, then + * append stream-out block and new-end block + */ + orig_end_block = ctx->block; + + stream_out_block = ir3_block_create(ir); + list_addtail(&stream_out_block->node, &ir->block_list); + + new_end_block = ir3_block_create(ir); + list_addtail(&new_end_block->node, &ir->block_list); + + orig_end_block->successors[0] = stream_out_block; + orig_end_block->successors[1] = new_end_block; + stream_out_block->successors[0] = new_end_block; + + /* setup 'if (vtxcnt < maxvtxcnt)' condition: */ + cond = ir3_CMPS_S(ctx->block, vtxcnt, 0, maxvtxcnt, 0); + cond->regs[0]->num = regid(REG_P0, 0); + cond->cat2.condition = IR3_COND_LT; + + /* condition goes on previous block to the conditional, + * since it is used to pick which of the two successor + * paths to take: + */ + orig_end_block->condition = cond; + + /* switch to stream_out_block to generate the stream-out + * instructions: + */ + ctx->block = stream_out_block; + + /* Calculate base addresses based on vtxcnt. Instructions + * generated for bases not used in following loop will be + * stripped out in the backend. 
+ */ + for (unsigned i = 0; i < PIPE_MAX_SO_BUFFERS; i++) { + unsigned stride = strmout->stride[i]; + struct ir3_instruction *base, *off; + + base = create_uniform(ctx, regid(v->first_driver_param + 5, i)); + + /* 24-bit should be enough: */ + off = ir3_MUL_U(ctx->block, vtxcnt, 0, + create_immed(ctx->block, stride * 4), 0); + + bases[i] = ir3_ADD_S(ctx->block, off, 0, base, 0); + } + + /* Generate the per-output store instructions: */ + for (unsigned i = 0; i < strmout->num_outputs; i++) { + for (unsigned j = 0; j < strmout->output[i].num_components; j++) { + unsigned c = j + strmout->output[i].start_component; + struct ir3_instruction *base, *out, *stg; + + base = bases[strmout->output[i].output_buffer]; + out = ctx->ir->outputs[regid(strmout->output[i].register_index, c)]; + + stg = ir3_STG(ctx->block, base, 0, out, 0, + create_immed(ctx->block, 1), 0); + stg->cat6.type = TYPE_U32; + stg->cat6.dst_offset = (strmout->output[i].dst_offset + j) * 4; + + array_insert(ctx->ir->keeps, stg); + } + } + + /* and finally switch to the new_end_block: */ + ctx->block = new_end_block; +} + static void emit_function(struct ir3_compile *ctx, nir_function_impl *impl) { @@ -1960,6 +2094,24 @@ emit_function(struct ir3_compile *ctx, nir_function_impl *impl) * into which we emit the 'end' instruction. */ compile_assert(ctx, list_empty(&ctx->block->instr_list)); + + /* If stream-out (aka transform-feedback) enabled, emit the + * stream-out instructions, followed by a new empty block (into + * which the 'end' instruction lands). + * + * NOTE: it is done in this order, rather than inserting before + * we emit end_block, because NIR guarantees that all blocks + * flow into end_block, and that end_block has no successors. + * So by re-purposing end_block as the first block of stream- + * out, we guarantee that all exit paths flow into the stream- + * out instructions. + */ + if ((ctx->so->shader->stream_output.num_outputs > 0) && + !ctx->so->key.binning_pass) { + debug_assert(ctx->so->type == SHADER_VERTEX); + emit_stream_out(ctx); + } + ir3_END(ctx->block); } @@ -1974,7 +2126,7 @@ setup_input(struct ir3_compile *ctx, nir_variable *in) unsigned semantic_index = in->data.index; unsigned n = in->data.driver_location; - DBG("; in: %u:%u, len=%ux%u, loc=%u\n", + DBG("; in: %u:%u, len=%ux%u, loc=%u", semantic_name, semantic_index, array_len, ncomp, n); @@ -2045,7 +2197,7 @@ setup_input(struct ir3_compile *ctx, nir_variable *in) so->inputs[n].inloc + i - 8, use_ldlv); } } else { - instr = create_input(ctx->block, NULL, idx); + instr = create_input(ctx->block, idx); } ctx->ir->inputs[idx] = instr; @@ -2069,7 +2221,7 @@ setup_output(struct ir3_compile *ctx, nir_variable *out) unsigned n = out->data.driver_location; unsigned comp = 0; - DBG("; out: %u:%u, len=%ux%u, loc=%u\n", + DBG("; out: %u:%u, len=%ux%u, loc=%u", semantic_name, semantic_index, array_len, ncomp, n); @@ -2098,6 +2250,10 @@ setup_output(struct ir3_compile *ctx, nir_variable *out) so->writes_pos = true; break; case TGSI_SEMANTIC_COLOR: + if (semantic_index == -1) { + semantic_index = 0; + so->color0_mrt = 1; + } break; default: compile_error(ctx, "unknown FS semantic name: %s\n", @@ -2136,13 +2292,9 @@ emit_instructions(struct ir3_compile *ctx) ninputs = exec_list_length(&ctx->s->inputs) * 4; noutputs = exec_list_length(&ctx->s->outputs) * 4; - /* we need to allocate big enough outputs array so that - * we can stuff the kill's at the end. 
Likewise for vtx - shaders, we need to leave room for sysvals: + /* For vtx shaders, we need to leave room for sysvals: */ - if (ctx->so->type == SHADER_FRAGMENT) { - noutputs += ARRAY_SIZE(ctx->kill); - } else if (ctx->so->type == SHADER_VERTEX) { + if (ctx->so->type == SHADER_VERTEX) { ninputs += 8; } @@ -2153,9 +2305,7 @@ emit_instructions(struct ir3_compile *ctx) ctx->in_block = ctx->block; list_addtail(&ctx->block->node, &ctx->ir->block_list); - if (ctx->so->type == SHADER_FRAGMENT) { - ctx->ir->noutputs -= ARRAY_SIZE(ctx->kill); - } else if (ctx->so->type == SHADER_VERTEX) { + if (ctx->so->type == SHADER_VERTEX) { ctx->ir->ninputs -= 8; } @@ -2254,13 +2404,13 @@ fixup_frag_inputs(struct ir3_compile *ctx) so->pos_regid = regid; /* r0.x */ - instr = create_input(ctx->in_block, NULL, ir->ninputs); + instr = create_input(ctx->in_block, ir->ninputs); instr->regs[0]->num = regid++; inputs[ir->ninputs++] = instr; ctx->frag_pos->regs[1]->instr = instr; /* r0.y */ - instr = create_input(ctx->in_block, NULL, ir->ninputs); + instr = create_input(ctx->in_block, ir->ninputs); instr->regs[0]->num = regid++; inputs[ir->ninputs++] = instr; ctx->frag_pos->regs[2]->instr = instr; @@ -2270,9 +2420,7 @@ fixup_frag_inputs(struct ir3_compile *ctx) int ir3_compile_shader_nir(struct ir3_compiler *compiler, - struct ir3_shader_variant *so, - const struct tgsi_token *tokens, - struct ir3_shader_key key) + struct ir3_shader_variant *so) { struct ir3_compile *ctx; struct ir3 *ir; @@ -2282,7 +2430,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, assert(!so->ir); - ctx = compile_init(compiler, so, tokens); + ctx = compile_init(compiler, so, so->shader->tokens); if (!ctx) { DBG("INIT failed!"); ret = -1; @@ -2307,7 +2455,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, fixup_frag_inputs(ctx); /* at this point, for binning pass, throw away unneeded outputs: */ - if (key.binning_pass) { + if (so->key.binning_pass) { for (i = 0, j = 0; i < so->outputs_count; i++) { unsigned name = sem2name(so->outputs[i].semantic); unsigned idx = sem2idx(so->outputs[i].semantic); @@ -2332,7 +2480,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, /* if we want half-precision outputs, mark the output registers * as half: */ - if (key.half_precision) { + if (so->key.half_precision) { for (i = 0; i < ir->noutputs; i++) { struct ir3_instruction *out = ir->outputs[i]; if (!out) @@ -2353,15 +2501,6 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, } } - /* at this point, we want the kill's in the outputs array too, * so that they get scheduled (since they have no dst).. 
we've - * already ensured that the array is big enough in push_block(): - */ - if (so->type == SHADER_FRAGMENT) { - for (i = 0; i < ctx->kill_count; i++) - ir->outputs[ir->noutputs++] = ctx->kill[i]; - } - if (fd_mesa_debug & FD_DBG_OPTMSGS) { printf("BEFORE CP:\n"); ir3_print(ir); diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c index 8c7c80f7aae..be4e4e81109 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c @@ -291,7 +291,7 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n) instr->regs[n+1] = src_reg; if (src_reg->flags & IR3_REG_RELATIV) - instr->address = reg->instr->address; + ir3_instr_set_address(instr, reg->instr->address); return; } @@ -300,7 +300,7 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n) !conflicts(instr->address, reg->instr->address)) { src_reg->flags = new_flags; instr->regs[n+1] = src_reg; - instr->address = reg->instr->address; + ir3_instr_set_address(instr, reg->instr->address); return; } @@ -389,7 +389,7 @@ instr_cp(struct ir3_instruction *instr, unsigned *flags) } if (instr->address) - instr->address = instr_cp(instr->address, NULL); + ir3_instr_set_address(instr, instr_cp(instr->address, NULL)); return instr; } @@ -408,6 +408,10 @@ ir3_cp(struct ir3 *ir) } } + for (unsigned i = 0; i < ir->keeps_count; i++) { + ir->keeps[i] = instr_cp(ir->keeps[i], NULL); + } + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { if (block->condition) block->condition = instr_cp(block->condition, NULL); diff --git a/src/gallium/drivers/freedreno/ir3/ir3_depth.c b/src/gallium/drivers/freedreno/ir3/ir3_depth.c index 3a108243479..97df0c2ac99 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_depth.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_depth.c @@ -156,6 +156,9 @@ ir3_depth(struct ir3 *ir) if (ir->outputs[i]) ir3_instr_depth(ir->outputs[i]); + for (i = 0; i < ir->keeps_count; i++) + ir3_instr_depth(ir->keeps[i]); + /* We also need to account for if-condition: */ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { if (block->condition) @@ -167,6 +170,15 @@ ir3_depth(struct ir3 *ir) remove_unused_by_block(block); } + /* note that we can end up with unused indirects, but we should + * not end up with unused predicates. 
+ */ + for (i = 0; i < ir->indirects_count; i++) { + struct ir3_instruction *instr = ir->indirects[i]; + if (instr->depth == DEPTH_UNUSED) + ir->indirects[i] = NULL; + } + /* cleanup unused inputs: */ for (i = 0; i < ir->ninputs; i++) { struct ir3_instruction *in = ir->inputs[i]; diff --git a/src/gallium/drivers/freedreno/ir3/ir3_group.c b/src/gallium/drivers/freedreno/ir3/ir3_group.c index 70d9b08e019..ca28aefd502 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_group.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_group.c @@ -236,6 +236,11 @@ find_neighbors(struct ir3 *ir) instr_find_neighbors(instr); } } + + for (i = 0; i < ir->keeps_count; i++) { + struct ir3_instruction *instr = ir->keeps[i]; + instr_find_neighbors(instr); + } } void diff --git a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c index f4a4223ae17..e94293f6d6b 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c @@ -182,14 +182,14 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) */ ctx->has_samp = true; regmask_set(&needs_sy, n->regs[0]); - } else if (is_mem(n)) { + } else if (is_load(n)) { regmask_set(&needs_sy, n->regs[0]); } /* both tex/sfu appear to not always immediately consume * their src register(s): */ - if (is_tex(n) || is_sfu(n) || is_mem(n)) { + if (is_tex(n) || is_sfu(n) || is_load(n)) { foreach_src(reg, n) { if (reg_gpr(reg)) regmask_set(&needs_ss_war, reg); diff --git a/src/gallium/drivers/freedreno/ir3/ir3_print.c b/src/gallium/drivers/freedreno/ir3/ir3_print.c index f377982dd5e..07e03d26908 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_print.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_print.c @@ -175,6 +175,20 @@ print_instr(struct ir3_instruction *instr, int lvl) printf("]"); } + if (instr->cp.left) { + printf(", left=_"); + printf("["); + print_instr_name(instr->cp.left); + printf("]"); + } + + if (instr->cp.right) { + printf(", right=_"); + printf("["); + print_instr_name(instr->cp.right); + printf("]"); + } + if (is_meta(instr)) { if (instr->opc == OPC_META_FO) { printf(", off=%d", instr->fo.off); diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c index e5aba859fab..eaf3b3c35e8 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c @@ -241,6 +241,21 @@ ir3_ra_alloc_reg_set(void *memctx) return set; } +/* additional block-data (per-block) */ +struct ir3_ra_block_data { + BITSET_WORD *def; /* variables defined before used in block */ + BITSET_WORD *use; /* variables used before defined in block */ + BITSET_WORD *livein; /* which defs reach entry point of block */ + BITSET_WORD *liveout; /* which defs reach exit point of block */ +}; + +/* additional instruction-data (per-instruction) */ +struct ir3_ra_instr_data { + /* cached instruction 'definer' info: */ + struct ir3_instruction *defn; + int off, sz, cls; +}; + /* register-assign context, per-shader */ struct ir3_ra_ctx { struct ir3 *ir; @@ -254,14 +269,7 @@ struct ir3_ra_ctx { unsigned class_base[total_class_count]; unsigned instr_cnt; unsigned *def, *use; /* def/use table */ -}; - -/* additional block-data (per-block) */ -struct ir3_ra_block_data { - BITSET_WORD *def; /* variables defined before used in block */ - BITSET_WORD *use; /* variables used before defined in block */ - BITSET_WORD *livein; /* which defs reach entry point of block */ - BITSET_WORD *liveout; /* which defs reach exit point of block 
*/ + struct ir3_ra_instr_data *instrd; }; static bool @@ -291,8 +299,6 @@ is_temp(struct ir3_register *reg) { if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED)) return false; - if (reg->flags & IR3_REG_RELATIV) // TODO - return false; if ((reg->num == regid(REG_A0, 0)) || (reg->num == regid(REG_P0, 0))) return false; @@ -309,28 +315,45 @@ writes_gpr(struct ir3_instruction *instr) } static struct ir3_instruction * -get_definer(struct ir3_instruction *instr, int *sz, int *off) +get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr, + int *sz, int *off) { + struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; struct ir3_instruction *d = NULL; + + if (instr->fanin) + return get_definer(ctx, instr->fanin, sz, off); + + if (id->defn) { + *sz = id->sz; + *off = id->off; + return id->defn; + } + if (is_meta(instr) && (instr->opc == OPC_META_FI)) { /* What about the case where collect is subset of array, we * need to find the distance between where actual array starts * and fanin.. that probably doesn't happen currently. */ struct ir3_register *src; + int dsz, doff; /* note: don't use foreach_ssa_src as this gets called once * while assigning regs (which clears SSA flag) */ - foreach_src(src, instr) { + foreach_src_n(src, n, instr) { + struct ir3_instruction *dd; if (!src->instr) continue; - if ((!d) || (src->instr->ip < d->ip)) - d = src->instr; - } - *sz = instr->regs_count - 1; - *off = 0; + dd = get_definer(ctx, src->instr, &dsz, &doff); + + if ((!d) || (dd->ip < d->ip)) { + d = dd; + *sz = dsz; + *off = doff - n; + } + } } else if (instr->cp.right || instr->cp.left) { /* covers also the meta:fo case, which ends up w/ single @@ -386,7 +409,7 @@ get_definer(struct ir3_instruction *instr, int *sz, int *off) struct ir3_instruction *dd; int dsz, doff; - dd = get_definer(phi, &dsz, &doff); + dd = get_definer(ctx, phi, &dsz, &doff); *sz = MAX2(*sz, dsz); *off = doff; @@ -401,6 +424,7 @@ get_definer(struct ir3_instruction *instr, int *sz, int *off) * the phi, so we don't need to chase definers */ struct ir3_register *src; + struct ir3_instruction *dd = d; /* note: don't use foreach_ssa_src as this gets called once * while assigning regs (which clears SSA flag) @@ -408,16 +432,18 @@ get_definer(struct ir3_instruction *instr, int *sz, int *off) foreach_src(src, d) { if (!src->instr) continue; - if (src->instr->ip < d->ip) - d = src->instr; + if (src->instr->ip < dd->ip) + dd = src->instr; } + + d = dd; } if (is_meta(d) && (d->opc == OPC_META_FO)) { struct ir3_instruction *dd; int dsz, doff; - dd = get_definer(d->regs[1]->instr, &dsz, &doff); + dd = get_definer(ctx, d->regs[1]->instr, &dsz, &doff); /* by definition, should come before: */ debug_assert(dd->ip < d->ip); @@ -429,9 +455,30 @@ get_definer(struct ir3_instruction *instr, int *sz, int *off) d = dd; } + id->defn = d; + id->sz = *sz; + id->off = *off; + return d; } +static void +ra_block_find_definers(struct ir3_ra_ctx *ctx, struct ir3_block *block) +{ + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; + if (instr->regs_count == 0) + continue; + /* couple special cases: */ + if (writes_addr(instr) || writes_pred(instr)) { + id->cls = -1; + continue; + } + id->defn = get_definer(ctx, instr, &id->sz, &id->off); + id->cls = size_to_class(id->sz, is_half(id->defn)); + } +} + /* give each instruction a name (and ip), and count up the # of names * of each class */ @@ -439,8 +486,11 @@ static void ra_block_name_instructions(struct ir3_ra_ctx *ctx, 
struct ir3_block *block) { list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { - struct ir3_instruction *defn; - int cls, sz, off; + struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; + +#ifdef DEBUG + instr->name = ~0; +#endif ctx->instr_cnt++; @@ -450,9 +500,7 @@ ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block) if (!writes_gpr(instr)) continue; - defn = get_definer(instr, &sz, &off); - - if (defn != instr) + if (id->defn != instr) continue; /* arrays which don't fit in one of the pre-defined class @@ -460,9 +508,8 @@ ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block) * * TODO but we still need to allocate names for them, don't we?? */ - cls = size_to_class(sz, is_half(defn)); - if (cls >= 0) { - instr->name = ctx->class_alloc_count[cls]++; + if (id->cls >= 0) { + instr->name = ctx->class_alloc_count[id->cls]++; ctx->alloc_count++; } } @@ -471,8 +518,16 @@ ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block) static void ra_init(struct ir3_ra_ctx *ctx) { + unsigned n; + ir3_clear_mark(ctx->ir); - ir3_count_instructions(ctx->ir); + n = ir3_count_instructions(ctx->ir); + + ctx->instrd = rzalloc_array(NULL, struct ir3_ra_instr_data, n); + + list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) { + ra_block_find_definers(ctx, block); + } list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) { ra_block_name_instructions(ctx, block); @@ -488,6 +543,7 @@ ra_init(struct ir3_ra_ctx *ctx) } ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count); + ralloc_steal(ctx->g, ctx->instrd); ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count); ctx->use = rzalloc_array(ctx->g, unsigned, ctx->alloc_count); } @@ -555,39 +611,36 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block) */ if (writes_gpr(instr)) { - struct ir3_instruction *defn; - int cls, sz, off; + struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; - defn = get_definer(instr, &sz, &off); - if (defn == instr) { + if (id->defn == instr) { /* arrays which don't fit in one of the pre-defined class * sizes are pre-colored: */ - cls = size_to_class(sz, is_half(defn)); - if (cls >= 0) { - unsigned name = ra_name(ctx, cls, defn); + if (id->cls >= 0) { + unsigned name = ra_name(ctx, id->cls, id->defn); - ctx->def[name] = defn->ip; - ctx->use[name] = defn->ip; + ctx->def[name] = id->defn->ip; + ctx->use[name] = id->defn->ip; /* since we are in SSA at this point: */ debug_assert(!BITSET_TEST(bd->use, name)); BITSET_SET(bd->def, name); - if (is_half(defn)) { + if (is_half(id->defn)) { ra_set_node_class(ctx->g, name, - ctx->set->half_classes[cls - class_count]); + ctx->set->half_classes[id->cls - class_count]); } else { ra_set_node_class(ctx->g, name, - ctx->set->classes[cls]); + ctx->set->classes[id->cls]); } /* extend the live range for phi srcs, which may come * from the bottom of the loop */ - if (defn->regs[0]->flags & IR3_REG_PHI_SRC) { - struct ir3_instruction *phi = defn->regs[0]->instr; + if (id->defn->regs[0]->flags & IR3_REG_PHI_SRC) { + struct ir3_instruction *phi = id->defn->regs[0]->instr; foreach_ssa_src(src, phi) { /* if src is after phi, then we need to extend * the liverange to the end of src's block: @@ -606,13 +659,10 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block) foreach_ssa_src(src, instr) { if (writes_gpr(src)) { - struct ir3_instruction *srcdefn; - int cls, sz, off; + struct ir3_ra_instr_data *id = 
&ctx->instrd[src->ip]; - srcdefn = get_definer(src, &sz, &off); - cls = size_to_class(sz, is_half(srcdefn)); - if (cls >= 0) { - unsigned name = ra_name(ctx, cls, srcdefn); + if (id->cls >= 0) { + unsigned name = ra_name(ctx, id->cls, id->defn); ctx->use[name] = MAX2(ctx->use[name], instr->ip); if (!BITSET_TEST(bd->def, name)) BITSET_SET(bd->use, name); @@ -704,13 +754,10 @@ ra_add_interference(struct ir3_ra_ctx *ctx) /* need to fix things up to keep outputs live: */ for (unsigned i = 0; i < ir->noutputs; i++) { struct ir3_instruction *instr = ir->outputs[i]; - struct ir3_instruction *defn; - int cls, sz, off; + struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; - defn = get_definer(instr, &sz, &off); - cls = size_to_class(sz, is_half(defn)); - if (cls >= 0) { - unsigned name = ra_name(ctx, cls, defn); + if (id->cls >= 0) { + unsigned name = ra_name(ctx, id->cls, id->defn); ctx->use[name] = ctx->instr_cnt; } } @@ -780,15 +827,12 @@ static void reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg, struct ir3_instruction *instr) { - struct ir3_instruction *defn; - int cls, sz, off; + struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; - defn = get_definer(instr, &sz, &off); - cls = size_to_class(sz, is_half(defn)); - if (cls >= 0) { - unsigned name = ra_name(ctx, cls, defn); + if (id->cls >= 0) { + unsigned name = ra_name(ctx, id->cls, id->defn); unsigned r = ra_get_node_reg(ctx->g, name); - unsigned num = ctx->set->ra_reg_to_gpr[r] + off; + unsigned num = ctx->set->ra_reg_to_gpr[r] + id->off; if (reg->flags & IR3_REG_RELATIV) num += reg->offset; @@ -796,7 +840,7 @@ reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg, reg->num = num; reg->flags &= ~(IR3_REG_SSA | IR3_REG_PHI_SRC); - if (is_half(defn)) + if (is_half(id->defn)) reg->flags |= IR3_REG_HALF; } } @@ -851,19 +895,16 @@ ra_alloc(struct ir3_ra_ctx *ctx) for (j = 0; i < ir->ninputs; i++) { struct ir3_instruction *instr = ir->inputs[i]; if (instr) { - struct ir3_instruction *defn; - int cls, sz, off; + struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; - defn = get_definer(instr, &sz, &off); - if (defn == instr) { + if (id->defn == instr) { unsigned name, reg; - cls = size_to_class(sz, is_half(defn)); - name = ra_name(ctx, cls, defn); - reg = ctx->set->gpr_to_ra_reg[cls][j]; + name = ra_name(ctx, id->cls, id->defn); + reg = ctx->set->gpr_to_ra_reg[id->cls][j]; ra_set_node_reg(ctx->g, name, reg); - j += sz; + j += id->sz; } } } diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c index 49a4426d163..2ee325518f7 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_sched.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c @@ -80,12 +80,12 @@ schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) list_delinit(&instr->node); if (writes_addr(instr)) { - assert(ctx->addr == NULL); + debug_assert(ctx->addr == NULL); ctx->addr = instr; } if (writes_pred(instr)) { - assert(ctx->pred == NULL); + debug_assert(ctx->pred == NULL); ctx->pred = instr; } @@ -180,13 +180,13 @@ check_conflict(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, * free: */ if (writes_addr(instr) && ctx->addr) { - assert(ctx->addr != instr); + debug_assert(ctx->addr != instr); notes->addr_conflict = true; return true; } if (writes_pred(instr) && ctx->pred) { - assert(ctx->pred != instr); + debug_assert(ctx->pred != instr); notes->pred_conflict = true; return true; } @@ -261,6 +261,20 @@ instr_eligibility(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, return 0; } 
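[Editor's note: the scheduler change that follows gates address-register writes on whether any consumer would become ready. As a minimal standalone sketch of that readiness rule -- not part of the patch, with simplified stand-in types rather than the real ir3 structures -- consider:]

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Hypothetical, simplified instruction node (not the real ir3_instruction). */
struct insn {
	const char *name;
	bool scheduled;
	struct insn *srcs[4];   /* up to 3 SSA sources, NULL-terminated */
	struct insn *address;   /* addr-register dependency, if any */
};

/* Could 'insn' be scheduled if 'src' were scheduled, i.e. are all of its
 * *other* sources already placed?  This mirrors the intent of the
 * could_sched() helper added below. */
static bool could_sched(const struct insn *insn, const struct insn *src)
{
	for (size_t i = 0; insn->srcs[i]; i++)
		if (insn->srcs[i] != src && !insn->srcs[i]->scheduled)
			return false;
	return true;
}

/* Is scheduling the addr-writer 'aw' useful right now, i.e. does it
 * immediately unblock at least one unscheduled indirect that uses it? */
static bool addr_write_useful(struct insn **indirects, size_t n,
		struct insn *aw)
{
	for (size_t i = 0; i < n; i++)
		if (indirects[i] && !indirects[i]->scheduled &&
				indirects[i]->address == aw &&
				could_sched(indirects[i], aw))
			return true;
	return false;
}

int main(void)
{
	struct insn imm   = { "imm",      true,  { NULL },         NULL };
	struct insn mova  = { "mova",     false, { &imm, NULL },   NULL };
	struct insn other = { "other",    false, { NULL },         NULL };
	struct insn ind   = { "mov(rel)", false, { &other, NULL }, &mova };
	struct insn *indirects[] = { &ind };

	/* 'other' is not scheduled yet, so writing a0 now would be wasted: */
	printf("useful: %d\n", addr_write_useful(indirects, 1, &mova));
	other.scheduled = true;
	printf("useful: %d\n", addr_write_useful(indirects, 1, &mova));
	return 0;
}

[Deferring the address write until a consumer is otherwise ready matters because there is only one address register; scheduling it early would pin a0 while no indirect could consume it, which is exactly the deadlock that split_addr() later breaks by cloning the writer. End of editor's note; the patch resumes below.]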
+/* could an instruction be scheduled if specified ssa src was scheduled? */ +static bool +could_sched(struct ir3_instruction *instr, struct ir3_instruction *src) +{ + struct ir3_instruction *other_src; + foreach_ssa_src(other_src, instr) { + /* if dependency not scheduled, we aren't ready yet: */ + if ((src != other_src) && !is_scheduled(other_src)) { + return false; + } + } + return true; +} + /* move eligible instructions to the priority list: */ static unsigned add_eligible_instrs(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, @@ -272,6 +286,31 @@ add_eligible_instrs(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, int e = instr_eligibility(ctx, notes, instr); if (e < 0) continue; + + /* For instructions that write address register we need to + * make sure there is at least one instruction that uses the + * addr value which is otherwise ready. + * + * TODO if any instructions use pred register and have other + * src args, we would need to do the same for writes_pred().. + */ + if (unlikely(writes_addr(instr))) { + struct ir3 *ir = instr->block->shader; + bool ready = false; + for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) { + struct ir3_instruction *indirect = ir->indirects[i]; + if (!indirect) + continue; + if (indirect->address != instr) + continue; + ready = could_sched(indirect, instr); + } + + /* nothing could be scheduled, so keep looking: */ + if (!ready) + continue; + } + min_delay = MIN2(min_delay, e); if (e == 0) { /* remove from unscheduled list and into priority queue: */ @@ -287,20 +326,25 @@ add_eligible_instrs(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, * instructions which depend on the current address register * to a clone of the instruction which wrote the address reg. */ -static void +static struct ir3_instruction * split_addr(struct ir3_sched_ctx *ctx) { - struct ir3 *ir = ctx->addr->block->shader; + struct ir3 *ir; struct ir3_instruction *new_addr = NULL; unsigned i; debug_assert(ctx->addr); + ir = ctx->addr->block->shader; + for (i = 0; i < ir->indirects_count; i++) { struct ir3_instruction *indirect = ir->indirects[i]; + if (!indirect) + continue; + /* skip instructions already scheduled: */ - if (indirect->flags & IR3_INSTR_MARK) + if (is_scheduled(indirect)) continue; /* remap remaining instructions using current addr @@ -312,32 +356,36 @@ split_addr(struct ir3_sched_ctx *ctx) /* original addr is scheduled, but new one isn't: */ new_addr->flags &= ~IR3_INSTR_MARK; } - indirect->address = new_addr; + ir3_instr_set_address(indirect, new_addr); } } /* all remaining indirects remapped to new addr: */ ctx->addr = NULL; + + return new_addr; } /* "spill" the predicate register by remapping any unscheduled * instructions which depend on the current predicate register * to a clone of the instruction which wrote the address reg. 
*/ -static void +static struct ir3_instruction * split_pred(struct ir3_sched_ctx *ctx) { - struct ir3 *ir = ctx->pred->block->shader; + struct ir3 *ir; struct ir3_instruction *new_pred = NULL; unsigned i; debug_assert(ctx->pred); + ir = ctx->pred->block->shader; + for (i = 0; i < ir->predicates_count; i++) { struct ir3_instruction *predicated = ir->predicates[i]; /* skip instructions already scheduled: */ - if (predicated->flags & IR3_INSTR_MARK) + if (is_scheduled(predicated)) continue; /* remap remaining instructions using current pred @@ -358,6 +406,8 @@ split_pred(struct ir3_sched_ctx *ctx) /* all remaining predicated remapped to new pred: */ ctx->pred = NULL; + + return new_pred; } static void @@ -407,20 +457,32 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) schedule(ctx, instr); } else if (delay == ~0) { + struct ir3_instruction *new_instr = NULL; + /* nothing available to schedule.. if we are blocked on * address/predicate register conflict, then break the * deadlock by cloning the instruction that wrote that * reg: */ if (notes.addr_conflict) { - split_addr(ctx); + new_instr = split_addr(ctx); } else if (notes.pred_conflict) { - split_pred(ctx); + new_instr = split_pred(ctx); } else { debug_assert(0); ctx->error = true; return; } + + if (new_instr) { + list_del(&new_instr->node); + list_addtail(&new_instr->node, &unscheduled_list); + /* the original instr that wrote addr/pred may have + * originated from a different block: + */ + new_instr->block = block; + } + } else { /* and if we run out of instructions that can be scheduled, * then it is time for nop's: diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c index b5b038100cc..312174c0c6d 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c @@ -46,7 +46,8 @@ delete_variant(struct ir3_shader_variant *v) { if (v->ir) ir3_destroy(v->ir); - fd_bo_del(v->bo); + if (v->bo) + fd_bo_del(v->bo); free(v); } @@ -139,6 +140,32 @@ assemble_variant(struct ir3_shader_variant *v) memcpy(fd_bo_map(v->bo), bin, sz); + if (fd_mesa_debug & FD_DBG_DISASM) { + struct ir3_shader_key key = v->key; + DBG("disassemble: type=%d, k={bp=%u,cts=%u,hp=%u}", v->type, + key.binning_pass, key.color_two_side, key.half_precision); + ir3_shader_disasm(v, bin); + } + + if (fd_mesa_debug & FD_DBG_SHADERDB) { + /* print generic shader info: */ + fprintf(stderr, "SHADER-DB: %s prog %d/%d: %u instructions, %u dwords\n", + ir3_shader_stage(v->shader), + v->shader->id, v->id, + v->info.instrs_count, + v->info.sizedwords); + fprintf(stderr, "SHADER-DB: %s prog %d/%d: %u half, %u full\n", + ir3_shader_stage(v->shader), + v->shader->id, v->id, + v->info.max_half_reg + 1, + v->info.max_reg + 1); + fprintf(stderr, "SHADER-DB: %s prog %d/%d: %u const, %u constlen\n", + ir3_shader_stage(v->shader), + v->shader->id, v->id, + v->info.max_const + 1, + v->constlen); + } + free(bin); /* no need to keep the ir around beyond this point: */ @@ -150,12 +177,12 @@ static struct ir3_shader_variant * create_variant(struct ir3_shader *shader, struct ir3_shader_key key) { struct ir3_shader_variant *v = CALLOC_STRUCT(ir3_shader_variant); - const struct tgsi_token *tokens = shader->tokens; int ret; if (!v) return NULL; + v->id = ++shader->variant_count; v->shader = shader; v->key = key; v->type = shader->type; @@ -163,10 +190,10 @@ create_variant(struct ir3_shader *shader, struct ir3_shader_key key) if (fd_mesa_debug & FD_DBG_DISASM) { DBG("dump tgsi: type=%d, 
k={bp=%u,cts=%u,hp=%u}", shader->type, key.binning_pass, key.color_two_side, key.half_precision); - tgsi_dump(tokens, 0); + tgsi_dump(shader->tokens, 0); } - ret = ir3_compile_shader_nir(shader->compiler, v, tokens, key); + ret = ir3_compile_shader_nir(shader->compiler, v); if (ret) { debug_error("compile failed!"); goto fail; @@ -178,12 +205,6 @@ create_variant(struct ir3_shader *shader, struct ir3_shader_key key) goto fail; } - if (fd_mesa_debug & FD_DBG_DISASM) { - DBG("disassemble: type=%d, k={bp=%u,cts=%u,hp=%u}", v->type, - key.binning_pass, key.color_two_side, key.half_precision); - disasm_a3xx(fd_bo_map(v->bo), v->info.sizedwords, 0, v->type); - } - return v; fail: @@ -228,8 +249,10 @@ ir3_shader_variant(struct ir3_shader *shader, struct ir3_shader_key key) /* compile new variant if it doesn't exist already: */ v = create_variant(shader, key); - v->next = shader->variants; - shader->variants = v; + if (v) { + v->next = shader->variants; + shader->variants = v; + } return v; } @@ -249,13 +272,372 @@ ir3_shader_destroy(struct ir3_shader *shader) } struct ir3_shader * -ir3_shader_create(struct pipe_context *pctx, const struct tgsi_token *tokens, +ir3_shader_create(struct pipe_context *pctx, + const struct pipe_shader_state *cso, enum shader_t type) { struct ir3_shader *shader = CALLOC_STRUCT(ir3_shader); shader->compiler = fd_context(pctx)->screen->compiler; + shader->id = ++shader->compiler->shader_count; shader->pctx = pctx; shader->type = type; - shader->tokens = tgsi_dup_tokens(tokens); + shader->tokens = tgsi_dup_tokens(cso->tokens); + shader->stream_output = cso->stream_output; + if (fd_mesa_debug & FD_DBG_SHADERDB) { + /* if shader-db run, create a standard variant immediately + * (as otherwise nothing will trigger the shader to be + * actually compiled) + */ + static struct ir3_shader_key key = {}; + ir3_shader_variant(shader, key); + } return shader; } + +static void dump_reg(const char *name, uint32_t r) +{ + if (r != regid(63,0)) + debug_printf("; %s: r%d.%c\n", name, r >> 2, "xyzw"[r & 0x3]); +} + +static void dump_semantic(struct ir3_shader_variant *so, + unsigned sem, const char *name) +{ + uint32_t regid; + regid = ir3_find_output_regid(so, ir3_semantic_name(sem, 0)); + dump_reg(name, regid); +} + +void +ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin) +{ + struct ir3 *ir = so->ir; + struct ir3_register *reg; + const char *type = ir3_shader_stage(so->shader); + uint8_t regid; + unsigned i; + + for (i = 0; i < ir->ninputs; i++) { + if (!ir->inputs[i]) { + debug_printf("; in%d unused\n", i); + continue; + } + reg = ir->inputs[i]->regs[0]; + regid = reg->num; + debug_printf("@in(%sr%d.%c)\tin%d\n", + (reg->flags & IR3_REG_HALF) ? "h" : "", + (regid >> 2), "xyzw"[regid & 0x3], i); + } + + for (i = 0; i < ir->noutputs; i++) { + if (!ir->outputs[i]) { + debug_printf("; out%d unused\n", i); + continue; + } + /* kill shows up as a virtual output.. skip it! */ + if (is_kill(ir->outputs[i])) + continue; + reg = ir->outputs[i]->regs[0]; + regid = reg->num; + debug_printf("@out(%sr%d.%c)\tout%d\n", + (reg->flags & IR3_REG_HALF) ? 
"h" : "", + (regid >> 2), "xyzw"[regid & 0x3], i); + } + + for (i = 0; i < so->immediates_count; i++) { + debug_printf("@const(c%d.x)\t", so->first_immediate + i); + debug_printf("0x%08x, 0x%08x, 0x%08x, 0x%08x\n", + so->immediates[i].val[0], + so->immediates[i].val[1], + so->immediates[i].val[2], + so->immediates[i].val[3]); + } + + disasm_a3xx(bin, so->info.sizedwords, 0, so->type); + + debug_printf("; %s: outputs:", type); + for (i = 0; i < so->outputs_count; i++) { + uint8_t regid = so->outputs[i].regid; + ir3_semantic sem = so->outputs[i].semantic; + debug_printf(" r%d.%c (%u:%u)", + (regid >> 2), "xyzw"[regid & 0x3], + sem2name(sem), sem2idx(sem)); + } + debug_printf("\n"); + debug_printf("; %s: inputs:", type); + for (i = 0; i < so->inputs_count; i++) { + uint8_t regid = so->inputs[i].regid; + ir3_semantic sem = so->inputs[i].semantic; + debug_printf(" r%d.%c (%u:%u,cm=%x,il=%u,b=%u)", + (regid >> 2), "xyzw"[regid & 0x3], + sem2name(sem), sem2idx(sem), + so->inputs[i].compmask, + so->inputs[i].inloc, + so->inputs[i].bary); + } + debug_printf("\n"); + + /* print generic shader info: */ + debug_printf("; %s prog %d/%d: %u instructions, %d half, %d full\n", + type, so->shader->id, so->id, + so->info.instrs_count, + so->info.max_half_reg + 1, + so->info.max_reg + 1); + + debug_printf("; %d const, %u constlen\n", + so->info.max_const + 1, + so->constlen); + + /* print shader type specific info: */ + switch (so->type) { + case SHADER_VERTEX: + dump_semantic(so, TGSI_SEMANTIC_POSITION, "pos"); + dump_semantic(so, TGSI_SEMANTIC_PSIZE, "psize"); + break; + case SHADER_FRAGMENT: + dump_reg("pos (bary)", so->pos_regid); + dump_semantic(so, TGSI_SEMANTIC_POSITION, "posz"); + dump_semantic(so, TGSI_SEMANTIC_COLOR, "color"); + /* these two are hard-coded since we don't know how to + * program them to anything but all 0's... + */ + if (so->frag_coord) + debug_printf("; fragcoord: r0.x\n"); + if (so->frag_face) + debug_printf("; fragface: hr0.x\n"); + break; + case SHADER_COMPUTE: + break; + } + + debug_printf("\n"); +} + +/* This has to reach into the fd_context a bit more than the rest of + * ir3, but it needs to be aligned with the compiler, so both agree + * on which const regs hold what. And the logic is identical between + * a3xx/a4xx, the only difference is small details in the actual + * CP_LOAD_STATE packets (which is handled inside the generation + * specific ctx->emit_const(_bo)() fxns) + */ + +#include "freedreno_resource.h" + +static void +emit_user_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring, + struct fd_constbuf_stateobj *constbuf) +{ + struct fd_context *ctx = fd_context(v->shader->pctx); + const unsigned index = 0; /* user consts are index 0 */ + /* TODO save/restore dirty_mask for binning pass instead: */ + uint32_t dirty_mask = constbuf->enabled_mask; + + if (dirty_mask & (1 << index)) { + struct pipe_constant_buffer *cb = &constbuf->cb[index]; + unsigned size = align(cb->buffer_size, 4) / 4; /* size in dwords */ + + /* in particular, with binning shader we may end up with + * unused consts, ie. we could end up w/ constlen that is + * smaller than first_driver_param. 
In that case truncate + * the user consts early to avoid HLSQ lockup caused by + * writing too many consts + */ + uint32_t max_const = MIN2(v->first_driver_param, v->constlen); + + // I expect that size should be a multiple of vec4's: + assert(size == align(size, 4)); + + /* and even if the start of the const buffer is before + * first_immediate, the end may not be: + */ + size = MIN2(size, 4 * max_const); + + if (size > 0) { + fd_wfi(ctx, ring); + ctx->emit_const(ring, v->type, 0, + cb->buffer_offset, size, + cb->user_buffer, cb->buffer); + constbuf->dirty_mask &= ~(1 << index); + } + } +} + +static void +emit_ubos(struct ir3_shader_variant *v, struct fd_ringbuffer *ring, + struct fd_constbuf_stateobj *constbuf) +{ + uint32_t offset = v->first_driver_param; /* UBOs after user consts */ + if (v->constlen > offset) { + struct fd_context *ctx = fd_context(v->shader->pctx); + uint32_t params = MIN2(4, v->constlen - offset) * 4; + uint32_t offsets[params]; + struct fd_bo *bos[params]; + + for (uint32_t i = 0; i < params; i++) { + const uint32_t index = i + 1; /* UBOs start at index 1 */ + struct pipe_constant_buffer *cb = &constbuf->cb[index]; + assert(!cb->user_buffer); + + if ((constbuf->enabled_mask & (1 << index)) && cb->buffer) { + offsets[i] = cb->buffer_offset; + bos[i] = fd_resource(cb->buffer)->bo; + } else { + offsets[i] = 0; + bos[i] = NULL; + } + } + + fd_wfi(ctx, ring); + ctx->emit_const_bo(ring, v->type, false, offset * 4, params, bos, offsets); + } +} + +static void +emit_immediates(struct ir3_shader_variant *v, struct fd_ringbuffer *ring) +{ + struct fd_context *ctx = fd_context(v->shader->pctx); + int size = v->immediates_count; + uint32_t base = v->first_immediate; + + /* truncate size to avoid writing constants that shader + * does not use: + */ + size = MIN2(size + base, v->constlen) - base; + + /* convert out of vec4: */ + base *= 4; + size *= 4; + + if (size > 0) { + fd_wfi(ctx, ring); + ctx->emit_const(ring, v->type, base, + 0, size, v->immediates[0].val, NULL); + } +} + +/* emit stream-out buffers: */ +static void +emit_tfbos(struct ir3_shader_variant *v, struct fd_ringbuffer *ring) +{ + uint32_t offset = v->first_driver_param + 5; /* streamout addresses after driver-params*/ + if (v->constlen > offset) { + struct fd_context *ctx = fd_context(v->shader->pctx); + struct fd_streamout_stateobj *so = &ctx->streamout; + struct pipe_stream_output_info *info = &v->shader->stream_output; + uint32_t params = 4; + uint32_t offsets[params]; + struct fd_bo *bos[params]; + + for (uint32_t i = 0; i < params; i++) { + struct pipe_stream_output_target *target = so->targets[i]; + + if (target) { + offsets[i] = (so->offsets[i] * info->stride[i] * 4) + + target->buffer_offset; + bos[i] = fd_resource(target->buffer)->bo; + } else { + offsets[i] = 0; + bos[i] = NULL; + } + } + + fd_wfi(ctx, ring); + ctx->emit_const_bo(ring, v->type, true, offset * 4, params, bos, offsets); + } +} + +static uint32_t +max_tf_vtx(struct ir3_shader_variant *v) +{ + struct fd_context *ctx = fd_context(v->shader->pctx); + struct fd_streamout_stateobj *so = &ctx->streamout; + struct pipe_stream_output_info *info = &v->shader->stream_output; + uint32_t maxvtxcnt = 0x7fffffff; + + if (v->key.binning_pass) + return 0; + if (v->shader->stream_output.num_outputs == 0) + return 0; + if (so->num_targets == 0) + return 0; + + /* offset to write to is: + * + * total_vtxcnt = vtxcnt + offsets[i] + * offset = total_vtxcnt * stride[i] + * + * offset = vtxcnt * stride[i] ; calculated in shader + * + offsets[i] * stride[i] ; 
calculated at emit_tfbos() + * + * assuming for each vtx, each target buffer will have data written + * up to 'offset + stride[i]', that leaves maxvtxcnt as: + * + * buffer_size = (maxvtxcnt * stride[i]) + stride[i] + * maxvtxcnt = (buffer_size - stride[i]) / stride[i] + * + * but shader is actually doing a less-than (rather than less-than- + * equal) check, so we can drop the -stride[i]. + * + * TODO is assumption about `offset + stride[i]` legit? + */ + for (unsigned i = 0; i < so->num_targets; i++) { + struct pipe_stream_output_target *target = so->targets[i]; + unsigned stride = info->stride[i] * 4; /* convert dwords->bytes */ + if (target) { + uint32_t max = target->buffer_size / stride; + maxvtxcnt = MIN2(maxvtxcnt, max); + } + } + + return maxvtxcnt; +} + +void +ir3_emit_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring, + const struct pipe_draw_info *info, uint32_t dirty) +{ + struct fd_context *ctx = fd_context(v->shader->pctx); + + if (dirty & (FD_DIRTY_PROG | FD_DIRTY_CONSTBUF)) { + struct fd_constbuf_stateobj *constbuf; + bool shader_dirty; + + if (v->type == SHADER_VERTEX) { + constbuf = &ctx->constbuf[PIPE_SHADER_VERTEX]; + shader_dirty = !!(ctx->prog.dirty & FD_SHADER_DIRTY_VP); + } else if (v->type == SHADER_FRAGMENT) { + constbuf = &ctx->constbuf[PIPE_SHADER_FRAGMENT]; + shader_dirty = !!(ctx->prog.dirty & FD_SHADER_DIRTY_FP); + } else { + unreachable("bad shader type"); + return; + } + + emit_user_consts(v, ring, constbuf); + emit_ubos(v, ring, constbuf); + if (shader_dirty) + emit_immediates(v, ring); + } + + /* emit driver params every time: */ + /* TODO skip emit if shader doesn't use driver params to avoid WFI.. */ + if (info && (v->type == SHADER_VERTEX)) { + uint32_t offset = v->first_driver_param + 4; /* driver params after UBOs */ + if (v->constlen >= offset) { + uint32_t vertex_params[4] = { + [IR3_DP_VTXID_BASE] = info->indexed ? 
+ info->index_bias : info->start, + [IR3_DP_VTXCNT_MAX] = max_tf_vtx(v), + }; + + fd_wfi(ctx, ring); + ctx->emit_const(ring, SHADER_VERTEX, offset * 4, 0, + ARRAY_SIZE(vertex_params), vertex_params, NULL); + + /* if needed, emit stream-out buffer addresses: */ + if (vertex_params[IR3_DP_VTXCNT_MAX] > 0) { + emit_tfbos(v, ring); + } + } + } +} diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h index 9f1b0769180..1bbbdbd224d 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h +++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h @@ -29,9 +29,22 @@ #ifndef IR3_SHADER_H_ #define IR3_SHADER_H_ +#include "pipe/p_state.h" + #include "ir3.h" #include "disasm.h" +/* driver param indices: */ +enum ir3_driver_param { + IR3_DP_VTXID_BASE = 0, + IR3_DP_VTXCNT_MAX = 1, +}; + +/* internal semantic used for passing vtxcnt to vertex shader to + * implement transform feedback: + */ +#define IR3_SEMANTIC_VTXCNT (TGSI_SEMANTIC_COUNT + 0) + typedef uint16_t ir3_semantic; /* semantic name + index */ static inline ir3_semantic ir3_semantic_name(uint8_t name, uint16_t index) @@ -100,6 +113,9 @@ ir3_shader_key_equal(struct ir3_shader_key *a, struct ir3_shader_key *b) struct ir3_shader_variant { struct fd_bo *bo; + /* variant id (for debug) */ + uint32_t id; + struct ir3_shader_key key; struct ir3_info info; @@ -192,26 +208,44 @@ struct ir3_shader_variant { struct ir3_shader { enum shader_t type; + /* shader id (for debug): */ + uint32_t id; + uint32_t variant_count; + struct ir3_compiler *compiler; struct pipe_context *pctx; const struct tgsi_token *tokens; + struct pipe_stream_output_info stream_output; struct ir3_shader_variant *variants; - - /* so far, only used for blit_prog shader.. values for - * VPC_VARYING_PS_REPL[i].MODE - */ - uint32_t vpsrepl[8]; }; void * ir3_shader_assemble(struct ir3_shader_variant *v, uint32_t gpu_id); struct ir3_shader * ir3_shader_create(struct pipe_context *pctx, - const struct tgsi_token *tokens, enum shader_t type); + const struct pipe_shader_state *cso, enum shader_t type); void ir3_shader_destroy(struct ir3_shader *shader); struct ir3_shader_variant * ir3_shader_variant(struct ir3_shader *shader, struct ir3_shader_key key); +void ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin); + +struct fd_ringbuffer; +void ir3_emit_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring, + const struct pipe_draw_info *info, uint32_t dirty); + +static inline const char * +ir3_shader_stage(struct ir3_shader *shader) +{ + switch (shader->type) { + case SHADER_VERTEX: return "VERT"; + case SHADER_FRAGMENT: return "FRAG"; + case SHADER_COMPUTE: return "CL"; + default: + unreachable("invalid type"); + return NULL; + } +} /* * Helper/util: diff --git a/src/gallium/drivers/i915/i915_batchbuffer.h b/src/gallium/drivers/i915/i915_batchbuffer.h index dcf63543219..6466fa594f9 100644 --- a/src/gallium/drivers/i915/i915_batchbuffer.h +++ b/src/gallium/drivers/i915/i915_batchbuffer.h @@ -33,20 +33,20 @@ struct i915_context; -static INLINE size_t +static inline size_t i915_winsys_batchbuffer_space(struct i915_winsys_batchbuffer *batch) { return batch->size - (batch->ptr - batch->map); } -static INLINE boolean +static inline boolean i915_winsys_batchbuffer_check(struct i915_winsys_batchbuffer *batch, size_t dwords) { return dwords * 4 <= i915_winsys_batchbuffer_space(batch); } -static INLINE void +static inline void i915_winsys_batchbuffer_dword_unchecked(struct i915_winsys_batchbuffer *batch, unsigned dword) { @@ -54,7 
+54,7 @@ i915_winsys_batchbuffer_dword_unchecked(struct i915_winsys_batchbuffer *batch, batch->ptr += 4; } -static INLINE void +static inline void i915_winsys_batchbuffer_float(struct i915_winsys_batchbuffer *batch, float f) { @@ -64,7 +64,7 @@ i915_winsys_batchbuffer_float(struct i915_winsys_batchbuffer *batch, i915_winsys_batchbuffer_dword_unchecked(batch, uif.ui); } -static INLINE void +static inline void i915_winsys_batchbuffer_dword(struct i915_winsys_batchbuffer *batch, unsigned dword) { @@ -72,7 +72,7 @@ i915_winsys_batchbuffer_dword(struct i915_winsys_batchbuffer *batch, i915_winsys_batchbuffer_dword_unchecked(batch, dword); } -static INLINE void +static inline void i915_winsys_batchbuffer_write(struct i915_winsys_batchbuffer *batch, void *data, size_t size) @@ -83,7 +83,7 @@ i915_winsys_batchbuffer_write(struct i915_winsys_batchbuffer *batch, batch->ptr += size; } -static INLINE boolean +static inline boolean i915_winsys_validate_buffers(struct i915_winsys_batchbuffer *batch, struct i915_winsys_buffer **buffers, int num_of_buffers) @@ -91,7 +91,7 @@ i915_winsys_validate_buffers(struct i915_winsys_batchbuffer *batch, return batch->iws->validate_buffers(batch, buffers, num_of_buffers); } -static INLINE int +static inline int i915_winsys_batchbuffer_reloc(struct i915_winsys_batchbuffer *batch, struct i915_winsys_buffer *buffer, enum i915_winsys_buffer_usage usage, diff --git a/src/gallium/drivers/i915/i915_context.h b/src/gallium/drivers/i915/i915_context.h index 40abf3c577f..c8c7d64f5cb 100644 --- a/src/gallium/drivers/i915/i915_context.h +++ b/src/gallium/drivers/i915/i915_context.h @@ -339,7 +339,7 @@ struct i915_context { #define I915_DST_VARS 4 #define I915_DST_RECT 8 -static INLINE +static inline void i915_set_flush_dirty(struct i915_context *i915, unsigned flush) { i915->hardware_dirty |= I915_HW_FLUSH; @@ -408,7 +408,7 @@ struct pipe_context *i915_create_context(struct pipe_screen *screen, * Inline conversion functions. These are better-typed than the * macros used previously: */ -static INLINE struct i915_context * +static inline struct i915_context * i915_context( struct pipe_context *pipe ) { return (struct i915_context *)pipe; diff --git a/src/gallium/drivers/i915/i915_debug.h b/src/gallium/drivers/i915/i915_debug.h index 079882c811f..0f12a592ae8 100644 --- a/src/gallium/drivers/i915/i915_debug.h +++ b/src/gallium/drivers/i915/i915_debug.h @@ -48,13 +48,13 @@ struct i915_winsys_batchbuffer; extern unsigned i915_debug; #ifdef DEBUG -static INLINE boolean +static inline boolean I915_DBG_ON(unsigned flags) { return i915_debug & flags; } -static INLINE void +static inline void I915_DBG(unsigned flags, const char *fmt, ...) { if (I915_DBG_ON(flags)) { @@ -67,7 +67,7 @@ I915_DBG(unsigned flags, const char *fmt, ...) } #else #define I915_DBG_ON(flags) (0) -static INLINE void I915_DBG(unsigned flags, const char *fmt, ...) {} +static inline void I915_DBG(unsigned flags, const char *fmt, ...) 
{} #endif void i915_debug_init(struct i915_screen *i915); diff --git a/src/gallium/drivers/i915/i915_fpc.h b/src/gallium/drivers/i915/i915_fpc.h index a4dbcb4d271..adc42542fea 100644 --- a/src/gallium/drivers/i915/i915_fpc.h +++ b/src/gallium/drivers/i915/i915_fpc.h @@ -136,7 +136,7 @@ struct i915_fp_compile { /* One neat thing about the UREG representation: */ -static INLINE int +static inline int swizzle(int reg, uint x, uint y, uint z, uint w) { assert(x <= SRC_ONE); diff --git a/src/gallium/drivers/i915/i915_fpc_translate.c b/src/gallium/drivers/i915/i915_fpc_translate.c index 38a33888166..456be9d92ca 100644 --- a/src/gallium/drivers/i915/i915_fpc_translate.c +++ b/src/gallium/drivers/i915/i915_fpc_translate.c @@ -111,7 +111,7 @@ static const float cos_constants[4] = { 1.0, /** * component-wise negation of ureg */ -static INLINE int +static inline int negate(int reg, int x, int y, int z, int w) { /* Another neat thing about the UREG representation */ diff --git a/src/gallium/drivers/i915/i915_prim_emit.c b/src/gallium/drivers/i915/i915_prim_emit.c index 248e21e02da..ea84efd1d17 100644 --- a/src/gallium/drivers/i915/i915_prim_emit.c +++ b/src/gallium/drivers/i915/i915_prim_emit.c @@ -53,7 +53,7 @@ struct setup_stage { /** * Basically a cast wrapper. */ -static INLINE struct setup_stage *setup_stage( struct draw_stage *stage ) +static inline struct setup_stage *setup_stage( struct draw_stage *stage ) { return (struct setup_stage *)stage; } @@ -65,7 +65,7 @@ static INLINE struct setup_stage *setup_stage( struct draw_stage *stage ) * have a couple of slots at the beginning (1-dword header, 4-dword * clip pos) that we ignore here. */ -static INLINE void +static inline void emit_hw_vertex( struct i915_context *i915, const struct vertex_header *vertex) { @@ -124,7 +124,7 @@ emit_hw_vertex( struct i915_context *i915, -static INLINE void +static inline void emit_prim( struct draw_stage *stage, struct prim_header *prim, unsigned hwprim, diff --git a/src/gallium/drivers/i915/i915_prim_vbuf.c b/src/gallium/drivers/i915/i915_prim_vbuf.c index d134dbb1620..8f61f151e0c 100644 --- a/src/gallium/drivers/i915/i915_prim_vbuf.c +++ b/src/gallium/drivers/i915/i915_prim_vbuf.c @@ -96,7 +96,7 @@ struct i915_vbuf_render { /** * Basically a cast wrapper. 
*/ -static INLINE struct i915_vbuf_render * +static inline struct i915_vbuf_render * i915_vbuf_render(struct vbuf_render *render) { assert(render); diff --git a/src/gallium/drivers/i915/i915_resource.h b/src/gallium/drivers/i915/i915_resource.h index ef99cfb5d3c..77fe8b70f79 100644 --- a/src/gallium/drivers/i915/i915_resource.h +++ b/src/gallium/drivers/i915/i915_resource.h @@ -94,14 +94,14 @@ void i915_init_resource_functions(struct i915_context *i915); extern struct u_resource_vtbl i915_buffer_vtbl; extern struct u_resource_vtbl i915_texture_vtbl; -static INLINE struct i915_texture *i915_texture(struct pipe_resource *resource) +static inline struct i915_texture *i915_texture(struct pipe_resource *resource) { struct i915_texture *tex = (struct i915_texture *)resource; assert(tex->b.vtbl == &i915_texture_vtbl); return tex; } -static INLINE struct i915_buffer *i915_buffer(struct pipe_resource *resource) +static inline struct i915_buffer *i915_buffer(struct pipe_resource *resource) { struct i915_buffer *tex = (struct i915_buffer *)resource; assert(tex->b.vtbl == &i915_buffer_vtbl); diff --git a/src/gallium/drivers/i915/i915_resource_texture.c b/src/gallium/drivers/i915/i915_resource_texture.c index 8ef73d6f2c2..9a3279ccb75 100644 --- a/src/gallium/drivers/i915/i915_resource_texture.c +++ b/src/gallium/drivers/i915/i915_resource_texture.c @@ -89,25 +89,25 @@ static const int bottom_offsets[6] = { [PIPE_TEX_FACE_NEG_Z] = 16 + 5 * 8, }; -static INLINE unsigned +static inline unsigned align_nblocksx(enum pipe_format format, unsigned width, unsigned align_to) { return align(util_format_get_nblocksx(format, width), align_to); } -static INLINE unsigned +static inline unsigned align_nblocksy(enum pipe_format format, unsigned width, unsigned align_to) { return align(util_format_get_nblocksy(format, width), align_to); } -static INLINE unsigned +static inline unsigned get_pot_stride(enum pipe_format format, unsigned width) { return util_next_power_of_two(util_format_get_stride(format, width)); } -static INLINE const char* +static inline const char* get_tiling_string(enum i915_winsys_buffer_tile tile) { switch(tile) { diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c index 0590da07b9a..19a94a8e019 100644 --- a/src/gallium/drivers/i915/i915_screen.c +++ b/src/gallium/drivers/i915/i915_screen.c @@ -243,6 +243,10 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap) case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: + case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: + case PIPE_CAP_TEXTURE_FLOAT_LINEAR: + case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: + case PIPE_CAP_DEPTH_BOUNDS_TEST: return 0; case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS: @@ -463,21 +467,15 @@ i915_fence_reference(struct pipe_screen *screen, } static boolean -i915_fence_signalled(struct pipe_screen *screen, - struct pipe_fence_handle *fence) -{ - struct i915_screen *is = i915_screen(screen); - - return is->iws->fence_signalled(is->iws, fence) == 1; -} - -static boolean i915_fence_finish(struct pipe_screen *screen, struct pipe_fence_handle *fence, uint64_t timeout) { struct i915_screen *is = i915_screen(screen); + if (!timeout) + return is->iws->fence_signalled(is->iws, fence) == 1; + return is->iws->fence_finish(is->iws, fence) == 1; } @@ -565,7 +563,6 @@ i915_screen_create(struct i915_winsys *iws) is->base.context_create = i915_create_context; is->base.fence_reference = i915_fence_reference; - is->base.fence_signalled = 
i915_fence_signalled; is->base.fence_finish = i915_fence_finish; i915_init_screen_resource_functions(is); diff --git a/src/gallium/drivers/i915/i915_screen.h b/src/gallium/drivers/i915/i915_screen.h index 99d3ffd3af9..3be941a1561 100644 --- a/src/gallium/drivers/i915/i915_screen.h +++ b/src/gallium/drivers/i915/i915_screen.h @@ -59,7 +59,7 @@ struct i915_screen */ -static INLINE struct i915_screen * +static inline struct i915_screen * i915_screen(struct pipe_screen *pscreen) { return (struct i915_screen *) pscreen; diff --git a/src/gallium/drivers/i915/i915_state_dynamic.c b/src/gallium/drivers/i915/i915_state_dynamic.c index 4050cd4ac44..1c29e8ae671 100644 --- a/src/gallium/drivers/i915/i915_state_dynamic.c +++ b/src/gallium/drivers/i915/i915_state_dynamic.c @@ -46,7 +46,7 @@ * (active) state every time a 4kb boundary is crossed. */ -static INLINE void set_dynamic(struct i915_context *i915, +static inline void set_dynamic(struct i915_context *i915, unsigned offset, const unsigned state) { @@ -60,7 +60,7 @@ static INLINE void set_dynamic(struct i915_context *i915, -static INLINE void set_dynamic_array(struct i915_context *i915, +static inline void set_dynamic_array(struct i915_context *i915, unsigned offset, const unsigned *src, unsigned dwords) diff --git a/src/gallium/drivers/i915/i915_state_immediate.c b/src/gallium/drivers/i915/i915_state_immediate.c index d244a349fce..c4a6cae1beb 100644 --- a/src/gallium/drivers/i915/i915_state_immediate.c +++ b/src/gallium/drivers/i915/i915_state_immediate.c @@ -39,7 +39,7 @@ /* Convinience function to check immediate state. */ -static INLINE void set_immediate(struct i915_context *i915, +static inline void set_immediate(struct i915_context *i915, unsigned offset, const unsigned state) { diff --git a/src/gallium/drivers/i915/i915_state_inlines.h b/src/gallium/drivers/i915/i915_state_inlines.h index d4c5ab69555..015ea32933b 100644 --- a/src/gallium/drivers/i915/i915_state_inlines.h +++ b/src/gallium/drivers/i915/i915_state_inlines.h @@ -34,7 +34,7 @@ #include "i915_reg.h" -static INLINE unsigned +static inline unsigned i915_translate_compare_func(unsigned func) { switch (func) { @@ -59,7 +59,7 @@ i915_translate_compare_func(unsigned func) } } -static INLINE unsigned +static inline unsigned i915_translate_shadow_compare_func(unsigned func) { switch (func) { @@ -84,7 +84,7 @@ i915_translate_shadow_compare_func(unsigned func) } } -static INLINE unsigned +static inline unsigned i915_translate_stencil_op(unsigned op) { switch (op) { @@ -109,7 +109,7 @@ i915_translate_stencil_op(unsigned op) } } -static INLINE unsigned +static inline unsigned i915_translate_blend_factor(unsigned factor) { switch (factor) { @@ -148,7 +148,7 @@ i915_translate_blend_factor(unsigned factor) } } -static INLINE unsigned +static inline unsigned i915_translate_blend_func(unsigned mode) { switch (mode) { @@ -168,7 +168,7 @@ i915_translate_blend_func(unsigned mode) } -static INLINE unsigned +static inline unsigned i915_translate_logic_op(unsigned opcode) { switch (opcode) { @@ -211,7 +211,7 @@ i915_translate_logic_op(unsigned opcode) -static INLINE boolean i915_validate_vertices( unsigned hw_prim, unsigned nr ) +static inline boolean i915_validate_vertices( unsigned hw_prim, unsigned nr ) { boolean ok; diff --git a/src/gallium/drivers/ilo/Makefile.am b/src/gallium/drivers/ilo/Makefile.am index a8785a5e8c4..1f14153748e 100644 --- a/src/gallium/drivers/ilo/Makefile.am +++ b/src/gallium/drivers/ilo/Makefile.am @@ -21,8 +21,6 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 
OTHER # DEALINGS IN THE SOFTWARE. -AUTOMAKE_OPTIONS = subdir-objects - include Makefile.sources include $(top_srcdir)/src/gallium/Automake.inc diff --git a/src/gallium/drivers/ilo/Makefile.sources b/src/gallium/drivers/ilo/Makefile.sources index e1bbb9a0781..7a7db938f92 100644 --- a/src/gallium/drivers/ilo/Makefile.sources +++ b/src/gallium/drivers/ilo/Makefile.sources @@ -1,5 +1,4 @@ C_SOURCES := \ - core/ilo_buffer.h \ core/ilo_builder.c \ core/ilo_builder.h \ core/ilo_builder_3d.h \ @@ -43,6 +42,7 @@ C_SOURCES := \ core/ilo_state_viewport.h \ core/ilo_state_zs.c \ core/ilo_state_zs.h \ + core/ilo_vma.h \ core/intel_winsys.h \ ilo_blit.c \ ilo_blit.h \ @@ -65,8 +65,6 @@ C_SOURCES := \ ilo_public.h \ ilo_query.c \ ilo_query.h \ - ilo_resource.c \ - ilo_resource.h \ ilo_render.c \ ilo_render.h \ ilo_render_gen.h \ @@ -76,6 +74,8 @@ C_SOURCES := \ ilo_render_gen8.c \ ilo_render_media.c \ ilo_render_surface.c \ + ilo_resource.c \ + ilo_resource.h \ ilo_screen.c \ ilo_screen.h \ ilo_shader.c \ diff --git a/src/gallium/drivers/ilo/core/ilo_builder_3d_bottom.h b/src/gallium/drivers/ilo/core/ilo_builder_3d_bottom.h index 6d9e3699125..5efe9da2d22 100644 --- a/src/gallium/drivers/ilo/core/ilo_builder_3d_bottom.h +++ b/src/gallium/drivers/ilo/core/ilo_builder_3d_bottom.h @@ -39,6 +39,7 @@ #include "ilo_state_shader.h" #include "ilo_state_viewport.h" #include "ilo_state_zs.h" +#include "ilo_vma.h" #include "ilo_builder.h" #include "ilo_builder_3d_top.h" @@ -674,9 +675,10 @@ gen6_3DSTATE_DEPTH_BUFFER(struct ilo_builder *builder, dw[5] |= builder->mocs << GEN8_DEPTH_DW5_MOCS__SHIFT; - if (zs->depth_bo) { - ilo_builder_batch_reloc64(builder, pos + 2, zs->depth_bo, - zs->depth[1], (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE); + if (zs->z_vma) { + ilo_builder_batch_reloc64(builder, pos + 2, zs->z_vma->bo, + zs->z_vma->bo_offset + zs->depth[1], + (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE); } } else { dw[1] = zs->depth[0]; @@ -691,9 +693,10 @@ gen6_3DSTATE_DEPTH_BUFFER(struct ilo_builder *builder, else dw[6] |= builder->mocs << GEN6_DEPTH_DW6_MOCS__SHIFT; - if (zs->depth_bo) { - ilo_builder_batch_reloc(builder, pos + 2, zs->depth_bo, - zs->depth[1], (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE); + if (zs->z_vma) { + ilo_builder_batch_reloc(builder, pos + 2, zs->z_vma->bo, + zs->z_vma->bo_offset + zs->depth[1], + (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE); } } } @@ -724,9 +727,10 @@ gen6_3DSTATE_STENCIL_BUFFER(struct ilo_builder *builder, dw[1] |= builder->mocs << GEN8_STENCIL_DW1_MOCS__SHIFT; - if (zs->stencil_bo) { - ilo_builder_batch_reloc64(builder, pos + 2, zs->stencil_bo, - zs->stencil[1], (zs->s_readonly) ? 0 : INTEL_RELOC_WRITE); + if (zs->s_vma) { + ilo_builder_batch_reloc64(builder, pos + 2, zs->s_vma->bo, + zs->s_vma->bo_offset + zs->stencil[1], + (zs->s_readonly) ? 0 : INTEL_RELOC_WRITE); } } else { dw[1] = zs->stencil[0]; @@ -734,9 +738,10 @@ gen6_3DSTATE_STENCIL_BUFFER(struct ilo_builder *builder, dw[1] |= builder->mocs << GEN6_STENCIL_DW1_MOCS__SHIFT; - if (zs->stencil_bo) { - ilo_builder_batch_reloc(builder, pos + 2, zs->stencil_bo, - zs->stencil[1], (zs->s_readonly) ? 0 : INTEL_RELOC_WRITE); + if (zs->s_vma) { + ilo_builder_batch_reloc(builder, pos + 2, zs->s_vma->bo, + zs->s_vma->bo_offset + zs->stencil[1], + (zs->s_readonly) ? 0 : INTEL_RELOC_WRITE); } } } @@ -767,9 +772,10 @@ gen6_3DSTATE_HIER_DEPTH_BUFFER(struct ilo_builder *builder, dw[1] |= builder->mocs << GEN8_HIZ_DW1_MOCS__SHIFT; - if (zs->hiz_bo) { - ilo_builder_batch_reloc64(builder, pos + 2, zs->hiz_bo, - zs->hiz[1], (zs->z_readonly) ? 
0 : INTEL_RELOC_WRITE); + if (zs->hiz_vma) { + ilo_builder_batch_reloc64(builder, pos + 2, zs->hiz_vma->bo, + zs->hiz_vma->bo_offset + zs->hiz[1], + (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE); } } else { dw[1] = zs->hiz[0]; @@ -777,9 +783,10 @@ gen6_3DSTATE_HIER_DEPTH_BUFFER(struct ilo_builder *builder, dw[1] |= builder->mocs << GEN6_HIZ_DW1_MOCS__SHIFT; - if (zs->hiz_bo) { - ilo_builder_batch_reloc(builder, pos + 2, zs->hiz_bo, - zs->hiz[1], (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE); + if (zs->hiz_vma) { + ilo_builder_batch_reloc(builder, pos + 2, zs->hiz_vma->bo, + zs->hiz_vma->bo_offset + zs->hiz[1], + (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE); } } } diff --git a/src/gallium/drivers/ilo/core/ilo_builder_3d_top.h b/src/gallium/drivers/ilo/core/ilo_builder_3d_top.h index 8d30095e6f6..6e94fb25f1f 100644 --- a/src/gallium/drivers/ilo/core/ilo_builder_3d_top.h +++ b/src/gallium/drivers/ilo/core/ilo_builder_3d_top.h @@ -39,6 +39,7 @@ #include "ilo_state_surface.h" #include "ilo_state_urb.h" #include "ilo_state_vf.h" +#include "ilo_vma.h" #include "ilo_builder.h" static inline void @@ -318,8 +319,10 @@ gen6_3DSTATE_VERTEX_BUFFERS(struct ilo_builder *builder, dw[3] = 0; if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) { - if (b->need_bo) - ilo_builder_batch_reloc64(builder, pos + 1, b->bo, b->vb[1], 0); + if (b->vma) { + ilo_builder_batch_reloc64(builder, pos + 1, b->vma->bo, + b->vma->bo_offset + b->vb[1], 0); + } dw[3] |= b->vb[2]; } else { @@ -331,9 +334,11 @@ gen6_3DSTATE_VERTEX_BUFFERS(struct ilo_builder *builder, dw[3] |= vf->user_instancing[elem][1]; } - if (b->need_bo) { - ilo_builder_batch_reloc(builder, pos + 1, b->bo, b->vb[1], 0); - ilo_builder_batch_reloc(builder, pos + 2, b->bo, b->vb[2], 0); + if (b->vma) { + ilo_builder_batch_reloc(builder, pos + 1, b->vma->bo, + b->vma->bo_offset + b->vb[1], 0); + ilo_builder_batch_reloc(builder, pos + 2, b->vma->bo, + b->vma->bo_offset + b->vb[2], 0); } } @@ -429,9 +434,11 @@ gen6_3DSTATE_INDEX_BUFFER(struct ilo_builder *builder, pos = ilo_builder_batch_pointer(builder, cmd_len, &dw); dw[0] = dw0; - if (ib->need_bo) { - ilo_builder_batch_reloc(builder, pos + 1, ib->bo, ib->ib[1], 0); - ilo_builder_batch_reloc(builder, pos + 2, ib->bo, ib->ib[2], 0); + if (ib->vma) { + ilo_builder_batch_reloc(builder, pos + 1, ib->vma->bo, + ib->vma->bo_offset + ib->ib[1], 0); + ilo_builder_batch_reloc(builder, pos + 2, ib->vma->bo, + ib->vma->bo_offset + ib->ib[2], 0); } else { dw[1] = 0; dw[2] = 0; @@ -456,8 +463,9 @@ gen8_3DSTATE_INDEX_BUFFER(struct ilo_builder *builder, dw[1] = ib->ib[0] | builder->mocs << GEN8_IB_DW1_MOCS__SHIFT; - if (ib->need_bo) { - ilo_builder_batch_reloc64(builder, pos + 2, ib->bo, ib->ib[1], 0); + if (ib->vma) { + ilo_builder_batch_reloc64(builder, pos + 2, ib->vma->bo, + ib->vma->bo_offset + ib->ib[1], 0); } else { dw[2] = 0; dw[3] = 0; @@ -801,11 +809,11 @@ gen7_3DSTATE_SO_BUFFER(struct ilo_builder *builder, builder->mocs << GEN7_SO_BUF_DW1_MOCS__SHIFT | sol->strides[buffer] << GEN7_SO_BUF_DW1_PITCH__SHIFT; - if (sb->need_bo) { - ilo_builder_batch_reloc(builder, pos + 2, sb->bo, - sb->so_buf[0], INTEL_RELOC_WRITE); - ilo_builder_batch_reloc(builder, pos + 3, sb->bo, - sb->so_buf[1], INTEL_RELOC_WRITE); + if (sb->vma) { + ilo_builder_batch_reloc(builder, pos + 2, sb->vma->bo, + sb->vma->bo_offset + sb->so_buf[0], INTEL_RELOC_WRITE); + ilo_builder_batch_reloc(builder, pos + 3, sb->vma->bo, + sb->vma->bo_offset + sb->so_buf[1], INTEL_RELOC_WRITE); } else { dw[2] = 0; dw[3] = 0; @@ -832,9 +840,9 @@ gen8_3DSTATE_SO_BUFFER(struct ilo_builder 
*builder, buffer << GEN7_SO_BUF_DW1_INDEX__SHIFT | builder->mocs << GEN8_SO_BUF_DW1_MOCS__SHIFT; - if (sb->need_bo) { - ilo_builder_batch_reloc64(builder, pos + 2, sb->bo, - sb->so_buf[1], INTEL_RELOC_WRITE); + if (sb->vma) { + ilo_builder_batch_reloc64(builder, pos + 2, sb->vma->bo, + sb->vma->bo_offset + sb->so_buf[1], INTEL_RELOC_WRITE); } else { dw[2] = 0; dw[3] = 0; @@ -842,9 +850,10 @@ gen8_3DSTATE_SO_BUFFER(struct ilo_builder *builder, dw[4] = sb->so_buf[2]; - if (sb->need_write_offset_bo) { - ilo_builder_batch_reloc64(builder, pos + 5, sb->write_offset_bo, - sizeof(uint32_t) * buffer, INTEL_RELOC_WRITE); + if (sb->write_offset_vma) { + ilo_builder_batch_reloc64(builder, pos + 5, sb->write_offset_vma->bo, + sb->write_offset_vma->bo_offset + sizeof(uint32_t) * buffer, + INTEL_RELOC_WRITE); } else { dw[5] = 0; dw[6] = 0; @@ -1254,14 +1263,15 @@ gen6_SURFACE_STATE(struct ilo_builder *builder, ILO_BUILDER_ITEM_SURFACE, state_align, state_len, &dw); memcpy(dw, surf->surface, state_len << 2); - if (surf->bo) { + if (surf->vma) { const uint32_t mocs = (surf->scanout) ? (GEN8_MOCS_MT_PTE | GEN8_MOCS_CT_L3) : builder->mocs; dw[1] |= mocs << GEN8_SURFACE_DW1_MOCS__SHIFT; - ilo_builder_surface_reloc64(builder, state_offset, 8, surf->bo, - surf->surface[8], (surf->readonly) ? 0 : INTEL_RELOC_WRITE); + ilo_builder_surface_reloc64(builder, state_offset, 8, surf->vma->bo, + surf->vma->bo_offset + surf->surface[8], + (surf->readonly) ? 0 : INTEL_RELOC_WRITE); } } else { state_align = 32; @@ -1271,15 +1281,16 @@ gen6_SURFACE_STATE(struct ilo_builder *builder, ILO_BUILDER_ITEM_SURFACE, state_align, state_len, &dw); memcpy(dw, surf->surface, state_len << 2); - if (surf->bo) { + if (surf->vma) { /* * For scanouts, we should not enable caching in LLC. Since we only * enable that on Gen8+, we are fine here. */ dw[5] |= builder->mocs << GEN6_SURFACE_DW5_MOCS__SHIFT; - ilo_builder_surface_reloc(builder, state_offset, 1, surf->bo, - surf->surface[1], (surf->readonly) ? 0 : INTEL_RELOC_WRITE); + ilo_builder_surface_reloc(builder, state_offset, 1, surf->vma->bo, + surf->vma->bo_offset + surf->surface[1], + (surf->readonly) ? 
0 : INTEL_RELOC_WRITE); } } diff --git a/src/gallium/drivers/ilo/core/ilo_core.h b/src/gallium/drivers/ilo/core/ilo_core.h index 0a7f7d9d3fe..da7db90a54b 100644 --- a/src/gallium/drivers/ilo/core/ilo_core.h +++ b/src/gallium/drivers/ilo/core/ilo_core.h @@ -29,15 +29,9 @@ #define ILO_CORE_H #include "pipe/p_compiler.h" -#include "pipe/p_defines.h" -#include "pipe/p_format.h" #include "util/u_debug.h" -#include "util/list.h" -#include "util/u_format.h" -#include "util/u_inlines.h" #include "util/u_math.h" #include "util/u_memory.h" -#include "util/u_pointer.h" #endif /* ILO_CORE_H */ diff --git a/src/gallium/drivers/ilo/core/ilo_image.c b/src/gallium/drivers/ilo/core/ilo_image.c index 0d837d8a9d5..fa547ac5c36 100644 --- a/src/gallium/drivers/ilo/core/ilo_image.c +++ b/src/gallium/drivers/ilo/core/ilo_image.c @@ -40,269 +40,356 @@ enum { IMAGE_TILING_W) }; -struct ilo_image_params { - const struct ilo_dev *dev; - const struct pipe_resource *templ; - unsigned valid_tilings; +struct ilo_image_layout { + enum ilo_image_walk_type walk; + bool interleaved_samples; - bool compressed; + uint8_t valid_tilings; + enum gen_surface_tiling tiling; - unsigned h0, h1; - unsigned max_x, max_y; + enum ilo_image_aux_type aux; + + int align_i; + int align_j; + + struct ilo_image_lod *lods; + int walk_layer_h0; + int walk_layer_h1; + int walk_layer_height; + int monolithic_width; + int monolithic_height; }; -static void -img_get_slice_size(const struct ilo_image *img, - const struct ilo_image_params *params, - unsigned level, unsigned *width, unsigned *height) +static enum ilo_image_walk_type +image_get_gen6_walk(const struct ilo_dev *dev, + const struct ilo_image_info *info) { - const struct pipe_resource *templ = params->templ; - unsigned w, h; + ILO_DEV_ASSERT(dev, 6, 6); - w = u_minify(img->width0, level); - h = u_minify(img->height0, level); + /* TODO we want LODs to be page-aligned */ + if (info->type == GEN6_SURFTYPE_3D) + return ILO_IMAGE_WALK_3D; /* - * From the Sandy Bridge PRM, volume 1 part 1, page 114: + * From the Sandy Bridge PRM, volume 1 part 1, page 115: * - * "The dimensions of the mip maps are first determined by applying the - * sizing algorithm presented in Non-Power-of-Two Mipmaps above. Then, - * if necessary, they are padded out to compression block boundaries." + * "The separate stencil buffer does not support mip mapping, thus the + * storage for LODs other than LOD 0 is not needed. The following + * QPitch equation applies only to the separate stencil buffer: + * + * QPitch = h_0" + * + * Use ILO_IMAGE_WALK_LOD and manually offset to the (page-aligned) levels + * when bound. 
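 *
 * Editor's note, not part of the original comment: image_get_gen6_lods()
 * aligns each LOD's x/y to 64 in the WALK_LOD case, and a W tile is one
 * 4 KiB page, so every level of the one-byte-per-texel stencil buffer
 * starts on its own page. Level lv can then be bound at the whole-tile
 * byte offset (lods[lv].y / 64 * (bo_stride / 64) + lods[lv].x / 64) *
 * 4096, the same trick image_set_gen6_hiz() uses for walk_lod_offsets.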
*/ - w = align(w, img->block_width); - h = align(h, img->block_height); + if (info->bind_zs && info->format == GEN6_FORMAT_R8_UINT) + return ILO_IMAGE_WALK_LOD; + + /* compact spacing is not supported otherwise */ + return ILO_IMAGE_WALK_LAYER; +} + +static enum ilo_image_walk_type +image_get_gen7_walk(const struct ilo_dev *dev, + const struct ilo_image_info *info) +{ + ILO_DEV_ASSERT(dev, 7, 8); + + if (info->type == GEN6_SURFTYPE_3D) + return ILO_IMAGE_WALK_3D; /* - * From the Sandy Bridge PRM, volume 1 part 1, page 111: - * - * "If the surface is multisampled (4x), these values must be adjusted - * as follows before proceeding: + * From the Ivy Bridge PRM, volume 1 part 1, page 111: * - * W_L = ceiling(W_L / 2) * 4 - * H_L = ceiling(H_L / 2) * 4" + * "note that the depth buffer and stencil buffer have an implied value + * of ARYSPC_FULL" * - * From the Ivy Bridge PRM, volume 1 part 1, page 108: + * From the Ivy Bridge PRM, volume 4 part 1, page 66: * - * "If the surface is multisampled and it is a depth or stencil surface - * or Multisampled Surface StorageFormat in SURFACE_STATE is - * MSFMT_DEPTH_STENCIL, W_L and H_L must be adjusted as follows before - * proceeding: + * "If Multisampled Surface Storage Format is MSFMT_MSS and Number of + * Multisamples is not MULTISAMPLECOUNT_1, this field (Surface Array + * Spacing) must be set to ARYSPC_LOD0." + */ + if (info->sample_count > 1) + assert(info->level_count == 1); + return (info->bind_zs || info->level_count > 1) ? + ILO_IMAGE_WALK_LAYER : ILO_IMAGE_WALK_LOD; +} + +static bool +image_get_gen6_interleaved_samples(const struct ilo_dev *dev, + const struct ilo_image_info *info) +{ + ILO_DEV_ASSERT(dev, 6, 8); + + /* + * Gen6 supports only interleaved samples. It is not explicitly stated, + * but on Gen7+, render targets are expected to be UMS/CMS (samples + * non-interleaved) and depth/stencil buffers are expected to be IMS + * (samples interleaved). * - * #samples W_L = H_L = - * 2 ceiling(W_L / 2) * 4 HL [no adjustment] - * 4 ceiling(W_L / 2) * 4 ceiling(H_L / 2) * 4 - * 8 ceiling(W_L / 2) * 8 ceiling(H_L / 2) * 4 - * 16 ceiling(W_L / 2) * 8 ceiling(H_L / 2) * 8" + * See "Multisampled Surface Storage Format" field of SURFACE_STATE. + */ + return (ilo_dev_gen(dev) == ILO_GEN(6) || info->bind_zs); +} + +static uint8_t +image_get_gen6_valid_tilings(const struct ilo_dev *dev, + const struct ilo_image_info *info) +{ + uint8_t valid_tilings = IMAGE_TILING_ALL; + + ILO_DEV_ASSERT(dev, 6, 8); + + if (info->valid_tilings) + valid_tilings &= info->valid_tilings; + + /* + * From the Sandy Bridge PRM, volume 1 part 2, page 32: * - * For interleaved samples (4x), where pixels + * "Display/Overlay Y-Major not supported. + * X-Major required for Async Flips" + */ + if (unlikely(info->bind_scanout)) + valid_tilings &= IMAGE_TILING_X; + + /* + * From the Sandy Bridge PRM, volume 3 part 2, page 158: * - * (x, y ) (x+1, y ) - * (x, y+1) (x+1, y+1) + * "The cursor surface address must be 4K byte aligned. The cursor must + * be in linear memory, it cannot be tiled." + */ + if (unlikely(info->bind_cursor)) + valid_tilings &= IMAGE_TILING_NONE; + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 318: * - * would be is occupied by + * "[DevSNB+]: This field (Tiled Surface) must be set to TRUE. Linear + * Depth Buffer is not supported." 
* - * (x, y , si0) (x+1, y , si0) (x, y , si1) (x+1, y , si1) - * (x, y+1, si0) (x+1, y+1, si0) (x, y+1, si1) (x+1, y+1, si1) - * (x, y , si2) (x+1, y , si2) (x, y , si3) (x+1, y , si3) - * (x, y+1, si2) (x+1, y+1, si2) (x, y+1, si3) (x+1, y+1, si3) + * "The Depth Buffer, if tiled, must use Y-Major tiling." * - * Thus the need to + * From the Sandy Bridge PRM, volume 1 part 2, page 22: * - * w = align(w, 2) * 2; - * y = align(y, 2) * 2; + * "W-Major Tile Format is used for separate stencil." */ - if (img->interleaved_samples) { - switch (templ->nr_samples) { - case 0: - case 1: - break; - case 2: - w = align(w, 2) * 2; - break; - case 4: - w = align(w, 2) * 2; - h = align(h, 2) * 2; - break; - case 8: - w = align(w, 2) * 4; - h = align(h, 2) * 2; - break; - case 16: - w = align(w, 2) * 4; - h = align(h, 2) * 4; - break; - default: - assert(!"unsupported sample count"); - break; - } + if (info->bind_zs) { + if (info->format == GEN6_FORMAT_R8_UINT) + valid_tilings &= IMAGE_TILING_W; + else + valid_tilings &= IMAGE_TILING_Y; } - /* - * From the Ivy Bridge PRM, volume 1 part 1, page 108: - * - * "For separate stencil buffer, the width must be mutiplied by 2 and - * height divided by 2..." - * - * To make things easier (for transfer), we will just double the stencil - * stride in 3DSTATE_STENCIL_BUFFER. - */ - w = align(w, img->align_i); - h = align(h, img->align_j); + if (info->bind_surface_sampler || + info->bind_surface_dp_render || + info->bind_surface_dp_typed) { + /* + * From the Haswell PRM, volume 2d, page 233: + * + * "If Number of Multisamples is not MULTISAMPLECOUNT_1, this field + * (Tiled Surface) must be TRUE." + */ + if (info->sample_count > 1) + valid_tilings &= ~IMAGE_TILING_NONE; - *width = w; - *height = h; -} + if (ilo_dev_gen(dev) < ILO_GEN(8)) + valid_tilings &= ~IMAGE_TILING_W; + } -static unsigned -img_get_num_layers(const struct ilo_image *img, - const struct ilo_image_params *params) -{ - const struct pipe_resource *templ = params->templ; - unsigned num_layers = templ->array_size; + if (info->bind_surface_dp_render) { + /* + * From the Sandy Bridge PRM, volume 1 part 2, page 32: + * + * "NOTE: 128BPE Format Color buffer ( render target ) MUST be + * either TileX or Linear." + * + * From the Haswell PRM, volume 5, page 32: + * + * "NOTE: 128 BPP format color buffer (render target) supports + * Linear, TiledX and TiledY." + */ + if (ilo_dev_gen(dev) < ILO_GEN(7.5) && info->block_size == 16) + valid_tilings &= ~IMAGE_TILING_Y; - /* samples of the same index are stored in a layer */ - if (templ->nr_samples > 1 && !img->interleaved_samples) - num_layers *= templ->nr_samples; + /* + * From the Ivy Bridge PRM, volume 4 part 1, page 63: + * + * "This field (Surface Vertical Aligment) must be set to VALIGN_4 + * for all tiled Y Render Target surfaces." + * + * "VALIGN_4 is not supported for surface format R32G32B32_FLOAT." + * + * R32G32B32_FLOAT is not renderable and we only need an assert() here. 
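Editor's aside, a self-contained sketch rather than code from this commit: image_get_gen6_valid_tilings() above works by intersecting constraints, starting from all tilings and AND-ing bits away until only the usable ones remain. The flag values below are stand-ins for the IMAGE_TILING_* masks defined near the top of ilo_image.c.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

enum {
   TILING_NONE = 1 << 0,
   TILING_X    = 1 << 1,
   TILING_Y    = 1 << 2,
   TILING_W    = 1 << 3,
   TILING_ALL  = (1 << 4) - 1
};

static uint8_t
pick_valid_tilings(bool bind_scanout, bool bind_zs, bool is_stencil)
{
   uint8_t valid = TILING_ALL;

   if (bind_scanout)
      valid &= TILING_X;                          /* display wants X-major */
   if (bind_zs)
      valid &= is_stencil ? TILING_W : TILING_Y;  /* W for S8, Y for depth */

   /* conflicting bind flags would leave no tiling bit set at all */
   assert(valid);
   return valid;
}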
+ */ + if (ilo_dev_gen(dev) >= ILO_GEN(7) && ilo_dev_gen(dev) <= ILO_GEN(7.5)) + assert(info->format != GEN6_FORMAT_R32G32B32_FLOAT); + } - return num_layers; + return valid_tilings; } -static void -img_init_layer_height(struct ilo_image *img, - struct ilo_image_params *params) +static uint64_t +image_get_gen6_estimated_size(const struct ilo_dev *dev, + const struct ilo_image_info *info) { - const struct pipe_resource *templ = params->templ; - unsigned num_layers; + /* padding not considered */ + const uint64_t slice_size = info->width * info->height * + info->block_size / (info->block_width * info->block_height); + const uint64_t slice_count = + info->depth * info->array_size * info->sample_count; + const uint64_t estimated_size = slice_size * slice_count; - if (img->walk != ILO_IMAGE_WALK_LAYER) - return; + ILO_DEV_ASSERT(dev, 6, 8); - num_layers = img_get_num_layers(img, params); - if (num_layers <= 1) - return; + if (info->level_count == 1) + return estimated_size; + else + return estimated_size * 4 / 3; +} + +static enum gen_surface_tiling +image_get_gen6_tiling(const struct ilo_dev *dev, + const struct ilo_image_info *info, + uint8_t valid_tilings) +{ + ILO_DEV_ASSERT(dev, 6, 8); + + switch (valid_tilings) { + case IMAGE_TILING_NONE: + return GEN6_TILING_NONE; + case IMAGE_TILING_X: + return GEN6_TILING_X; + case IMAGE_TILING_Y: + return GEN6_TILING_Y; + case IMAGE_TILING_W: + return GEN8_TILING_W; + default: + break; + } /* - * From the Sandy Bridge PRM, volume 1 part 1, page 115: - * - * "The following equation is used for surface formats other than - * compressed textures: - * - * QPitch = (h0 + h1 + 11j)" - * - * "The equation for compressed textures (BC* and FXT1 surface formats) - * follows: - * - * QPitch = (h0 + h1 + 11j) / 4" - * - * "[DevSNB] Errata: Sampler MSAA Qpitch will be 4 greater than the - * value calculated in the equation above, for every other odd Surface - * Height starting from 1 i.e. 1,5,9,13" - * - * From the Ivy Bridge PRM, volume 1 part 1, page 111-112: + * X-tiling has the property that vertically adjacent pixels are usually in + * the same page. When the image size is less than a page, the image + * height is 1, or when the image is not accessed in blocks, there is no + * reason to tile. * - * "If Surface Array Spacing is set to ARYSPC_FULL (note that the depth - * buffer and stencil buffer have an implied value of ARYSPC_FULL): - * - * QPitch = (h0 + h1 + 12j) - * QPitch = (h0 + h1 + 12j) / 4 (compressed) - * - * (There are many typos or missing words here...)" - * - * To access the N-th slice, an offset of (Stride * QPitch * N) is added to - * the base address. The PRM divides QPitch by 4 for compressed formats - * because the block height for those formats are 4, and it wants QPitch to - * mean the number of memory rows, as opposed to texel rows, between - * slices. Since we use texel rows everywhere, we do not need to divide - * QPitch by 4. + * Y-tiling is similar, where vertically adjacent pixels are usually in the + * same cacheline. */ - img->walk_layer_height = params->h0 + params->h1 + - ((ilo_dev_gen(params->dev) >= ILO_GEN(7)) ? 
12 : 11) * img->align_j; + if (valid_tilings & IMAGE_TILING_NONE) { + const uint64_t estimated_size = + image_get_gen6_estimated_size(dev, info); - if (ilo_dev_gen(params->dev) == ILO_GEN(6) && templ->nr_samples > 1 && - img->height0 % 4 == 1) - img->walk_layer_height += 4; + if (info->height == 1 || !(info->bind_surface_sampler || + info->bind_surface_dp_render || + info->bind_surface_dp_typed)) + return GEN6_TILING_NONE; + + if (estimated_size <= 64 || + estimated_size > info->prefer_linear_threshold) + return GEN6_TILING_NONE; + + if (estimated_size <= 2048) + valid_tilings &= ~IMAGE_TILING_X; + } - params->max_y += img->walk_layer_height * (num_layers - 1); + return (valid_tilings & IMAGE_TILING_Y) ? GEN6_TILING_Y : + (valid_tilings & IMAGE_TILING_X) ? GEN6_TILING_X : + GEN6_TILING_NONE; } -static void -img_init_lods(struct ilo_image *img, - struct ilo_image_params *params) +static bool +image_get_gen6_hiz_enable(const struct ilo_dev *dev, + const struct ilo_image_info *info) { - const struct pipe_resource *templ = params->templ; - unsigned cur_x, cur_y; - unsigned lv; + ILO_DEV_ASSERT(dev, 6, 8); - cur_x = 0; - cur_y = 0; - for (lv = 0; lv <= templ->last_level; lv++) { - unsigned lod_w, lod_h; + /* depth buffer? */ + if (!info->bind_zs || + info->format == GEN6_FORMAT_R8_UINT || + info->interleaved_stencil) + return false; - img_get_slice_size(img, params, lv, &lod_w, &lod_h); + /* we want to be able to force 8x4 alignments */ + if (info->type == GEN6_SURFTYPE_1D) + return false; - img->lods[lv].x = cur_x; - img->lods[lv].y = cur_y; - img->lods[lv].slice_width = lod_w; - img->lods[lv].slice_height = lod_h; + if (info->aux_disable) + return false; - switch (img->walk) { - case ILO_IMAGE_WALK_LAYER: - /* MIPLAYOUT_BELOW */ - if (lv == 1) - cur_x += lod_w; - else - cur_y += lod_h; - break; - case ILO_IMAGE_WALK_LOD: - lod_h *= img_get_num_layers(img, params); - if (lv == 1) - cur_x += lod_w; - else - cur_y += lod_h; + if (ilo_debug & ILO_DEBUG_NOHIZ) + return false; - /* every LOD begins at tile boundaries */ - if (templ->last_level > 0) { - assert(img->format == PIPE_FORMAT_S8_UINT); - cur_x = align(cur_x, 64); - cur_y = align(cur_y, 64); - } - break; - case ILO_IMAGE_WALK_3D: - { - const unsigned num_slices = u_minify(templ->depth0, lv); - const unsigned num_slices_per_row = 1 << lv; - const unsigned num_rows = - (num_slices + num_slices_per_row - 1) / num_slices_per_row; + return true; +} - lod_w *= num_slices_per_row; - lod_h *= num_rows; +static bool +image_get_gen7_mcs_enable(const struct ilo_dev *dev, + const struct ilo_image_info *info, + enum gen_surface_tiling tiling) +{ + ILO_DEV_ASSERT(dev, 7, 8); - cur_y += lod_h; - } - break; - } + if (!info->bind_surface_sampler && !info->bind_surface_dp_render) + return false; - if (params->max_x < img->lods[lv].x + lod_w) - params->max_x = img->lods[lv].x + lod_w; - if (params->max_y < img->lods[lv].y + lod_h) - params->max_y = img->lods[lv].y + lod_h; + /* + * From the Ivy Bridge PRM, volume 4 part 1, page 77: + * + * "For Render Target and Sampling Engine Surfaces:If the surface is + * multisampled (Number of Multisamples any value other than + * MULTISAMPLECOUNT_1), this field (MCS Enable) must be enabled." 
+ * + * "This field must be set to 0 for all SINT MSRTs when all RT channels + * are not written" + */ + if (info->sample_count > 1) { + if (ilo_dev_gen(dev) < ILO_GEN(8)) + assert(!info->is_integer); + return true; } - if (img->walk == ILO_IMAGE_WALK_LAYER) { - params->h0 = img->lods[0].slice_height; + if (info->aux_disable) + return false; - if (templ->last_level > 0) - params->h1 = img->lods[1].slice_height; - else - img_get_slice_size(img, params, 1, &cur_x, ¶ms->h1); + /* + * From the Ivy Bridge PRM, volume 2 part 1, page 326: + * + * "When MCS is buffer is used for color clear of non-multisampler + * render target, the following restrictions apply. + * - Support is limited to tiled render targets. + * - Support is for non-mip-mapped and non-array surface types only. + * - Clear is supported only on the full RT; i.e., no partial clear or + * overlapping clears. + * - MCS buffer for non-MSRT is supported only for RT formats 32bpp, + * 64bpp and 128bpp. + * ..." + * + * How about SURFTYPE_3D? + */ + if (!info->bind_surface_dp_render || + tiling == GEN6_TILING_NONE || + info->level_count > 1 || + info->array_size > 1) + return false; + + switch (info->block_size) { + case 4: + case 8: + case 16: + return true; + default: + return false; } } static void -img_init_alignments(struct ilo_image *img, - const struct ilo_image_params *params) +image_get_gen6_alignments(const struct ilo_dev *dev, + const struct ilo_image_info *info, + int *align_i, int *align_j) { - const struct pipe_resource *templ = params->templ; + ILO_DEV_ASSERT(dev, 6, 6); /* * From the Sandy Bridge PRM, volume 1 part 1, page 113: @@ -335,13 +422,33 @@ img_init_alignments(struct ilo_image *img, * * align_i align_j * compressed formats block width block height - * PIPE_FORMAT_S8_UINT 4 2 + * GEN6_FORMAT_R8_UINT 4 2 * other depth/stencil formats 4 4 * 4x multisampled 4 4 * bpp 96 4 2 * others 4 2 or 4 */ + *align_i = (info->compressed) ? info->block_width : 4; + if (info->compressed) { + *align_j = info->block_height; + } else if (info->bind_zs) { + *align_j = (info->format == GEN6_FORMAT_R8_UINT) ? 2 : 4; + } else { + *align_j = (info->sample_count > 1 || info->block_size != 12) ? 
4 : 2; + } +} + +static void +image_get_gen7_alignments(const struct ilo_dev *dev, + const struct ilo_image_info *info, + enum gen_surface_tiling tiling, + int *align_i, int *align_j) +{ + int i, j; + + ILO_DEV_ASSERT(dev, 7, 8); + /* * From the Ivy Bridge PRM, volume 1 part 1, page 110: * @@ -383,465 +490,301 @@ img_init_alignments(struct ilo_image *img, * * align_i align_j * compressed formats block width block height - * PIPE_FORMAT_Z16_UNORM 8 4 - * PIPE_FORMAT_S8_UINT 8 8 + * GEN6_FORMAT_R16_UNORM 8 4 + * GEN6_FORMAT_R8_UINT 8 8 * other depth/stencil formats 4 4 * 2x or 4x multisampled 4 or 8 4 * tiled Y 4 or 8 4 (if rt) - * PIPE_FORMAT_R32G32B32_FLOAT 4 or 8 2 + * GEN6_FORMAT_R32G32B32_FLOAT 4 or 8 2 * others 4 or 8 2 or 4 */ - - if (params->compressed) { - /* this happens to be the case */ - img->align_i = img->block_width; - img->align_j = img->block_height; - } else if (templ->bind & PIPE_BIND_DEPTH_STENCIL) { - if (ilo_dev_gen(params->dev) >= ILO_GEN(7)) { - switch (img->format) { - case PIPE_FORMAT_Z16_UNORM: - img->align_i = 8; - img->align_j = 4; - break; - case PIPE_FORMAT_S8_UINT: - img->align_i = 8; - img->align_j = 8; - break; - default: - img->align_i = 4; - img->align_j = 4; - break; - } - } else { - switch (img->format) { - case PIPE_FORMAT_S8_UINT: - img->align_i = 4; - img->align_j = 2; - break; - default: - img->align_i = 4; - img->align_j = 4; - break; - } + if (info->compressed) { + i = info->block_width; + j = info->block_height; + } else if (info->bind_zs) { + switch (info->format) { + case GEN6_FORMAT_R16_UNORM: + i = 8; + j = 4; + break; + case GEN6_FORMAT_R8_UINT: + i = 8; + j = 8; + break; + default: + i = 4; + j = 4; + break; } } else { const bool valign_4 = - (templ->nr_samples > 1) || - (ilo_dev_gen(params->dev) >= ILO_GEN(8)) || - (ilo_dev_gen(params->dev) >= ILO_GEN(7) && - img->tiling == GEN6_TILING_Y && - (templ->bind & PIPE_BIND_RENDER_TARGET)); - - if (ilo_dev_gen(params->dev) >= ILO_GEN(7) && - ilo_dev_gen(params->dev) <= ILO_GEN(7.5) && valign_4) - assert(img->format != PIPE_FORMAT_R32G32B32_FLOAT); - - img->align_i = 4; - img->align_j = (valign_4) ? 4 : 2; - } + (info->sample_count > 1 || ilo_dev_gen(dev) >= ILO_GEN(8) || + (tiling == GEN6_TILING_Y && info->bind_surface_dp_render)); - /* - * the fact that align i and j are multiples of block width and height - * respectively is what makes the size of the bo a multiple of the block - * size, slices start at block boundaries, and many of the computations - * work. - */ - assert(img->align_i % img->block_width == 0); - assert(img->align_j % img->block_height == 0); + if (ilo_dev_gen(dev) < ILO_GEN(8) && valign_4) + assert(info->format != GEN6_FORMAT_R32G32B32_FLOAT); - /* make sure align() works */ - assert(util_is_power_of_two(img->align_i) && - util_is_power_of_two(img->align_j)); - assert(util_is_power_of_two(img->block_width) && - util_is_power_of_two(img->block_height)); + i = 4; + j = (valign_4) ? 
4 : 2; + } + + *align_i = i; + *align_j = j; } -static void -img_init_tiling(struct ilo_image *img, - const struct ilo_image_params *params) +static bool +image_init_gen6_hardware_layout(const struct ilo_dev *dev, + const struct ilo_image_info *info, + struct ilo_image_layout *layout) { - const struct pipe_resource *templ = params->templ; - unsigned preferred_tilings = params->valid_tilings; - - /* no fencing nor BLT support */ - if (preferred_tilings & ~IMAGE_TILING_W) - preferred_tilings &= ~IMAGE_TILING_W; - - if (templ->bind & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_SAMPLER_VIEW)) { - /* - * heuristically set a minimum width/height for enabling tiling - */ - if (img->width0 < 64 && (preferred_tilings & ~IMAGE_TILING_X)) - preferred_tilings &= ~IMAGE_TILING_X; - - if ((img->width0 < 32 || img->height0 < 16) && - (img->width0 < 16 || img->height0 < 32) && - (preferred_tilings & ~IMAGE_TILING_Y)) - preferred_tilings &= ~IMAGE_TILING_Y; - } else { - /* force linear if we are not sure where the texture is bound to */ - if (preferred_tilings & IMAGE_TILING_NONE) - preferred_tilings &= IMAGE_TILING_NONE; - } + ILO_DEV_ASSERT(dev, 6, 8); - /* prefer tiled over linear */ - if (preferred_tilings & IMAGE_TILING_Y) - img->tiling = GEN6_TILING_Y; - else if (preferred_tilings & IMAGE_TILING_X) - img->tiling = GEN6_TILING_X; - else if (preferred_tilings & IMAGE_TILING_W) - img->tiling = GEN8_TILING_W; + if (ilo_dev_gen(dev) >= ILO_GEN(7)) + layout->walk = image_get_gen7_walk(dev, info); else - img->tiling = GEN6_TILING_NONE; -} + layout->walk = image_get_gen6_walk(dev, info); -static void -img_init_walk_gen7(struct ilo_image *img, - const struct ilo_image_params *params) -{ - const struct pipe_resource *templ = params->templ; + layout->interleaved_samples = + image_get_gen6_interleaved_samples(dev, info); - /* - * It is not explicitly states, but render targets are expected to be - * UMS/CMS (samples non-interleaved) and depth/stencil buffers are expected - * to be IMS (samples interleaved). - * - * See "Multisampled Surface Storage Format" field of SURFACE_STATE. - */ - if (templ->bind & PIPE_BIND_DEPTH_STENCIL) { - /* - * From the Ivy Bridge PRM, volume 1 part 1, page 111: - * - * "note that the depth buffer and stencil buffer have an implied - * value of ARYSPC_FULL" - */ - img->walk = (templ->target == PIPE_TEXTURE_3D) ? - ILO_IMAGE_WALK_3D : ILO_IMAGE_WALK_LAYER; + layout->valid_tilings = image_get_gen6_valid_tilings(dev, info); + if (!layout->valid_tilings) + return false; - img->interleaved_samples = true; - } else { - /* - * From the Ivy Bridge PRM, volume 4 part 1, page 66: - * - * "If Multisampled Surface Storage Format is MSFMT_MSS and Number - * of Multisamples is not MULTISAMPLECOUNT_1, this field (Surface - * Array Spacing) must be set to ARYSPC_LOD0." - * - * As multisampled resources are not mipmapped, we never use - * ARYSPC_FULL for them. - */ - if (templ->nr_samples > 1) - assert(templ->last_level == 0); + layout->tiling = image_get_gen6_tiling(dev, info, layout->valid_tilings); - img->walk = - (templ->target == PIPE_TEXTURE_3D) ? ILO_IMAGE_WALK_3D : - (templ->last_level > 0) ? 
ILO_IMAGE_WALK_LAYER : - ILO_IMAGE_WALK_LOD; + if (image_get_gen6_hiz_enable(dev, info)) + layout->aux = ILO_IMAGE_AUX_HIZ; + else if (ilo_dev_gen(dev) >= ILO_GEN(7) && + image_get_gen7_mcs_enable(dev, info, layout->tiling)) + layout->aux = ILO_IMAGE_AUX_MCS; + else + layout->aux = ILO_IMAGE_AUX_NONE; - img->interleaved_samples = false; + if (ilo_dev_gen(dev) >= ILO_GEN(7)) { + image_get_gen7_alignments(dev, info, layout->tiling, + &layout->align_i, &layout->align_j); + } else { + image_get_gen6_alignments(dev, info, + &layout->align_i, &layout->align_j); } + + return true; } -static void -img_init_walk_gen6(struct ilo_image *img, - const struct ilo_image_params *params) +static bool +image_init_gen6_transfer_layout(const struct ilo_dev *dev, + const struct ilo_image_info *info, + struct ilo_image_layout *layout) { - /* - * From the Sandy Bridge PRM, volume 1 part 1, page 115: - * - * "The separate stencil buffer does not support mip mapping, thus the - * storage for LODs other than LOD 0 is not needed. The following - * QPitch equation applies only to the separate stencil buffer: - * - * QPitch = h_0" - * - * GEN6 does not support compact spacing otherwise. - */ - img->walk = - (params->templ->target == PIPE_TEXTURE_3D) ? ILO_IMAGE_WALK_3D : - (img->format == PIPE_FORMAT_S8_UINT) ? ILO_IMAGE_WALK_LOD : - ILO_IMAGE_WALK_LAYER; + ILO_DEV_ASSERT(dev, 6, 8); + + /* we can define our own layout to save space */ + layout->walk = ILO_IMAGE_WALK_LOD; + layout->interleaved_samples = false; + layout->valid_tilings = IMAGE_TILING_NONE; + layout->tiling = GEN6_TILING_NONE; + layout->aux = ILO_IMAGE_AUX_NONE; + layout->align_i = info->block_width; + layout->align_j = info->block_height; - /* GEN6 supports only interleaved samples */ - img->interleaved_samples = true; + return true; } static void -img_init_walk(struct ilo_image *img, - const struct ilo_image_params *params) +image_get_gen6_slice_size(const struct ilo_dev *dev, + const struct ilo_image_info *info, + const struct ilo_image_layout *layout, + uint8_t level, + int *width, int *height) { - if (ilo_dev_gen(params->dev) >= ILO_GEN(7)) - img_init_walk_gen7(img, params); - else - img_init_walk_gen6(img, params); -} + int w, h; -static unsigned -img_get_valid_tilings(const struct ilo_image *img, - const struct ilo_image_params *params) -{ - const struct pipe_resource *templ = params->templ; - const enum pipe_format format = img->format; - unsigned valid_tilings = params->valid_tilings; + ILO_DEV_ASSERT(dev, 6, 8); - /* - * From the Sandy Bridge PRM, volume 1 part 2, page 32: - * - * "Display/Overlay Y-Major not supported. - * X-Major required for Async Flips" - */ - if (unlikely(templ->bind & PIPE_BIND_SCANOUT)) - valid_tilings &= IMAGE_TILING_X; + w = u_minify(info->width, level); + h = u_minify(info->height, level); /* - * From the Sandy Bridge PRM, volume 3 part 2, page 158: + * From the Sandy Bridge PRM, volume 1 part 1, page 114: * - * "The cursor surface address must be 4K byte aligned. The cursor must - * be in linear memory, it cannot be tiled." + * "The dimensions of the mip maps are first determined by applying the + * sizing algorithm presented in Non-Power-of-Two Mipmaps above. Then, + * if necessary, they are padded out to compression block boundaries." 
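 *
 * Editor's worked example, not from the commit: a 13x7 level of a
 * 4x4-block compressed format pads here to align(13, 4) x align(7, 4) =
 * 16x8 texels, before the align_i/align_j rounding applied at the end of
 * this function.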
*/ -   if (unlikely(templ->bind & (PIPE_BIND_CURSOR | PIPE_BIND_LINEAR))) -      valid_tilings &= IMAGE_TILING_NONE; +   w = align(w, info->block_width); +   h = align(h, info->block_height);    /* -    * From the Sandy Bridge PRM, volume 2 part 1, page 318: +    * From the Sandy Bridge PRM, volume 1 part 1, page 111: * -    * "[DevSNB+]: This field (Tiled Surface) must be set to TRUE. Linear -    * Depth Buffer is not supported." +    * "If the surface is multisampled (4x), these values must be adjusted +    * as follows before proceeding: * -    * "The Depth Buffer, if tiled, must use Y-Major tiling." +    * W_L = ceiling(W_L / 2) * 4 +    * H_L = ceiling(H_L / 2) * 4" * -    * From the Sandy Bridge PRM, volume 1 part 2, page 22: +    * From the Ivy Bridge PRM, volume 1 part 1, page 108: * -    * "W-Major Tile Format is used for separate stencil." +    * "If the surface is multisampled and it is a depth or stencil surface +    * or Multisampled Surface StorageFormat in SURFACE_STATE is +    * MSFMT_DEPTH_STENCIL, W_L and H_L must be adjusted as follows before +    * proceeding: + * + * #samples W_L = H_L = + * 2 ceiling(W_L / 2) * 4 HL [no adjustment] + * 4 ceiling(W_L / 2) * 4 ceiling(H_L / 2) * 4 + * 8 ceiling(W_L / 2) * 8 ceiling(H_L / 2) * 4 + * 16 ceiling(W_L / 2) * 8 ceiling(H_L / 2) * 8" + * + * For interleaved samples (4x), where pixels + * + * (x, y ) (x+1, y ) + * (x, y+1) (x+1, y+1) + * + * would be occupied by + * + * (x, y , si0) (x+1, y , si0) (x, y , si1) (x+1, y , si1) + * (x, y+1, si0) (x+1, y+1, si0) (x, y+1, si1) (x+1, y+1, si1) + * (x, y , si2) (x+1, y , si2) (x, y , si3) (x+1, y , si3) + * (x, y+1, si2) (x+1, y+1, si2) (x, y+1, si3) (x+1, y+1, si3) + * + * Thus the need to + * + * w = align(w, 2) * 2; + * h = align(h, 2) * 2; */ - if (templ->bind & PIPE_BIND_DEPTH_STENCIL) { - switch (format) { - case PIPE_FORMAT_S8_UINT: - valid_tilings &= IMAGE_TILING_W; + if (layout->interleaved_samples) { + switch (info->sample_count) { + case 1: + break; + case 2: + w = align(w, 2) * 2; + break; + case 4: + w = align(w, 2) * 2; + h = align(h, 2) * 2; + break; + case 8: + w = align(w, 2) * 4; + h = align(h, 2) * 2; + break; + case 16: + w = align(w, 2) * 4; + h = align(h, 2) * 4; break; default: - valid_tilings &= IMAGE_TILING_Y; + assert(!"unsupported sample count"); break; } } - if (templ->bind & PIPE_BIND_RENDER_TARGET) { - /* - * From the Sandy Bridge PRM, volume 1 part 2, page 32: - * - * "NOTE: 128BPE Format Color buffer ( render target ) MUST be - * either TileX or Linear." - * - * From the Haswell PRM, volume 5, page 32: - * - * "NOTE: 128 BPP format color buffer (render target) supports - * Linear, TiledX and TiledY." - */ - if (ilo_dev_gen(params->dev) < ILO_GEN(7.5) && img->block_size == 16) - valid_tilings &= ~IMAGE_TILING_Y; - - /* - * From the Ivy Bridge PRM, volume 4 part 1, page 63: - * - * "This field (Surface Vertical Aligment) must be set to VALIGN_4 - * for all tiled Y Render Target surfaces." - * - * "VALIGN_4 is not supported for surface format R32G32B32_FLOAT." 
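Editor's worked example for the interleaved-sample mapping quoted earlier in this hunk: at 4x IMS a 13x7 depth miplevel stores w = align(13, 2) * 2 = 28 by h = align(7, 2) * 2 = 16 texels, each axis rounding up to even and then doubling to make room for the four samples.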
- */ - if (ilo_dev_gen(params->dev) >= ILO_GEN(7) && - ilo_dev_gen(params->dev) <= ILO_GEN(7.5) && - img->format == PIPE_FORMAT_R32G32B32_FLOAT) - valid_tilings &= ~IMAGE_TILING_Y; - - valid_tilings &= ~IMAGE_TILING_W; - } - - if (templ->bind & PIPE_BIND_SAMPLER_VIEW) { - if (ilo_dev_gen(params->dev) < ILO_GEN(8)) - valid_tilings &= ~IMAGE_TILING_W; - } - - /* no conflicting binding flags */ - assert(valid_tilings); - - return valid_tilings; -} - -static void -img_init_size_and_format(struct ilo_image *img, - struct ilo_image_params *params) -{ - const struct pipe_resource *templ = params->templ; - enum pipe_format format = templ->format; - bool require_separate_stencil = false; - - img->target = templ->target; - img->width0 = templ->width0; - img->height0 = templ->height0; - img->depth0 = templ->depth0; - img->array_size = templ->array_size; - img->level_count = templ->last_level + 1; - img->sample_count = (templ->nr_samples) ? templ->nr_samples : 1; - /* - * From the Sandy Bridge PRM, volume 2 part 1, page 317: + * From the Ivy Bridge PRM, volume 1 part 1, page 108: * - * "This field (Separate Stencil Buffer Enable) must be set to the same - * value (enabled or disabled) as Hierarchical Depth Buffer Enable." + * "For separate stencil buffer, the width must be mutiplied by 2 and + * height divided by 2..." * - * GEN7+ requires separate stencil buffers. + * To make things easier (for transfer), we will just double the stencil + * stride in 3DSTATE_STENCIL_BUFFER. */ - if (templ->bind & PIPE_BIND_DEPTH_STENCIL) { - if (ilo_dev_gen(params->dev) >= ILO_GEN(7)) - require_separate_stencil = true; - else - require_separate_stencil = (img->aux.type == ILO_IMAGE_AUX_HIZ); - } - - switch (format) { - case PIPE_FORMAT_ETC1_RGB8: - format = PIPE_FORMAT_R8G8B8X8_UNORM; - break; - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - if (require_separate_stencil) { - format = PIPE_FORMAT_Z24X8_UNORM; - img->separate_stencil = true; - } - break; - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - if (require_separate_stencil) { - format = PIPE_FORMAT_Z32_FLOAT; - img->separate_stencil = true; - } - break; - default: - break; - } + w = align(w, layout->align_i); + h = align(h, layout->align_j); - img->format = format; - img->block_width = util_format_get_blockwidth(format); - img->block_height = util_format_get_blockheight(format); - img->block_size = util_format_get_blocksize(format); - - params->valid_tilings = img_get_valid_tilings(img, params); - params->compressed = util_format_is_compressed(img->format); + *width = w; + *height = h; } -static bool -img_want_mcs(const struct ilo_image *img, - const struct ilo_image_params *params) +static int +image_get_gen6_layer_count(const struct ilo_dev *dev, + const struct ilo_image_info *info, + const struct ilo_image_layout *layout) { - const struct pipe_resource *templ = params->templ; - bool want_mcs = false; + int count = info->array_size; - /* MCS is for RT on GEN7+ */ - if (ilo_dev_gen(params->dev) < ILO_GEN(7)) - return false; + ILO_DEV_ASSERT(dev, 6, 8); - if (templ->target != PIPE_TEXTURE_2D || - !(templ->bind & PIPE_BIND_RENDER_TARGET)) - return false; - - /* - * From the Ivy Bridge PRM, volume 4 part 1, page 77: - * - * "For Render Target and Sampling Engine Surfaces:If the surface is - * multisampled (Number of Multisamples any value other than - * MULTISAMPLECOUNT_1), this field (MCS Enable) must be enabled." 
- * - * "This field must be set to 0 for all SINT MSRTs when all RT channels - * are not written" - */ - if (templ->nr_samples > 1 && !util_format_is_pure_sint(templ->format)) { - want_mcs = true; - } else if (templ->nr_samples <= 1) { - /* - * From the Ivy Bridge PRM, volume 2 part 1, page 326: - * - * "When MCS is buffer is used for color clear of non-multisampler - * render target, the following restrictions apply. - * - Support is limited to tiled render targets. - * - Support is for non-mip-mapped and non-array surface types - * only. - * - Clear is supported only on the full RT; i.e., no partial clear - * or overlapping clears. - * - MCS buffer for non-MSRT is supported only for RT formats - * 32bpp, 64bpp and 128bpp. - * ..." - */ - if (img->tiling != GEN6_TILING_NONE && - templ->last_level == 0 && templ->array_size == 1) { - switch (img->block_size) { - case 4: - case 8: - case 16: - want_mcs = true; - break; - default: - break; - } - } - } + /* samples of the same index are stored in a layer */ + if (!layout->interleaved_samples) + count *= info->sample_count; - return want_mcs; + return count; } -static bool -img_want_hiz(const struct ilo_image *img, - const struct ilo_image_params *params) +static void +image_get_gen6_walk_layer_heights(const struct ilo_dev *dev, + const struct ilo_image_info *info, + struct ilo_image_layout *layout) { - const struct pipe_resource *templ = params->templ; - const struct util_format_description *desc = - util_format_description(templ->format); + ILO_DEV_ASSERT(dev, 6, 8); - if (ilo_debug & ILO_DEBUG_NOHIZ) - return false; + layout->walk_layer_h0 = layout->lods[0].slice_height; - /* we want 8x4 aligned levels */ - if (templ->target == PIPE_TEXTURE_1D) - return false; - - if (!(templ->bind & PIPE_BIND_DEPTH_STENCIL)) - return false; - - if (!util_format_has_depth(desc)) - return false; + if (info->level_count > 1) { + layout->walk_layer_h1 = layout->lods[1].slice_height; + } else { + int dummy; + image_get_gen6_slice_size(dev, info, layout, 1, + &dummy, &layout->walk_layer_h1); + } - /* no point in having HiZ */ - if (templ->usage == PIPE_USAGE_STAGING) - return false; + if (image_get_gen6_layer_count(dev, info, layout) == 1) { + layout->walk_layer_height = 0; + return; + } /* - * As can be seen in img_calculate_hiz_size(), HiZ may not be enabled - * for every level. This is generally fine except on GEN6, where HiZ and - * separate stencil are enabled and disabled at the same time. When the - * format is PIPE_FORMAT_Z32_FLOAT_S8X24_UINT, enabling and disabling HiZ - * can result in incompatible formats. + * From the Sandy Bridge PRM, volume 1 part 1, page 115: + * + * "The following equation is used for surface formats other than + * compressed textures: + * + * QPitch = (h0 + h1 + 11j)" + * + * "The equation for compressed textures (BC* and FXT1 surface formats) + * follows: + * + * QPitch = (h0 + h1 + 11j) / 4" + * + * "[DevSNB] Errata: Sampler MSAA Qpitch will be 4 greater than the + * value calculated in the equation above, for every other odd Surface + * Height starting from 1 i.e. 1,5,9,13" + * + * From the Ivy Bridge PRM, volume 1 part 1, page 111-112: + * + * "If Surface Array Spacing is set to ARYSPC_FULL (note that the depth + * buffer and stencil buffer have an implied value of ARYSPC_FULL): + * + * QPitch = (h0 + h1 + 12j) + * QPitch = (h0 + h1 + 12j) / 4 (compressed) + * + * (There are many typos or missing words here...)" + * + * To access the N-th slice, an offset of (Stride * QPitch * N) is added to + * the base address. 
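 *
 * Editor's worked example, values assumed: on Gen7 with h0 = 64, h1 = 32
 * and j = align_j = 4, QPitch = 64 + 32 + 12 * 4 = 144 texel rows, so
 * slice N begins Stride * 144 * N bytes past the base address.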
The PRM divides QPitch by 4 for compressed formats + * because the block height for those formats are 4, and it wants QPitch to + * mean the number of memory rows, as opposed to texel rows, between + * slices. Since we use texel rows everywhere, we do not need to divide + * QPitch by 4. */ - if (ilo_dev_gen(params->dev) == ILO_GEN(6) && - templ->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT && - templ->last_level) - return false; + layout->walk_layer_height = layout->walk_layer_h0 + layout->walk_layer_h1 + + ((ilo_dev_gen(dev) >= ILO_GEN(7)) ? 12 : 11) * layout->align_j; - return true; -} - -static void -img_init_aux(struct ilo_image *img, - const struct ilo_image_params *params) -{ - if (img_want_hiz(img, params)) - img->aux.type = ILO_IMAGE_AUX_HIZ; - else if (img_want_mcs(img, params)) - img->aux.type = ILO_IMAGE_AUX_MCS; + if (ilo_dev_gen(dev) == ILO_GEN(6) && info->sample_count > 1 && + info->height % 4 == 1) + layout->walk_layer_height += 4; } static void -img_align(struct ilo_image *img, struct ilo_image_params *params) +image_get_gen6_monolithic_size(const struct ilo_dev *dev, + const struct ilo_image_info *info, + struct ilo_image_layout *layout, + int max_x, int max_y) { - const struct pipe_resource *templ = params->templ; int align_w = 1, align_h = 1, pad_h = 0; + ILO_DEV_ASSERT(dev, 6, 8); + /* * From the Sandy Bridge PRM, volume 1 part 1, page 118: * @@ -864,15 +807,15 @@ img_align(struct ilo_image *img, struct ilo_image_params *params) * padding purposes. The value of 4 for j still applies for mip level * alignment and QPitch calculation." */ - if (templ->bind & PIPE_BIND_SAMPLER_VIEW) { - align_w = MAX2(align_w, img->align_i); - align_h = MAX2(align_h, img->align_j); + if (info->bind_surface_sampler) { + align_w = MAX2(align_w, layout->align_i); + align_h = MAX2(align_h, layout->align_j); - if (templ->target == PIPE_TEXTURE_CUBE) + if (info->type == GEN6_SURFTYPE_CUBE) pad_h += 2; - if (params->compressed) - align_h = MAX2(align_h, img->align_j * 2); + if (info->compressed) + align_h = MAX2(align_h, layout->align_j * 2); } /* @@ -881,149 +824,288 @@ img_align(struct ilo_image *img, struct ilo_image_params *params) * "If the surface contains an odd number of rows of data, a final row * below the surface must be allocated." */ - if (templ->bind & PIPE_BIND_RENDER_TARGET) + if (info->bind_surface_dp_render) align_h = MAX2(align_h, 2); /* * Depth Buffer Clear/Resolve works in 8x4 sample blocks. Pad to allow HiZ * for unaligned non-mipmapped and non-array images. 
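 *
 * Editor's worked example, values assumed: a non-mipmapped, non-array
 * 13x7 depth image is padded to monolithic_width = align(13, 8) = 16 and
 * monolithic_height = align(7, 4) = 8, so clears and resolves can cover
 * it with whole 8x4 blocks and HiZ can stay enabled for level 0.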
*/ - if (img->aux.type == ILO_IMAGE_AUX_HIZ && - templ->last_level == 0 && - templ->array_size == 1 && - templ->depth0 == 1) { + if (layout->aux == ILO_IMAGE_AUX_HIZ && + info->level_count == 1 && info->array_size == 1 && info->depth == 1) { align_w = MAX2(align_w, 8); align_h = MAX2(align_h, 4); } - params->max_x = align(params->max_x, align_w); - params->max_y = align(params->max_y + pad_h, align_h); + layout->monolithic_width = align(max_x, align_w); + layout->monolithic_height = align(max_y + pad_h, align_h); } -/* note that this may force the texture to be linear */ static void -img_calculate_bo_size(struct ilo_image *img, - const struct ilo_image_params *params) +image_get_gen6_lods(const struct ilo_dev *dev, + const struct ilo_image_info *info, + struct ilo_image_layout *layout) { - assert(params->max_x % img->block_width == 0); - assert(params->max_y % img->block_height == 0); - assert(img->walk_layer_height % img->block_height == 0); + const int layer_count = image_get_gen6_layer_count(dev, info, layout); + int cur_x, cur_y, max_x, max_y; + uint8_t lv; - img->bo_stride = - (params->max_x / img->block_width) * img->block_size; - img->bo_height = params->max_y / img->block_height; + ILO_DEV_ASSERT(dev, 6, 8); - while (true) { - unsigned w = img->bo_stride, h = img->bo_height; - unsigned align_w, align_h; + cur_x = 0; + cur_y = 0; + max_x = 0; + max_y = 0; + for (lv = 0; lv < info->level_count; lv++) { + int slice_w, slice_h, lod_w, lod_h; - /* - * From the Haswell PRM, volume 5, page 163: - * - * "For linear surfaces, additional padding of 64 bytes is required - * at the bottom of the surface. This is in addition to the padding - * required above." - */ - if (ilo_dev_gen(params->dev) >= ILO_GEN(7.5) && - (params->templ->bind & PIPE_BIND_SAMPLER_VIEW) && - img->tiling == GEN6_TILING_NONE) - h += (64 + img->bo_stride - 1) / img->bo_stride; + image_get_gen6_slice_size(dev, info, layout, lv, &slice_w, &slice_h); - /* - * From the Sandy Bridge PRM, volume 4 part 1, page 81: - * - * "- For linear render target surfaces, the pitch must be a - * multiple of the element size for non-YUV surface formats. - * Pitch must be a multiple of 2 * element size for YUV surface - * formats. - * - For other linear surfaces, the pitch can be any multiple of - * bytes. - * - For tiled surfaces, the pitch must be a multiple of the tile - * width." - * - * Different requirements may exist when the bo is used in different - * places, but our alignments here should be good enough that we do not - * need to check params->templ->bind. - */ - switch (img->tiling) { - case GEN6_TILING_X: - align_w = 512; - align_h = 8; + layout->lods[lv].x = cur_x; + layout->lods[lv].y = cur_y; + layout->lods[lv].slice_width = slice_w; + layout->lods[lv].slice_height = slice_h; + + switch (layout->walk) { + case ILO_IMAGE_WALK_LAYER: + lod_w = slice_w; + lod_h = slice_h; + + /* MIPLAYOUT_BELOW */ + if (lv == 1) + cur_x += lod_w; + else + cur_y += lod_h; break; - case GEN6_TILING_Y: - align_w = 128; - align_h = 32; + case ILO_IMAGE_WALK_LOD: + lod_w = slice_w; + lod_h = slice_h * layer_count; + + if (lv == 1) + cur_x += lod_w; + else + cur_y += lod_h; + + /* every LOD begins at tile boundaries */ + if (info->level_count > 1) { + assert(info->format == GEN6_FORMAT_R8_UINT); + cur_x = align(cur_x, 64); + cur_y = align(cur_y, 64); + } break; - case GEN8_TILING_W: - /* - * From the Sandy Bridge PRM, volume 1 part 2, page 22: - * - * "A 4KB tile is subdivided into 8-high by 8-wide array of - * Blocks for W-Major Tiles (W Tiles). 
Each Block is 8 rows by 8 - * bytes." - */ - align_w = 64; - align_h = 64; + case ILO_IMAGE_WALK_3D: + { + const int slice_count = u_minify(info->depth, lv); + const int slice_count_per_row = 1 << lv; + const int row_count = + (slice_count + slice_count_per_row - 1) / slice_count_per_row; + + lod_w = slice_w * slice_count_per_row; + lod_h = slice_h * row_count; + } + + cur_y += lod_h; break; default: - assert(img->tiling == GEN6_TILING_NONE); - /* some good enough values */ - align_w = 64; - align_h = 2; + assert(!"unknown walk type"); + lod_w = 0; + lod_h = 0; break; } - w = align(w, align_w); - h = align(h, align_h); - - /* make sure the bo is mappable */ - if (img->tiling != GEN6_TILING_NONE) { - /* - * Usually only the first 256MB of the GTT is mappable. - * - * See also how intel_context::max_gtt_map_object_size is calculated. - */ - const size_t mappable_gtt_size = 256 * 1024 * 1024; - - /* - * Be conservative. We may be able to switch from VALIGN_4 to - * VALIGN_2 if the image was Y-tiled, but let's keep it simple. - */ - if (mappable_gtt_size / w / 4 < h) { - if (params->valid_tilings & IMAGE_TILING_NONE) { - img->tiling = GEN6_TILING_NONE; - /* MCS support for non-MSRTs is limited to tiled RTs */ - if (img->aux.type == ILO_IMAGE_AUX_MCS && - params->templ->nr_samples <= 1) - img->aux.type = ILO_IMAGE_AUX_NONE; - - continue; - } else { - ilo_warn("cannot force texture to be linear\n"); - } - } - } + if (max_x < layout->lods[lv].x + lod_w) + max_x = layout->lods[lv].x + lod_w; + if (max_y < layout->lods[lv].y + lod_h) + max_y = layout->lods[lv].y + lod_h; + } + + if (layout->walk == ILO_IMAGE_WALK_LAYER) { + image_get_gen6_walk_layer_heights(dev, info, layout); + if (layer_count > 1) + max_y += layout->walk_layer_height * (layer_count - 1); + } else { + layout->walk_layer_h0 = 0; + layout->walk_layer_h1 = 0; + layout->walk_layer_height = 0; + } + + image_get_gen6_monolithic_size(dev, info, layout, max_x, max_y); +} + +static bool +image_bind_gpu(const struct ilo_image_info *info) +{ + return (info->bind_surface_sampler || + info->bind_surface_dp_render || + info->bind_surface_dp_typed || + info->bind_zs || + info->bind_scanout || + info->bind_cursor); +} + +static bool +image_validate_gen6(const struct ilo_dev *dev, + const struct ilo_image_info *info) +{ + ILO_DEV_ASSERT(dev, 6, 8); + + /* + * From the Ivy Bridge PRM, volume 2 part 1, page 314: + * + * "The separate stencil buffer is always enabled, thus the field in + * 3DSTATE_DEPTH_BUFFER to explicitly enable the separate stencil + * buffer has been removed Surface formats with interleaved depth and + * stencil are no longer supported" + */ + if (ilo_dev_gen(dev) >= ILO_GEN(7) && info->bind_zs) + assert(!info->interleaved_stencil); + + return true; +} + +static bool +image_get_gen6_layout(const struct ilo_dev *dev, + const struct ilo_image_info *info, + struct ilo_image_layout *layout) +{ + ILO_DEV_ASSERT(dev, 6, 8); + + if (!image_validate_gen6(dev, info)) + return false; + + if (image_bind_gpu(info) || info->level_count > 1) { + if (!image_init_gen6_hardware_layout(dev, info, layout)) + return false; + } else { + if (!image_init_gen6_transfer_layout(dev, info, layout)) + return false; + } + + /* + * the fact that align i and j are multiples of block width and height + * respectively is what makes the size of the bo a multiple of the block + * size, slices start at block boundaries, and many of the computations + * work. 
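 *
 * Editor's note: the power-of-two asserts below exist because align()
 * from util/u_math.h rounds up by masking, align(x, a) =
 * (x + a - 1) & ~(a - 1), e.g. align(13, 4) = 16, and that identity only
 * holds when a is a power of two.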
+ */ + assert(layout->align_i % info->block_width == 0); + assert(layout->align_j % info->block_height == 0); + + /* make sure align() works */ + assert(util_is_power_of_two(layout->align_i) && + util_is_power_of_two(layout->align_j)); + assert(util_is_power_of_two(info->block_width) && + util_is_power_of_two(info->block_height)); + + image_get_gen6_lods(dev, info, layout); + + assert(layout->walk_layer_height % info->block_height == 0); + assert(layout->monolithic_width % info->block_width == 0); + assert(layout->monolithic_height % info->block_height == 0); + + return true; +} + +static bool +image_set_gen6_bo_size(struct ilo_image *img, + const struct ilo_dev *dev, + const struct ilo_image_info *info, + const struct ilo_image_layout *layout) +{ + int stride, height; + int align_w, align_h; + + ILO_DEV_ASSERT(dev, 6, 8); + + stride = (layout->monolithic_width / info->block_width) * info->block_size; + height = layout->monolithic_height / info->block_height; + + /* + * From the Haswell PRM, volume 5, page 163: + * + * "For linear surfaces, additional padding of 64 bytes is required + * at the bottom of the surface. This is in addition to the padding + * required above." + */ + if (ilo_dev_gen(dev) >= ILO_GEN(7.5) && info->bind_surface_sampler && + layout->tiling == GEN6_TILING_NONE) + height += (64 + stride - 1) / stride; - img->bo_stride = w; - img->bo_height = h; + /* + * From the Sandy Bridge PRM, volume 4 part 1, page 81: + * + * "- For linear render target surfaces, the pitch must be a multiple + * of the element size for non-YUV surface formats. Pitch must be a + * multiple of 2 * element size for YUV surface formats. + * + * - For other linear surfaces, the pitch can be any multiple of + * bytes. + * - For tiled surfaces, the pitch must be a multiple of the tile + * width." + * + * Different requirements may exist when the image is used in different + * places, but our alignments here should be good enough that we do not + * need to check info->bind_x. + */ + switch (layout->tiling) { + case GEN6_TILING_X: + align_w = 512; + align_h = 8; + break; + case GEN6_TILING_Y: + align_w = 128; + align_h = 32; + break; + case GEN8_TILING_W: + /* + * From the Sandy Bridge PRM, volume 1 part 2, page 22: + * + * "A 4KB tile is subdivided into 8-high by 8-wide array of + * Blocks for W-Major Tiles (W Tiles). Each Block is 8 rows by 8 + * bytes." 
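 *
 * Editor's note: that is 8 blocks * 8 bytes = 64 bytes across and
 * 8 blocks * 8 rows = 64 rows down, 64 * 64 = 4096 bytes, exactly one
 * 4 KiB page per W tile, which is where the align_w = align_h = 64
 * below comes from.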
+ */ + align_w = 64; + align_h = 64; + break; + default: + assert(layout->tiling == GEN6_TILING_NONE); + /* some good enough values */ + align_w = 64; + align_h = 2; break; } + + if (info->force_bo_stride) { + if (info->force_bo_stride % align_w || info->force_bo_stride < stride) + return false; + + img->bo_stride = info->force_bo_stride; + } else { + img->bo_stride = align(stride, align_w); + } + + img->bo_height = align(height, align_h); + + return true; } -static void -img_calculate_hiz_size(struct ilo_image *img, - const struct ilo_image_params *params) +static bool +image_set_gen6_hiz(struct ilo_image *img, + const struct ilo_dev *dev, + const struct ilo_image_info *info, + const struct ilo_image_layout *layout) { - const struct pipe_resource *templ = params->templ; - const unsigned hz_align_j = 8; + const int hz_align_j = 8; enum ilo_image_walk_type hz_walk; - unsigned hz_width, hz_height, lv; - unsigned hz_clear_w, hz_clear_h; + int hz_width, hz_height; + int hz_clear_w, hz_clear_h; + uint8_t lv; + + ILO_DEV_ASSERT(dev, 6, 8); - assert(img->aux.type == ILO_IMAGE_AUX_HIZ); + assert(layout->aux == ILO_IMAGE_AUX_HIZ); - assert(img->walk == ILO_IMAGE_WALK_LAYER || - img->walk == ILO_IMAGE_WALK_3D); + assert(layout->walk == ILO_IMAGE_WALK_LAYER || + layout->walk == ILO_IMAGE_WALK_3D); /* * From the Sandy Bridge PRM, volume 2 part 1, page 312: @@ -1036,8 +1118,8 @@ img_calculate_hiz_size(struct ilo_image *img, * * We will put all LODs in a single bo with ILO_IMAGE_WALK_LOD. */ - if (ilo_dev_gen(params->dev) >= ILO_GEN(7)) - hz_walk = img->walk; + if (ilo_dev_gen(dev) >= ILO_GEN(7)) + hz_walk = layout->walk; else hz_walk = ILO_IMAGE_WALK_LOD; @@ -1051,16 +1133,16 @@ img_calculate_hiz_size(struct ilo_image *img, switch (hz_walk) { case ILO_IMAGE_WALK_LAYER: { - const unsigned h0 = align(params->h0, hz_align_j); - const unsigned h1 = align(params->h1, hz_align_j); - const unsigned htail = - ((ilo_dev_gen(params->dev) >= ILO_GEN(7)) ? 12 : 11) * hz_align_j; - const unsigned hz_qpitch = h0 + h1 + htail; + const int h0 = align(layout->walk_layer_h0, hz_align_j); + const int h1 = align(layout->walk_layer_h1, hz_align_j); + const int htail = + ((ilo_dev_gen(dev) >= ILO_GEN(7)) ? 
12 : 11) * hz_align_j; + const int hz_qpitch = h0 + h1 + htail; - hz_width = align(img->lods[0].slice_width, 16); + hz_width = align(layout->lods[0].slice_width, 16); - hz_height = hz_qpitch * templ->array_size / 2; - if (ilo_dev_gen(params->dev) >= ILO_GEN(7)) + hz_height = hz_qpitch * info->array_size / 2; + if (ilo_dev_gen(dev) >= ILO_GEN(7)) hz_height = align(hz_height, 8); img->aux.walk_layer_height = hz_qpitch; @@ -1068,27 +1150,27 @@ img_calculate_hiz_size(struct ilo_image *img, break; case ILO_IMAGE_WALK_LOD: { - unsigned lod_tx[PIPE_MAX_TEXTURE_LEVELS]; - unsigned lod_ty[PIPE_MAX_TEXTURE_LEVELS]; - unsigned cur_tx, cur_ty; + int lod_tx[ILO_IMAGE_MAX_LEVEL_COUNT]; + int lod_ty[ILO_IMAGE_MAX_LEVEL_COUNT]; + int cur_tx, cur_ty; /* figure out the tile offsets of LODs */ hz_width = 0; hz_height = 0; cur_tx = 0; cur_ty = 0; - for (lv = 0; lv <= templ->last_level; lv++) { - unsigned tw, th; + for (lv = 0; lv < info->level_count; lv++) { + int tw, th; lod_tx[lv] = cur_tx; lod_ty[lv] = cur_ty; - tw = align(img->lods[lv].slice_width, 16); - th = align(img->lods[lv].slice_height, hz_align_j) * - templ->array_size / 2; + tw = align(layout->lods[lv].slice_width, 16); + th = align(layout->lods[lv].slice_height, hz_align_j) * + info->array_size / 2; /* convert to Y-tiles */ - tw = align(tw, 128) / 128; - th = align(th, 32) / 32; + tw = (tw + 127) / 128; + th = (th + 31) / 32; if (hz_width < cur_tx + tw) hz_width = cur_tx + tw; @@ -1102,22 +1184,23 @@ img_calculate_hiz_size(struct ilo_image *img, } /* convert tile offsets to memory offsets */ - for (lv = 0; lv <= templ->last_level; lv++) { + for (lv = 0; lv < info->level_count; lv++) { img->aux.walk_lod_offsets[lv] = (lod_ty[lv] * hz_width + lod_tx[lv]) * 4096; } + hz_width *= 128; hz_height *= 32; } break; case ILO_IMAGE_WALK_3D: - hz_width = align(img->lods[0].slice_width, 16); + hz_width = align(layout->lods[0].slice_width, 16); hz_height = 0; - for (lv = 0; lv <= templ->last_level; lv++) { - const unsigned h = align(img->lods[lv].slice_height, hz_align_j); + for (lv = 0; lv < info->level_count; lv++) { + const int h = align(layout->lods[lv].slice_height, hz_align_j); /* according to the formula, slices are packed together vertically */ - hz_height += h * u_minify(templ->depth0, lv); + hz_height += h * u_minify(info->depth, lv); } hz_height /= 2; break; @@ -1136,8 +1219,7 @@ img_calculate_hiz_size(struct ilo_image *img, */ hz_clear_w = 8; hz_clear_h = 4; - switch (templ->nr_samples) { - case 0: + switch (info->sample_count) { case 1: default: break; @@ -1158,33 +1240,38 @@ img_calculate_hiz_size(struct ilo_image *img, break; } - for (lv = 0; lv <= templ->last_level; lv++) { - if (u_minify(img->width0, lv) % hz_clear_w || - u_minify(img->height0, lv) % hz_clear_h) + for (lv = 0; lv < info->level_count; lv++) { + if (u_minify(info->width, lv) % hz_clear_w || + u_minify(info->height, lv) % hz_clear_h) break; img->aux.enables |= 1 << lv; } - /* we padded to allow this in img_align() */ - if (templ->last_level == 0 && templ->array_size == 1 && templ->depth0 == 1) + /* we padded to allow this in image_get_gen6_monolithic_size() */ + if (info->level_count == 1 && info->array_size == 1 && info->depth == 1) img->aux.enables |= 0x1; /* align to Y-tile */ img->aux.bo_stride = align(hz_width, 128); img->aux.bo_height = align(hz_height, 32); + + return true; } -static void -img_calculate_mcs_size(struct ilo_image *img, - const struct ilo_image_params *params) +static bool +image_set_gen7_mcs(struct ilo_image *img, + const struct ilo_dev *dev, + const 
struct ilo_image_info *info, + const struct ilo_image_layout *layout) { - const struct pipe_resource *templ = params->templ; int mcs_width, mcs_height, mcs_cpp; int downscale_x, downscale_y; - assert(img->aux.type == ILO_IMAGE_AUX_MCS); + ILO_DEV_ASSERT(dev, 7, 8); + + assert(layout->aux == ILO_IMAGE_AUX_MCS); - if (templ->nr_samples > 1) { + if (info->sample_count > 1) { /* * From the Ivy Bridge PRM, volume 2 part 1, page 326, the clear * rectangle is scaled down by 8x2 for 4X MSAA and 2x2 for 8X MSAA. The @@ -1198,7 +1285,7 @@ img_calculate_mcs_size(struct ilo_image *img, * RT. Similarly, we could reason that an OWord in 4X MCS maps to a 8x2 * pixel block in the RT. */ - switch (templ->nr_samples) { + switch (info->sample_count) { case 2: case 4: downscale_x = 8; @@ -1217,7 +1304,7 @@ img_calculate_mcs_size(struct ilo_image *img, break; default: assert(!"unsupported sample count"); - return; + return false; break; } @@ -1226,8 +1313,8 @@ img_calculate_mcs_size(struct ilo_image *img, * clear rectangle cannot be masked. The scale-down clear rectangle * thus must be aligned to 2x2, and we need to pad. */ - mcs_width = align(img->width0, downscale_x * 2); - mcs_height = align(img->height0, downscale_y * 2); + mcs_width = align(info->width, downscale_x * 2); + mcs_height = align(info->height, downscale_y * 2); } else { /* * From the Ivy Bridge PRM, volume 2 part 1, page 327: @@ -1262,18 +1349,18 @@ img_calculate_mcs_size(struct ilo_image *img, * anything except for the size of the allocated MCS. Let's see if we * hit out-of-bound access. */ - switch (img->tiling) { + switch (layout->tiling) { case GEN6_TILING_X: - downscale_x = 64 / img->block_size; + downscale_x = 64 / info->block_size; downscale_y = 2; break; case GEN6_TILING_Y: - downscale_x = 32 / img->block_size; + downscale_x = 32 / info->block_size; downscale_y = 4; break; default: assert(!"unsupported tiling mode"); - return; + return false; break; } @@ -1290,181 +1377,75 @@ img_calculate_mcs_size(struct ilo_image *img, * The scaled-down clear rectangle must be aligned to 4x4 instead of * 2x2, and we need to pad. */ - mcs_width = align(img->width0, downscale_x * 4) / downscale_x; - mcs_height = align(img->height0, downscale_y * 4) / downscale_y; + mcs_width = align(info->width, downscale_x * 4) / downscale_x; + mcs_height = align(info->height, downscale_y * 4) / downscale_y; mcs_cpp = 16; /* an OWord */ } - img->aux.enables = (1 << (templ->last_level + 1)) - 1; + img->aux.enables = (1 << info->level_count) - 1; /* align to Y-tile */ img->aux.bo_stride = align(mcs_width * mcs_cpp, 128); img->aux.bo_height = align(mcs_height, 32); -} - -static void -img_init(struct ilo_image *img, - struct ilo_image_params *params) -{ - /* there are hard dependencies between every function here */ - - img_init_aux(img, params); - img_init_size_and_format(img, params); - img_init_walk(img, params); - img_init_tiling(img, params); - img_init_alignments(img, params); - img_init_lods(img, params); - img_init_layer_height(img, params); - - img_align(img, params); - img_calculate_bo_size(img, params); - img->scanout = (params->templ->bind & PIPE_BIND_SCANOUT); - - switch (img->aux.type) { - case ILO_IMAGE_AUX_HIZ: - img_calculate_hiz_size(img, params); - break; - case ILO_IMAGE_AUX_MCS: - img_calculate_mcs_size(img, params); - break; - default: - break; - } -}
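To make the single-sampled MCS sizing above concrete, here is a standalone restatement of the arithmetic for a hypothetical 1920x1080 X-tiled render target with 4-byte pixels (illustrative values only; the driver code above is authoritative):

#include <stdio.h>

static unsigned align_to(unsigned v, unsigned a) /* a must be a power of two */
{
   return (v + a - 1) & ~(a - 1);
}

int main(void)
{
   const unsigned width = 1920, height = 1080, block_size = 4;

   /* GEN6_TILING_X constants from the switch above */
   const unsigned downscale_x = 64 / block_size; /* 16 */
   const unsigned downscale_y = 2;
   const unsigned mcs_cpp = 16; /* an OWord */

   const unsigned mcs_width =
      align_to(width, downscale_x * 4) / downscale_x;  /* 120 */
   const unsigned mcs_height =
      align_to(height, downscale_y * 4) / downscale_y; /* 540 */

   /* align to Y-tile, as the driver does */
   printf("bo_stride %u, bo_height %u\n",
          align_to(mcs_width * mcs_cpp, 128), /* 1920 */
          align_to(mcs_height, 32));          /* 544 */
   return 0;
}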
- -/** - * The texture is for transfer only. We can define our own layout to save - * space. - */ -static void -img_init_for_transfer(struct ilo_image *img, - const struct ilo_dev *dev, - const struct pipe_resource *templ) -{ - const unsigned num_layers = (templ->target == PIPE_TEXTURE_3D) ? - templ->depth0 : templ->array_size; - unsigned layer_width, layer_height; - - assert(templ->last_level == 0); - assert(templ->nr_samples <= 1); - - img->aux.type = ILO_IMAGE_AUX_NONE; - - img->target = templ->target; - img->width0 = templ->width0; - img->height0 = templ->height0; - img->depth0 = templ->depth0; - img->array_size = templ->array_size; - img->level_count = 1; - img->sample_count = 1; - - img->format = templ->format; - img->block_width = util_format_get_blockwidth(templ->format); - img->block_height = util_format_get_blockheight(templ->format); - img->block_size = util_format_get_blocksize(templ->format); - - img->walk = ILO_IMAGE_WALK_LOD; - - img->tiling = GEN6_TILING_NONE; - - img->align_i = img->block_width; - img->align_j = img->block_height; - - assert(util_is_power_of_two(img->block_width) && - util_is_power_of_two(img->block_height)); - - /* use packed layout */ - layer_width = align(templ->width0, img->align_i); - layer_height = align(templ->height0, img->align_j); - - img->lods[0].slice_width = layer_width; - img->lods[0].slice_height = layer_height; - - img->bo_stride = (layer_width / img->block_width) * img->block_size; - img->bo_stride = align(img->bo_stride, 64); - - img->bo_height = (layer_height / img->block_height) * num_layers; + return true; } -/** - * Initialize the image. Callers should zero-initialize \p img first. - */ -void ilo_image_init(struct ilo_image *img, - const struct ilo_dev *dev, - const struct pipe_resource *templ) +bool +ilo_image_init(struct ilo_image *img, + const struct ilo_dev *dev, + const struct ilo_image_info *info) { - struct ilo_image_params params; - bool transfer_only; + struct ilo_image_layout layout; assert(ilo_is_zeroed(img, sizeof(*img))); - /* use transfer layout when the texture is never bound to GPU */ - transfer_only = !(templ->bind & ~(PIPE_BIND_TRANSFER_WRITE | - PIPE_BIND_TRANSFER_READ)); - if (transfer_only && templ->last_level == 0 && templ->nr_samples <= 1) { - img_init_for_transfer(img, dev, templ); - return; - } + memset(&layout, 0, sizeof(layout)); + layout.lods = img->lods; - memset(&params, 0, sizeof(params)); - params.dev = dev; - params.templ = templ; - params.valid_tilings = IMAGE_TILING_ALL; + if (!image_get_gen6_layout(dev, info, &layout)) + return false; - img_init(img, &params); -} + img->type = info->type; -bool -ilo_image_init_for_imported(struct ilo_image *img, - const struct ilo_dev *dev, - const struct pipe_resource *templ, - enum gen_surface_tiling tiling, - unsigned bo_stride) -{ - struct ilo_image_params params; + img->format = info->format; + img->block_width = info->block_width; + img->block_height = info->block_height; + img->block_size = info->block_size; - assert(ilo_is_zeroed(img, sizeof(*img))); + img->width0 = info->width; + img->height0 = info->height; + img->depth0 = info->depth; + img->array_size = info->array_size; + img->level_count = info->level_count; + img->sample_count = info->sample_count; - if ((tiling == GEN6_TILING_X && bo_stride % 512) || - (tiling == GEN6_TILING_Y && bo_stride % 128) || - (tiling == GEN8_TILING_W && bo_stride % 64)) - return false; + img->walk = layout.walk; + img->interleaved_samples = layout.interleaved_samples; - memset(&params, 0, sizeof(params)); - params.dev = dev; - params.templ = templ; - params.valid_tilings = 1 << tiling;
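With the rewrite above, ilo_image_init() consumes a caller-filled ilo_image_info and reports failure instead of falling back silently. A hedged sketch of the expected calling convention; the field values and the GEN6_FORMAT_B8G8R8A8_UNORM choice are illustrative, not taken from this patch:

struct ilo_image img;
struct ilo_image_info info;

memset(&img, 0, sizeof(img));  /* ilo_image_init() asserts it is zeroed */
memset(&info, 0, sizeof(info));

info.type = GEN6_SURFTYPE_2D;
info.format = GEN6_FORMAT_B8G8R8A8_UNORM; /* illustrative */
info.block_width = 1;
info.block_height = 1;
info.block_size = 4;
info.width = 512;
info.height = 512;
info.depth = 1;
info.array_size = 1;
info.level_count = 1;
info.sample_count = 1;
info.valid_tilings = 0xff;        /* let the layout code pick */
info.bind_surface_sampler = true;

if (!ilo_image_init(&img, dev, &info))
   return false; /* unsupported combination */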
- img_init(img, &params); + img->aux.type = layout.aux; - assert(img->tiling == tiling); - if (img->bo_stride > bo_stride) - return false; - - img->bo_stride = bo_stride; - - /* assume imported RTs are also scanouts */ - if (!img->scanout) - img->scanout = (templ->bind & PIPE_BIND_RENDER_TARGET); + img->align_i = layout.align_i; + img->align_j = layout.align_j; - return true; -} + img->walk_layer_height = layout.walk_layer_height; -bool -ilo_image_disable_aux(struct ilo_image *img, const struct ilo_dev *dev) -{ - /* HiZ is required for separate stencil on Gen6 */ - if (ilo_dev_gen(dev) == ILO_GEN(6) && - img->aux.type == ILO_IMAGE_AUX_HIZ && - img->separate_stencil) + if (!image_set_gen6_bo_size(img, dev, info, &layout)) return false; - /* MCS is required for multisample images */ - if (img->aux.type == ILO_IMAGE_AUX_MCS && - img->sample_count > 1) - return false; + img->scanout = info->bind_scanout; - img->aux.enables = 0x0; + switch (layout.aux) { + case ILO_IMAGE_AUX_HIZ: + image_set_gen6_hiz(img, dev, info, &layout); + break; + case ILO_IMAGE_AUX_MCS: + image_set_gen7_mcs(img, dev, info, &layout); + break; + default: + break; + } return true; } diff --git a/src/gallium/drivers/ilo/core/ilo_image.h b/src/gallium/drivers/ilo/core/ilo_image.h index af15e856028..646ed6f5727 100644 --- a/src/gallium/drivers/ilo/core/ilo_image.h +++ b/src/gallium/drivers/ilo/core/ilo_image.h @@ -29,11 +29,17 @@ #define ILO_IMAGE_H #include "genhw/genhw.h" -#include "intel_winsys.h" #include "ilo_core.h" #include "ilo_dev.h" +/* + * From the Ivy Bridge PRM, volume 4 part 1, page 75: + * + * "(MIP Count / LOD) representing [1,15] MIP levels" + */ +#define ILO_IMAGE_MAX_LEVEL_COUNT 15 + enum ilo_image_aux_type { ILO_IMAGE_AUX_NONE, ILO_IMAGE_AUX_HIZ, @@ -68,6 +74,49 @@ enum ilo_image_walk_type { ILO_IMAGE_WALK_3D, }; +struct ilo_image_info { + enum gen_surface_type type; + + enum gen_surface_format format; + bool interleaved_stencil; + bool is_integer; + /* width, height and size of pixel blocks */ + bool compressed; + unsigned block_width; + unsigned block_height; + unsigned block_size; + + /* image size */ + uint16_t width; + uint16_t height; + uint16_t depth; + uint16_t array_size; + uint8_t level_count; + uint8_t sample_count; + + /* disable optional aux */ + bool aux_disable; + + /* tilings to consider, if any bit is set */ + uint8_t valid_tilings; + + /* + * prefer GEN6_TILING_NONE when the (estimated) image size exceeds the + * threshold + */ + uint32_t prefer_linear_threshold; + + /* force a stride when non-zero */ + uint32_t force_bo_stride; + + bool bind_surface_sampler; + bool bind_surface_dp_render; + bool bind_surface_dp_typed; + bool bind_zs; + bool bind_scanout; + bool bind_cursor; +}; + /* * When the walk type is ILO_IMAGE_WALK_LAYER, there is only a slice in each * LOD and this is used to describe LODs in the first array layer. Otherwise, @@ -88,7 +137,10 @@ struct ilo_image_lod { * Texture layout. 
*/ struct ilo_image { - enum pipe_texture_target target; + enum gen_surface_type type; + + enum gen_surface_format format; + bool interleaved_stencil; /* size, format, etc for programming hardware states */ unsigned width0; @@ -97,8 +149,6 @@ struct ilo_image { unsigned array_size; unsigned level_count; unsigned sample_count; - enum pipe_format format; - bool separate_stencil; /* * width, height, and size of pixel blocks for conversion between pixel @@ -117,7 +167,7 @@ struct ilo_image { unsigned align_i; unsigned align_j; - struct ilo_image_lod lods[PIPE_MAX_TEXTURE_LEVELS]; + struct ilo_image_lod lods[ILO_IMAGE_MAX_LEVEL_COUNT]; /* physical layer height for ILO_IMAGE_WALK_LAYER */ unsigned walk_layer_height; @@ -136,36 +186,18 @@ struct ilo_image { unsigned enables; /* LOD offsets for ILO_IMAGE_WALK_LOD */ - unsigned walk_lod_offsets[PIPE_MAX_TEXTURE_LEVELS]; + unsigned walk_lod_offsets[ILO_IMAGE_MAX_LEVEL_COUNT]; unsigned walk_layer_height; unsigned bo_stride; unsigned bo_height; - - /* managed by users */ - struct intel_bo *bo; } aux; - - /* managed by users */ - struct intel_bo *bo; }; -struct pipe_resource; - -void +bool ilo_image_init(struct ilo_image *img, const struct ilo_dev *dev, - const struct pipe_resource *templ); - -bool -ilo_image_init_for_imported(struct ilo_image *img, - const struct ilo_dev *dev, - const struct pipe_resource *templ, - enum gen_surface_tiling tiling, - unsigned bo_stride); - -bool -ilo_image_disable_aux(struct ilo_image *img, const struct ilo_dev *dev); + const struct ilo_image_info *info); static inline bool ilo_image_can_enable_aux(const struct ilo_image *img, unsigned level) diff --git a/src/gallium/drivers/ilo/core/ilo_state_sol.c b/src/gallium/drivers/ilo/core/ilo_state_sol.c index 38c0b719ab3..6ef2c91a592 100644 --- a/src/gallium/drivers/ilo/core/ilo_state_sol.c +++ b/src/gallium/drivers/ilo/core/ilo_state_sol.c @@ -26,7 +26,7 @@ */ #include "ilo_debug.h" -#include "ilo_buffer.h" +#include "ilo_vma.h" #include "ilo_state_sol.h" static bool @@ -270,9 +270,6 @@ sol_buffer_validate_gen7(const struct ilo_dev *dev, { ILO_DEV_ASSERT(dev, 7, 8); - if (info->buf) - assert(info->offset < info->buf->bo_size && info->size); - /* * From the Ivy Bridge PRM, volume 2 part 1, page 208: * @@ -281,9 +278,17 @@ sol_buffer_validate_gen7(const struct ilo_dev *dev, */ assert(info->offset % 4 == 0); + if (info->vma) { + assert(info->vma->vm_alignment % 4 == 0); + assert(info->size && info->offset + info->size <= info->vma->vm_size); + } + /* Gen8+ only */ - if (info->write_offset_load || info->write_offset_save) - assert(ilo_dev_gen(dev) >= ILO_GEN(8)); + if (info->write_offset_load || info->write_offset_save) { + assert(ilo_dev_gen(dev) >= ILO_GEN(8) && info->write_offset_vma); + assert(info->write_offset_offset + sizeof(uint32_t) <= + info->write_offset_vma->vm_size); + } /* * From the Broadwell PRM, volume 2b, page 206: @@ -304,25 +309,15 @@ static uint32_t sol_buffer_get_gen6_size(const struct ilo_dev *dev, const struct ilo_state_sol_buffer_info *info) { - uint32_t size; - ILO_DEV_ASSERT(dev, 6, 8); - if (!info->buf) - return 0; - - size = (info->offset + info->size <= info->buf->bo_size) ? info->size : - info->buf->bo_size - info->offset; - /* * From the Ivy Bridge PRM, volume 2 part 1, page 208: * * "(Surface End Address) This field specifies the ending DWord * address..." */ - size &= ~3; - - return size; + return (info->vma) ? 
info->size & ~3 : 0; } static bool @@ -359,7 +354,7 @@ sol_buffer_set_gen8_3dstate_so_buffer(struct ilo_state_sol_buffer *sb, dw1 = 0; - if (info->buf) + if (info->vma) dw1 |= GEN8_SO_BUF_DW1_ENABLE; if (info->write_offset_load) dw1 |= GEN8_SO_BUF_DW1_OFFSET_WRITE_ENABLE; @@ -429,6 +424,15 @@ ilo_state_sol_init_disabled(struct ilo_state_sol *sol, return ilo_state_sol_init(sol, dev, &info); } +uint32_t +ilo_state_sol_buffer_size(const struct ilo_dev *dev, uint32_t size, + uint32_t *alignment) +{ + /* DWord aligned without padding */ + *alignment = 4; + return size; +} + bool ilo_state_sol_buffer_init(struct ilo_state_sol_buffer *sb, const struct ilo_dev *dev, @@ -443,9 +447,8 @@ ilo_state_sol_buffer_init(struct ilo_state_sol_buffer *sb, else ret &= sol_buffer_set_gen7_3dstate_so_buffer(sb, dev, info); - sb->need_bo = (info->size > 0); - sb->need_write_offset_bo = (info->write_offset_save || - (info->write_offset_load && !info->write_offset_imm_enable)); + sb->vma = info->vma; + sb->write_offset_vma = info->write_offset_vma; assert(ret); diff --git a/src/gallium/drivers/ilo/core/ilo_state_sol.h b/src/gallium/drivers/ilo/core/ilo_state_sol.h index 2513fcb4979..92c5f94725b 100644 --- a/src/gallium/drivers/ilo/core/ilo_state_sol.h +++ b/src/gallium/drivers/ilo/core/ilo_state_sol.h @@ -107,17 +107,17 @@ struct ilo_state_sol { uint8_t decl_count; }; -struct ilo_buffer; +struct ilo_vma; struct ilo_state_sol_buffer_info { - const struct ilo_buffer *buf; + const struct ilo_vma *vma; uint32_t offset; uint32_t size; - /* - * Gen8+ only. When enabled, require a write offset bo of at least - * (sizeof(uint32_t) * ILO_STATE_SOL_MAX_BUFFER_COUNT) bytes - */ + /* Gen8+ only; at least sizeof(uint32_t) bytes */ + const struct ilo_vma *write_offset_vma; + uint32_t write_offset_offset; + bool write_offset_load; bool write_offset_save; @@ -126,14 +126,10 @@ struct ilo_state_sol_buffer_info { }; struct ilo_state_sol_buffer { - uint32_t so_buf[4]; - - bool need_bo; - bool need_write_offset_bo; + uint32_t so_buf[5]; - /* managed by users */ - struct intel_bo *bo; - struct intel_bo *write_offset_bo; + const struct ilo_vma *vma; + const struct ilo_vma *write_offset_vma; }; static inline size_t @@ -154,6 +150,10 @@ ilo_state_sol_init_disabled(struct ilo_state_sol *sol, const struct ilo_dev *dev, bool render_disable); +uint32_t +ilo_state_sol_buffer_size(const struct ilo_dev *dev, uint32_t size, + uint32_t *alignment); + bool ilo_state_sol_buffer_init(struct ilo_state_sol_buffer *sb, const struct ilo_dev *dev, diff --git a/src/gallium/drivers/ilo/core/ilo_state_surface.c b/src/gallium/drivers/ilo/core/ilo_state_surface.c index 5be9f8f6270..40fe15f316f 100644 --- a/src/gallium/drivers/ilo/core/ilo_state_surface.c +++ b/src/gallium/drivers/ilo/core/ilo_state_surface.c @@ -26,8 +26,8 @@ */ #include "ilo_debug.h" -#include "ilo_buffer.h" #include "ilo_image.h" +#include "ilo_vma.h" #include "ilo_state_surface.h" static bool @@ -94,31 +94,13 @@ surface_set_gen7_null_SURFACE_STATE(struct ilo_state_surface *surf, return true; } -static bool -surface_validate_gen6_buffer(const struct ilo_dev *dev, - const struct ilo_state_surface_buffer_info *info) +static uint32_t +surface_get_gen6_buffer_offset_alignment(const struct ilo_dev *dev, + const struct ilo_state_surface_buffer_info *info) { - ILO_DEV_ASSERT(dev, 6, 8); - - /* SVB writes are Gen6-only */ - if (ilo_dev_gen(dev) >= ILO_GEN(7)) - assert(info->access != ILO_STATE_SURFACE_ACCESS_DP_SVB); - - if (info->offset + info->size > info->buf->bo_size) { - ilo_warn("invalid 
buffer range\n"); - return false; - } + uint32_t alignment; - /* - * From the Sandy Bridge PRM, volume 4 part 1, page 81: - * - * "For surfaces of type SURFTYPE_BUFFER: [0,2047] -> [1B, 2048B] - * For surfaces of type SURFTYPE_STRBUF: [0,2047] -> [1B, 2048B]" - */ - if (!info->struct_size || info->struct_size > 2048) { - ilo_warn("invalid buffer struct size\n"); - return false; - } + ILO_DEV_ASSERT(dev, 6, 8); /* * From the Ivy Bridge PRM, volume 4 part 1, page 68: @@ -132,76 +114,153 @@ surface_validate_gen6_buffer(const struct ilo_dev *dev, * "Certain message types used to access surfaces have more stringent * alignment requirements. Please refer to the specific message * documentation for additional restrictions." - * - * From the Ivy Bridge PRM, volume 4 part 1, page 233, 235, and 237: - * - * "the surface base address must be OWord aligned" - * - * for OWord Block Read/Write, Unaligned OWord Block Read, and OWord Dual - * Block Read/Write. - * - * From the Ivy Bridge PRM, volume 4 part 1, page 246 and 249: - * - * "The surface base address must be DWord aligned" - * - * for DWord Scattered Read/Write and Byte Scattered Read/Write. - * - * We have to rely on users to correctly set info->struct_size here. DWord - * Scattered Read/Write has conflicting pitch and alignment, but we do not - * use them yet so we are fine. - * - * It is unclear if sampling engine surfaces require aligned offsets. */ - if (info->access != ILO_STATE_SURFACE_ACCESS_DP_SVB) { - assert(info->struct_size % info->format_size == 0); + switch (info->access) { + case ILO_STATE_SURFACE_ACCESS_SAMPLER: + /* no alignment requirements */ + alignment = 1; + break; + case ILO_STATE_SURFACE_ACCESS_DP_RENDER: + case ILO_STATE_SURFACE_ACCESS_DP_TYPED: + /* element-size aligned */ + alignment = info->format_size; - if (info->offset % info->struct_size) { - ilo_warn("bad buffer offset\n"); - return false; - } - } + assert(info->struct_size % alignment == 0); + break; + case ILO_STATE_SURFACE_ACCESS_DP_UNTYPED: + /* + * Nothing is said about Untyped* messages, but I think they require the + * base address to be DWord aligned. + */ + alignment = 4; - if (info->format == GEN6_FORMAT_RAW) { /* - * From the Sandy Bridge PRM, volume 4 part 1, page 97: + * From the Ivy Bridge PRM, volume 4 part 1, page 70: + * + * "For linear surfaces with Surface Type of SURFTYPE_STRBUF, the + * pitch must be a multiple of 4 bytes." + */ + if (info->struct_size > 1) + assert(info->struct_size % alignment == 0); + break; + case ILO_STATE_SURFACE_ACCESS_DP_DATA: + /* + * From the Ivy Bridge PRM, volume 4 part 1, page 233, 235, and 237: + * + * "the surface base address must be OWord aligned" + * + * for OWord Block Read/Write, Unaligned OWord Block Read, and OWord + * Dual Block Read/Write. + * + * From the Ivy Bridge PRM, volume 4 part 1, page 246 and 249: * - * ""RAW" is supported only with buffers and structured buffers - * accessed via the untyped surface read/write and untyped atomic - * operation messages, which do not have a column in the table." + * "The surface base address must be DWord aligned" * - * We do not have a specific access mode for untyped messages. + * for DWord Scattered Read/Write and Byte Scattered Read/Write. */ - assert(info->access == ILO_STATE_SURFACE_ACCESS_DP_UNTYPED); + alignment = (info->format_size > 4) ? 16 : 4; /* - * Nothing is said about Untyped* messages, but I guess they require the - * base address to be DWord aligned. 
+ * From the Ivy Bridge PRM, volume 4 part 1, page 233, 235, 237, and + * 246: + * + * "the surface pitch is ignored, the surface is treated as a + * 1-dimensional surface. An element size (pitch) of 16 bytes is + * used to determine the size of the buffer for out-of-bounds + * checking if using the surface state model." + * + * for OWord Block Read/Write, Unaligned OWord Block Read, OWord + * Dual Block Read/Write, and DWord Scattered Read/Write. + * + * From the Ivy Bridge PRM, volume 4 part 1, page 248: + * + * "The surface pitch is ignored, the surface is treated as a + * 1-dimensional surface. An element size (pitch) of 4 bytes is + * used to determine the size of the buffer for out-of-bounds + * checking if using the surface state model." + * + * for Byte Scattered Read/Write. + * + * It is programmable on Gen7.5+. */ - if (info->offset % 4) { - ilo_warn("bad RAW buffer offset\n"); - return false; + if (ilo_dev_gen(dev) < ILO_GEN(7.5)) { + const int fixed = (info->format_size > 1) ? 16 : 4; + assert(info->struct_size == fixed); } + break; + case ILO_STATE_SURFACE_ACCESS_DP_SVB: + /* + * From the Sandy Bridge PRM, volume 4 part 1, page 259: + * + * "Both the surface base address and surface pitch must be DWord + * aligned." + */ + alignment = 4; - if (info->struct_size > 1) { - /* no STRBUF on Gen6 */ - if (ilo_dev_gen(dev) == ILO_GEN(6)) { - ilo_warn("no STRBUF support\n"); - return false; - } + assert(info->struct_size % alignment == 0); + break; + default: + assert(!"unknown access"); + alignment = 1; + break; + } - /* - * From the Ivy Bridge PRM, volume 4 part 1, page 70: - * - * "For linear surfaces with Surface Type of SURFTYPE_STRBUF, the - * pitch must be a multiple of 4 bytes." - */ - if (info->struct_size % 4) { - ilo_warn("bad STRBUF pitch\n"); - return false; - } - } + return alignment; +} + +static bool +surface_validate_gen6_buffer(const struct ilo_dev *dev, + const struct ilo_state_surface_buffer_info *info) +{ + uint32_t alignment; + + ILO_DEV_ASSERT(dev, 6, 8); + + if (info->offset + info->size > info->vma->vm_size) { + ilo_warn("invalid buffer range\n"); + return false; } + /* + * From the Sandy Bridge PRM, volume 4 part 1, page 81: + * + * "For surfaces of type SURFTYPE_BUFFER: [0,2047] -> [1B, 2048B] + * For surfaces of type SURFTYPE_STRBUF: [0,2047] -> [1B, 2048B]" + */ + if (!info->struct_size || info->struct_size > 2048) { + ilo_warn("invalid buffer struct size\n"); + return false; + } + + alignment = surface_get_gen6_buffer_offset_alignment(dev, info); + if (info->offset % alignment || info->vma->vm_alignment % alignment) { + ilo_warn("bad buffer offset\n"); + return false; + } + + /* no STRBUF on Gen6 */ + if (info->format == GEN6_FORMAT_RAW && info->struct_size > 1) + assert(ilo_dev_gen(dev) >= ILO_GEN(7)); + + /* SVB writes are Gen6 only */ + if (info->access == ILO_STATE_SURFACE_ACCESS_DP_SVB) + assert(ilo_dev_gen(dev) == ILO_GEN(6)); + + /* + * From the Ivy Bridge PRM, volume 4 part 1, page 83: + * + * "NOTE: "RAW" is supported only with buffers and structured buffers + * accessed via the untyped surface read/write and untyped atomic + * operation messages, which do not have a column in the table." + * + * From the Ivy Bridge PRM, volume 4 part 1, page 252: + * + * "For untyped messages, the Surface Format must be RAW and the + * Surface Type must be SURFTYPE_BUFFER or SURFTYPE_STRBUF." 
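The refactored helper above reduces those PRM rules to a small per-access table. A standalone restatement (hypothetical names; the driver's own switch is the authority):

enum access {
   ACCESS_SAMPLER, ACCESS_DP_RENDER, ACCESS_DP_TYPED,
   ACCESS_DP_UNTYPED, ACCESS_DP_DATA, ACCESS_DP_SVB,
};

/* required buffer offset alignment, per the quotes above */
static unsigned
buffer_offset_alignment(enum access a, unsigned format_size)
{
   switch (a) {
   case ACCESS_SAMPLER:    return 1;           /* no requirement */
   case ACCESS_DP_RENDER:
   case ACCESS_DP_TYPED:   return format_size; /* element-size aligned */
   case ACCESS_DP_UNTYPED: return 4;           /* DWord */
   case ACCESS_DP_DATA:    return (format_size > 4) ? 16 : 4; /* OWord/DWord */
   case ACCESS_DP_SVB:     return 4;           /* DWord */
   default:                return 1;
   }
}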
+ */ + assert((info->access == ILO_STATE_SURFACE_ACCESS_DP_UNTYPED) == + (info->format == GEN6_FORMAT_RAW)); + return true; } @@ -215,8 +274,7 @@ surface_get_gen6_buffer_struct_count(const struct ilo_dev *dev, ILO_DEV_ASSERT(dev, 6, 8); c = info->size / info->struct_size; - if (info->access == ILO_STATE_SURFACE_ACCESS_DP_SVB && - info->format_size < info->size - info->struct_size * c) + if (info->format_size < info->size - info->struct_size * c) c++; /* @@ -367,29 +425,6 @@ surface_set_gen7_buffer_SURFACE_STATE(struct ilo_state_surface *surf, return true; } -static enum gen_surface_type -get_gen6_surface_type(const struct ilo_dev *dev, const struct ilo_image *img) -{ - ILO_DEV_ASSERT(dev, 6, 8); - - switch (img->target) { - case PIPE_TEXTURE_1D: - case PIPE_TEXTURE_1D_ARRAY: - return GEN6_SURFTYPE_1D; - case PIPE_TEXTURE_2D: - case PIPE_TEXTURE_CUBE: - case PIPE_TEXTURE_RECT: - case PIPE_TEXTURE_2D_ARRAY: - case PIPE_TEXTURE_CUBE_ARRAY: - return GEN6_SURFTYPE_2D; - case PIPE_TEXTURE_3D: - return GEN6_SURFTYPE_3D; - default: - assert(!"unknown texture target"); - return GEN6_SURFTYPE_NULL; - } -} - static bool surface_validate_gen6_image(const struct ilo_dev *dev, const struct ilo_state_surface_image_info *info) @@ -408,6 +443,17 @@ surface_validate_gen6_image(const struct ilo_dev *dev, break; } + assert(info->img && info->vma); + + if (info->img->tiling != GEN6_TILING_NONE) + assert(info->vma->vm_alignment % 4096 == 0); + + if (info->aux_vma) { + assert(ilo_image_can_enable_aux(info->img, info->level_base)); + /* always tiled */ + assert(info->aux_vma->vm_alignment % 4096 == 0); + } + /* * From the Sandy Bridge PRM, volume 4 part 1, page 78: * @@ -418,16 +464,18 @@ surface_validate_gen6_image(const struct ilo_dev *dev, assert(info->img->bo_stride && info->img->bo_stride <= 512 * 1024 && info->img->width0 <= info->img->bo_stride); - if (info->is_cube_map) { - assert(get_gen6_surface_type(dev, info->img) == GEN6_SURFTYPE_2D); + if (info->type != info->img->type) { + assert(info->type == GEN6_SURFTYPE_2D && + info->img->type == GEN6_SURFTYPE_CUBE); + } - /* - * From the Sandy Bridge PRM, volume 4 part 1, page 78: - * - * "For cube maps, Width must be set equal to the Height." - */ + /* + * From the Sandy Bridge PRM, volume 4 part 1, page 78: + * + * "For cube maps, Width must be set equal to the Height." + */ + if (info->type == GEN6_SURFTYPE_CUBE) assert(info->img->width0 == info->img->height0); - } /* * From the Sandy Bridge PRM, volume 4 part 1, page 72: @@ -463,20 +511,21 @@ surface_validate_gen6_image(const struct ilo_dev *dev, } static void -get_gen6_max_extent(const struct ilo_dev *dev, - const struct ilo_image *img, - uint16_t *max_w, uint16_t *max_h) +surface_get_gen6_image_max_extent(const struct ilo_dev *dev, + const struct ilo_state_surface_image_info *info, + uint16_t *max_w, uint16_t *max_h) { const uint16_t max_size = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 
16384 : 8192; ILO_DEV_ASSERT(dev, 6, 8); - switch (get_gen6_surface_type(dev, img)) { + switch (info->type) { case GEN6_SURFTYPE_1D: *max_w = max_size; *max_h = 1; break; case GEN6_SURFTYPE_2D: + case GEN6_SURFTYPE_CUBE: *max_w = max_size; *max_h = max_size; break; @@ -504,7 +553,7 @@ surface_get_gen6_image_extent(const struct ilo_dev *dev, w = info->img->width0; h = info->img->height0; - get_gen6_max_extent(dev, info->img, &max_w, &max_h); + surface_get_gen6_image_max_extent(dev, info, &max_w, &max_h); assert(w && h && w <= max_w && h <= max_h); *width = w - 1; @@ -555,16 +604,17 @@ surface_get_gen6_image_slices(const struct ilo_dev *dev, * layers to (86 * 6), about 512. */ - switch (get_gen6_surface_type(dev, info->img)) { + switch (info->type) { case GEN6_SURFTYPE_1D: case GEN6_SURFTYPE_2D: + case GEN6_SURFTYPE_CUBE: max_slice = (ilo_dev_gen(dev) >= ILO_GEN(7.5)) ? 2048 : 512; assert(info->img->array_size <= max_slice); max_slice = info->img->array_size; d = info->slice_count; - if (info->is_cube_map) { + if (info->type == GEN6_SURFTYPE_CUBE) { if (info->access == ILO_STATE_SURFACE_ACCESS_SAMPLER) { if (!d || d % 6) { ilo_warn("invalid cube slice count\n"); @@ -877,7 +927,6 @@ surface_set_gen6_image_SURFACE_STATE(struct ilo_state_surface *surf, uint8_t min_lod, mip_count; enum gen_sample_count sample_count; uint32_t alignments; - enum gen_surface_type type; uint32_t dw0, dw2, dw3, dw4, dw5; ILO_DEV_ASSERT(dev, 6, 6); @@ -897,10 +946,7 @@ surface_set_gen6_image_SURFACE_STATE(struct ilo_state_surface *surf, if (info->img->sample_count > 1) assert(info->img->interleaved_samples); - type = (info->is_cube_map) ? GEN6_SURFTYPE_CUBE : - get_gen6_surface_type(dev, info->img); - - dw0 = type << GEN6_SURFACE_DW0_TYPE__SHIFT | + dw0 = info->type << GEN6_SURFACE_DW0_TYPE__SHIFT | info->format << GEN6_SURFACE_DW0_FORMAT__SHIFT | GEN6_SURFACE_DW0_MIPLAYOUT_BELOW; @@ -927,7 +973,7 @@ surface_set_gen6_image_SURFACE_STATE(struct ilo_state_surface *surf, * "When TEXCOORDMODE_CLAMP is used when accessing a cube map, this * field must be programmed to 111111b (all faces enabled)." */ - if (info->is_cube_map && + if (info->type == GEN6_SURFTYPE_CUBE && info->access == ILO_STATE_SURFACE_ACCESS_SAMPLER) { dw0 |= GEN6_SURFACE_DW0_CUBE_MAP_CORNER_MODE_AVERAGE | GEN6_SURFACE_DW0_CUBE_FACE_ENABLES__MASK; @@ -956,7 +1002,7 @@ surface_set_gen6_image_SURFACE_STATE(struct ilo_state_surface *surf, surf->surface[4] = dw4; surf->surface[5] = dw5; - surf->type = type; + surf->type = info->type; surf->min_lod = min_lod; surf->mip_count = mip_count; @@ -972,7 +1018,6 @@ surface_set_gen7_image_SURFACE_STATE(struct ilo_state_surface *surf, uint8_t min_lod, mip_count; uint32_t alignments; enum gen_sample_count sample_count; - enum gen_surface_type type; uint32_t dw0, dw1, dw2, dw3, dw4, dw5, dw7; ILO_DEV_ASSERT(dev, 7, 8); @@ -986,10 +1031,7 @@ surface_set_gen7_image_SURFACE_STATE(struct ilo_state_surface *surf, !surface_get_gen6_image_alignments(dev, info, &alignments)) return false; - type = (info->is_cube_map) ? GEN6_SURFTYPE_CUBE : - get_gen6_surface_type(dev, info->img); - - dw0 = type << GEN7_SURFACE_DW0_TYPE__SHIFT | + dw0 = info->type << GEN7_SURFACE_DW0_TYPE__SHIFT | info->format << GEN7_SURFACE_DW0_FORMAT__SHIFT | alignments; @@ -1023,7 +1065,7 @@ surface_set_gen7_image_SURFACE_STATE(struct ilo_state_surface *surf, * field must be programmed to 111111b (all faces enabled). This field * is ignored unless the Surface Type is SURFTYPE_CUBE." 
*/ - if (info->is_cube_map && + if (info->type == GEN6_SURFTYPE_CUBE && info->access == ILO_STATE_SURFACE_ACCESS_SAMPLER) dw0 |= GEN7_SURFACE_DW0_CUBE_FACE_ENABLES__MASK; @@ -1087,13 +1129,61 @@ surface_set_gen7_image_SURFACE_STATE(struct ilo_state_surface *surf, surf->surface[12] = 0; } - surf->type = type; + surf->type = info->type; surf->min_lod = min_lod; surf->mip_count = mip_count; return true; } +uint32_t +ilo_state_surface_buffer_size(const struct ilo_dev *dev, + enum ilo_state_surface_access access, + uint32_t size, uint32_t *alignment) +{ + switch (access) { + case ILO_STATE_SURFACE_ACCESS_SAMPLER: + /* + * From the Sandy Bridge PRM, volume 1 part 1, page 118: + * + * "For buffers, which have no inherent "height," padding + * requirements are different. A buffer must be padded to the next + * multiple of 256 array elements, with an additional 16 bytes + * added beyond that to account for the L1 cache line." + * + * Assuming tightly packed GEN6_FORMAT_R32G32B32A32_FLOAT, the size + * needs to be padded to 4096 (= 16 * 256). + */ + *alignment = 1; + size = align(size, 4096) + 16; + break; + case ILO_STATE_SURFACE_ACCESS_DP_RENDER: + case ILO_STATE_SURFACE_ACCESS_DP_TYPED: + /* element-size aligned for worst cases */ + *alignment = 16; + break; + case ILO_STATE_SURFACE_ACCESS_DP_UNTYPED: + /* DWord aligned? */ + *alignment = 4; + break; + case ILO_STATE_SURFACE_ACCESS_DP_DATA: + /* OWord aligned */ + *alignment = 16; + size = align(size, 16); + break; + case ILO_STATE_SURFACE_ACCESS_DP_SVB: + /* always DWord aligned */ + *alignment = 4; + break; + default: + assert(!"unknown access"); + *alignment = 1; + break; + } + + return size; +} + bool ilo_state_surface_init_for_null(struct ilo_state_surface *surf, const struct ilo_dev *dev) @@ -1107,6 +1197,7 @@ ilo_state_surface_init_for_null(struct ilo_state_surface *surf, else ret &= surface_set_gen6_null_SURFACE_STATE(surf, dev); + surf->vma = NULL; surf->type = GEN6_SURFTYPE_NULL; surf->readonly = true; @@ -1129,6 +1220,7 @@ ilo_state_surface_init_for_buffer(struct ilo_state_surface *surf, else ret &= surface_set_gen6_buffer_SURFACE_STATE(surf, dev, info); + surf->vma = info->vma; surf->readonly = info->readonly; assert(ret); @@ -1150,6 +1242,9 @@ ilo_state_surface_init_for_image(struct ilo_state_surface *surf, else ret &= surface_set_gen6_image_SURFACE_STATE(surf, dev, info); + surf->vma = info->vma; + surf->aux_vma = info->aux_vma; + surf->is_integer = info->is_integer; surf->readonly = info->readonly; surf->scanout = info->img->scanout; diff --git a/src/gallium/drivers/ilo/core/ilo_state_surface.h b/src/gallium/drivers/ilo/core/ilo_state_surface.h index 9c025428d50..e78c7c97db1 100644 --- a/src/gallium/drivers/ilo/core/ilo_state_surface.h +++ b/src/gallium/drivers/ilo/core/ilo_state_surface.h @@ -29,14 +29,10 @@ #define ILO_STATE_SURFACE_H #include "genhw/genhw.h" -#include "intel_winsys.h" #include "ilo_core.h" #include "ilo_dev.h" -struct ilo_buffer; -struct ilo_image; - enum ilo_state_surface_access { ILO_STATE_SURFACE_ACCESS_SAMPLER, /* sampling engine surfaces */ ILO_STATE_SURFACE_ACCESS_DP_RENDER, /* render target surfaces */ @@ -46,42 +42,51 @@ enum ilo_state_surface_access { ILO_STATE_SURFACE_ACCESS_DP_SVB, }; +struct ilo_vma; +struct ilo_image; + struct ilo_state_surface_buffer_info { - const struct ilo_buffer *buf; + const struct ilo_vma *vma; + uint32_t offset; + uint32_t size; enum ilo_state_surface_access access; + /* format_size may be less than, equal to, or greater than struct_size */ enum gen_surface_format format; 
uint8_t format_size; bool readonly; uint16_t struct_size; - - uint32_t offset; - uint32_t size; }; struct ilo_state_surface_image_info { const struct ilo_image *img; + uint8_t level_base; + uint8_t level_count; + uint16_t slice_base; + uint16_t slice_count; + + const struct ilo_vma *vma; + const struct ilo_vma *aux_vma; enum ilo_state_surface_access access; + enum gen_surface_type type; + enum gen_surface_format format; bool is_integer; bool readonly; - bool is_cube_map; bool is_array; - - uint8_t level_base; - uint8_t level_count; - uint16_t slice_base; - uint16_t slice_count; }; struct ilo_state_surface { uint32_t surface[13]; + const struct ilo_vma *vma; + const struct ilo_vma *aux_vma; + enum gen_surface_type type; uint8_t min_lod; uint8_t mip_count; @@ -89,9 +94,6 @@ struct ilo_state_surface { bool readonly; bool scanout; - - /* managed by users */ - struct intel_bo *bo; }; bool @@ -99,6 +101,11 @@ ilo_state_surface_valid_format(const struct ilo_dev *dev, enum ilo_state_surface_access access, enum gen_surface_format format); +uint32_t +ilo_state_surface_buffer_size(const struct ilo_dev *dev, + enum ilo_state_surface_access access, + uint32_t size, uint32_t *alignment); + bool ilo_state_surface_init_for_null(struct ilo_state_surface *surf, const struct ilo_dev *dev); diff --git a/src/gallium/drivers/ilo/core/ilo_state_vf.c b/src/gallium/drivers/ilo/core/ilo_state_vf.c index ddc75428ed7..9faf835fef2 100644 --- a/src/gallium/drivers/ilo/core/ilo_state_vf.c +++ b/src/gallium/drivers/ilo/core/ilo_state_vf.c @@ -26,7 +26,7 @@ */ #include "ilo_debug.h" -#include "ilo_buffer.h" +#include "ilo_vma.h" #include "ilo_state_vf.h" static bool @@ -479,8 +479,8 @@ vertex_buffer_validate_gen6(const struct ilo_dev *dev, { ILO_DEV_ASSERT(dev, 6, 8); - if (info->buf) - assert(info->offset < info->buf->bo_size && info->size); + if (info->vma) + assert(info->size && info->offset + info->size <= info->vma->vm_size); /* * From the Sandy Bridge PRM, volume 2 part 1, page 86: @@ -500,6 +500,9 @@ vertex_buffer_validate_gen6(const struct ilo_dev *dev, * aligned address, and BufferPitch must be a multiple of 64-bits." */ if (info->cv_has_double) { + if (info->vma) + assert(info->vma->vm_alignment % 8 == 0); + assert(info->stride % 8 == 0); assert((info->offset + info->cv_double_vertex_offset_mod_8) % 8 == 0); } @@ -512,12 +515,7 @@ vertex_buffer_get_gen6_size(const struct ilo_dev *dev, const struct ilo_state_vertex_buffer_info *info) { ILO_DEV_ASSERT(dev, 6, 8); - - if (!info->buf) - return 0; - - return (info->offset + info->size <= info->buf->bo_size) ? info->size : - info->buf->bo_size - info->offset; + return (info->vma) ? info->size : 0; } static bool @@ -537,7 +535,7 @@ vertex_buffer_set_gen8_vertex_buffer_state(struct ilo_state_vertex_buffer *vb, if (ilo_dev_gen(dev) >= ILO_GEN(7)) dw0 |= GEN7_VB_DW0_ADDR_MODIFIED; - if (!info->buf) + if (!info->vma) dw0 |= GEN6_VB_DW0_IS_NULL; STATIC_ASSERT(ARRAY_SIZE(vb->vb) >= 3); @@ -551,7 +549,7 @@ vertex_buffer_set_gen8_vertex_buffer_state(struct ilo_state_vertex_buffer *vb, vb->vb[2] = (size) ? 
info->offset + size - 1 : 0; } - vb->need_bo = (info->buf != NULL); + vb->vma = info->vma; return true; } @@ -586,8 +584,10 @@ index_buffer_validate_gen6(const struct ilo_dev *dev, */ assert(info->offset % format_size == 0); - if (info->buf) - assert(info->offset < info->buf->bo_size && info->size); + if (info->vma) { + assert(info->vma->vm_alignment % format_size == 0); + assert(info->size && info->offset + info->size <= info->vma->vm_size); + } return true; } @@ -600,12 +600,10 @@ index_buffer_get_gen6_size(const struct ilo_dev *dev, ILO_DEV_ASSERT(dev, 6, 8); - if (!info->buf) + if (!info->vma) return 0; - size = (info->offset + info->size <= info->buf->bo_size) ? info->size : - info->buf->bo_size - info->offset; - + size = info->size; if (ilo_dev_gen(dev) < ILO_GEN(8)) { const uint32_t format_size = get_index_format_size(info->format); size -= (size % format_size); @@ -638,7 +636,7 @@ index_buffer_set_gen8_3DSTATE_INDEX_BUFFER(struct ilo_state_index_buffer *ib, ib->ib[2] = (size) ? info->offset + size - 1 : 0; } - ib->need_bo = (info->buf != NULL); + ib->vma = info->vma; return true; } @@ -949,6 +947,15 @@ ilo_state_vf_get_delta(const struct ilo_state_vf *vf, } } +uint32_t +ilo_state_vertex_buffer_size(const struct ilo_dev *dev, uint32_t size, + uint32_t *alignment) +{ + /* align for doubles without padding */ + *alignment = 8; + return size; +} + /** * No need to initialize first. */ @@ -966,6 +973,15 @@ ilo_state_vertex_buffer_set_info(struct ilo_state_vertex_buffer *vb, return ret; } +uint32_t +ilo_state_index_buffer_size(const struct ilo_dev *dev, uint32_t size, + uint32_t *alignment) +{ + /* align for the worst case without padding */ + *alignment = get_index_format_size(GEN6_INDEX_DWORD); + return size; +} + /** * No need to initialize first. 
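These new *_size() helpers exist so that resource allocation can happen before any bo does: the caller queries the padded size and the required alignment, records both in an ilo_vma (see ilo_vma.h later in this diff), and attaches the bo afterwards. A hedged sketch of the pattern, with dev, bo, and index_count standing in for real state (not code from this patch):

uint32_t alignment, size;
struct ilo_vma vma;

/* worst case is GEN6_INDEX_DWORD, so alignment comes back as 4 */
size = ilo_state_index_buffer_size(dev, index_count * sizeof(uint32_t),
                                   &alignment);

memset(&vma, 0, sizeof(vma));
ilo_vma_init(&vma, dev, size, alignment);
ilo_vma_set_bo(&vma, dev, bo, 0 /* bo_offset */);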
*/ diff --git a/src/gallium/drivers/ilo/core/ilo_state_vf.h b/src/gallium/drivers/ilo/core/ilo_state_vf.h index f15c63a248a..16b128bf63c 100644 --- a/src/gallium/drivers/ilo/core/ilo_state_vf.h +++ b/src/gallium/drivers/ilo/core/ilo_state_vf.h @@ -126,10 +126,10 @@ struct ilo_state_vf_delta { uint32_t dirty; }; -struct ilo_buffer; +struct ilo_vma; struct ilo_state_vertex_buffer_info { - const struct ilo_buffer *buf; + const struct ilo_vma *vma; uint32_t offset; uint32_t size; @@ -143,14 +143,11 @@ struct ilo_state_vertex_buffer_info { struct ilo_state_vertex_buffer { uint32_t vb[3]; - bool need_bo; - - /* managed by users */ - struct intel_bo *bo; + const struct ilo_vma *vma; }; struct ilo_state_index_buffer_info { - const struct ilo_buffer *buf; + const struct ilo_vma *vma; uint32_t offset; uint32_t size; @@ -160,10 +157,7 @@ struct ilo_state_index_buffer_info { struct ilo_state_index_buffer { uint32_t ib[3]; - bool need_bo; - - /* managed by users */ - struct intel_bo *bo; + const struct ilo_vma *vma; }; static inline size_t @@ -215,11 +209,19 @@ ilo_state_vf_get_delta(const struct ilo_state_vf *vf, const struct ilo_state_vf *old, struct ilo_state_vf_delta *delta); +uint32_t +ilo_state_vertex_buffer_size(const struct ilo_dev *dev, uint32_t size, + uint32_t *alignment); + bool ilo_state_vertex_buffer_set_info(struct ilo_state_vertex_buffer *vb, const struct ilo_dev *dev, const struct ilo_state_vertex_buffer_info *info); +uint32_t +ilo_state_index_buffer_size(const struct ilo_dev *dev, uint32_t size, + uint32_t *alignment); + bool ilo_state_index_buffer_set_info(struct ilo_state_index_buffer *ib, const struct ilo_dev *dev, diff --git a/src/gallium/drivers/ilo/core/ilo_state_zs.c b/src/gallium/drivers/ilo/core/ilo_state_zs.c index 901fedb5599..827632764b2 100644 --- a/src/gallium/drivers/ilo/core/ilo_state_zs.c +++ b/src/gallium/drivers/ilo/core/ilo_state_zs.c @@ -25,10 +25,9 @@ * Chia-I Wu <[email protected]> */ -#include "intel_winsys.h" - #include "ilo_debug.h" #include "ilo_image.h" +#include "ilo_vma.h" #include "ilo_state_zs.h" static bool @@ -56,70 +55,9 @@ zs_set_gen6_null_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs, zs->depth[3] = 0; zs->depth[4] = 0; - zs->depth_format = format; - return true; } -static enum gen_surface_type -get_gen6_surface_type(const struct ilo_dev *dev, const struct ilo_image *img) -{ - ILO_DEV_ASSERT(dev, 6, 8); - - switch (img->target) { - case PIPE_TEXTURE_1D: - case PIPE_TEXTURE_1D_ARRAY: - return GEN6_SURFTYPE_1D; - case PIPE_TEXTURE_2D: - case PIPE_TEXTURE_CUBE: - case PIPE_TEXTURE_RECT: - case PIPE_TEXTURE_2D_ARRAY: - case PIPE_TEXTURE_CUBE_ARRAY: - return GEN6_SURFTYPE_2D; - case PIPE_TEXTURE_3D: - return GEN6_SURFTYPE_3D; - default: - assert(!"unknown texture target"); - return GEN6_SURFTYPE_NULL; - } -} - -static enum gen_depth_format -get_gen6_depth_format(const struct ilo_dev *dev, const struct ilo_image *img) -{ - ILO_DEV_ASSERT(dev, 6, 8); - - if (ilo_dev_gen(dev) >= ILO_GEN(7)) { - switch (img->format) { - case PIPE_FORMAT_Z32_FLOAT: - return GEN6_ZFORMAT_D32_FLOAT; - case PIPE_FORMAT_Z24X8_UNORM: - return GEN6_ZFORMAT_D24_UNORM_X8_UINT; - case PIPE_FORMAT_Z16_UNORM: - return GEN6_ZFORMAT_D16_UNORM; - default: - assert(!"unknown depth format"); - return GEN6_ZFORMAT_D32_FLOAT; - } - } else { - switch (img->format) { - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - return GEN6_ZFORMAT_D32_FLOAT_S8X24_UINT; - case PIPE_FORMAT_Z32_FLOAT: - return GEN6_ZFORMAT_D32_FLOAT; - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - return GEN6_ZFORMAT_D24_UNORM_S8_UINT; - 
case PIPE_FORMAT_Z24X8_UNORM: - return GEN6_ZFORMAT_D24_UNORM_X8_UINT; - case PIPE_FORMAT_Z16_UNORM: - return GEN6_ZFORMAT_D16_UNORM; - default: - assert(!"unknown depth format"); - return GEN6_ZFORMAT_D32_FLOAT; - } - } -} - static bool zs_validate_gen6(const struct ilo_dev *dev, const struct ilo_state_zs_info *info) @@ -128,63 +66,102 @@ zs_validate_gen6(const struct ilo_dev *dev, ILO_DEV_ASSERT(dev, 6, 8); + assert(!info->z_img == !info->z_vma); + assert(!info->s_img == !info->s_vma); + + /* all tiled */ + if (info->z_img) { + assert(info->z_img->tiling == GEN6_TILING_Y); + assert(info->z_vma->vm_alignment % 4096 == 0); + } + if (info->s_img) { + assert(info->s_img->tiling == GEN8_TILING_W); + assert(info->s_vma->vm_alignment % 4096 == 0); + } + if (info->hiz_vma) { + assert(info->z_img && + ilo_image_can_enable_aux(info->z_img, info->level)); + assert(info->z_vma->vm_alignment % 4096 == 0); + } + /* * From the Ivy Bridge PRM, volume 2 part 1, page 315: * - * The stencil buffer has a format of S8_UINT, and shares Surface + * "The stencil buffer has a format of S8_UINT, and shares Surface * Type, Height, Width, and Depth, Minimum Array Element, Render * Target View Extent, Depth Coordinate Offset X/Y, LOD, and Depth - * Buffer Object Control State fields of the depth buffer. + * Buffer Object Control State fields of the depth buffer." */ - if (info->z_img == info->s_img) { - assert(info->z_img->target == info->s_img->target && - info->z_img->width0 == info->s_img->width0 && + if (info->z_img && info->s_img && info->z_img != info->s_img) { + assert(info->z_img->type == info->s_img->type && info->z_img->height0 == info->s_img->height0 && info->z_img->depth0 == info->s_img->depth0); } - assert(info->level < img->level_count); - assert(img->bo_stride); - - if (info->hiz_enable) { - assert(info->z_img && - ilo_image_can_enable_aux(info->z_img, info->level)); + if (info->type != img->type) { + assert(info->type == GEN6_SURFTYPE_2D && + img->type == GEN6_SURFTYPE_CUBE); } - if (info->is_cube_map) { - assert(get_gen6_surface_type(dev, img) == GEN6_SURFTYPE_2D); - + if (ilo_dev_gen(dev) >= ILO_GEN(7)) { + switch (info->format) { + case GEN6_ZFORMAT_D32_FLOAT: + case GEN6_ZFORMAT_D24_UNORM_X8_UINT: + case GEN6_ZFORMAT_D16_UNORM: + break; + default: + assert(!"unknown depth format"); + break; + } + } else { /* - * From the Sandy Bridge PRM, volume 2 part 1, page 323: + * From the Ironlake PRM, volume 2 part 1, page 330: + * + * "If this field (Separate Stencil Buffer Enable) is disabled, the + * Surface Format of the depth buffer cannot be D24_UNORM_X8_UINT." + * + * From the Sandy Bridge PRM, volume 2 part 1, page 321: * - * "For cube maps, Width must be set equal to Height." + * "[DevSNB]: This field (Separate Stencil Buffer Enable) must be + * set to the same value (enabled or disabled) as Hierarchical + * Depth Buffer Enable." */ - assert(img->width0 == img->height0); + if (info->hiz_vma) + assert(info->format != GEN6_ZFORMAT_D24_UNORM_S8_UINT); + else + assert(info->format != GEN6_ZFORMAT_D24_UNORM_X8_UINT); } - if (info->z_img) - assert(info->z_img->tiling == GEN6_TILING_Y); - if (info->s_img) - assert(info->s_img->tiling == GEN8_TILING_W); + assert(info->level < img->level_count); + assert(img->bo_stride); + + /* + * From the Sandy Bridge PRM, volume 2 part 1, page 323: + * + * "For cube maps, Width must be set equal to Height." 
+ */ + if (info->type == GEN6_SURFTYPE_CUBE) + assert(img->width0 == img->height0); return true; } static void -get_gen6_max_extent(const struct ilo_dev *dev, - const struct ilo_image *img, - uint16_t *max_w, uint16_t *max_h) +zs_get_gen6_max_extent(const struct ilo_dev *dev, + const struct ilo_state_zs_info *info, + uint16_t *max_w, uint16_t *max_h) { const uint16_t max_size = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 16384 : 8192; ILO_DEV_ASSERT(dev, 6, 8); - switch (get_gen6_surface_type(dev, img)) { + switch (info->type) { case GEN6_SURFTYPE_1D: *max_w = max_size; *max_h = 1; break; case GEN6_SURFTYPE_2D: + case GEN6_SURFTYPE_CUBE: *max_w = max_size; *max_h = max_size; break; @@ -274,7 +251,7 @@ zs_get_gen6_depth_extent(const struct ilo_dev *dev, w = img->width0; h = img->height0; - if (info->hiz_enable) { + if (info->hiz_vma) { uint16_t align_w, align_h; get_gen6_hiz_alignments(dev, info->z_img, &align_w, &align_h); @@ -290,7 +267,7 @@ zs_get_gen6_depth_extent(const struct ilo_dev *dev, h = align(h, align_h); } - get_gen6_max_extent(dev, img, &max_w, &max_h); + zs_get_gen6_max_extent(dev, info, &max_w, &max_h); assert(w && h && w <= max_w && h <= max_h); *width = w - 1; @@ -319,16 +296,17 @@ zs_get_gen6_depth_slices(const struct ilo_dev *dev, * surfaces. If the volume texture is MIP-mapped, this field specifies * the depth of the base MIP level." */ - switch (get_gen6_surface_type(dev, img)) { + switch (info->type) { case GEN6_SURFTYPE_1D: case GEN6_SURFTYPE_2D: + case GEN6_SURFTYPE_CUBE: max_slice = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 2048 : 512; assert(img->array_size <= max_slice); max_slice = img->array_size; d = info->slice_count; - if (info->is_cube_map) { + if (info->type == GEN6_SURFTYPE_CUBE) { /* * Minumum Array Element and Depth must be 0; Render Target View * Extent is ignored. @@ -408,8 +386,6 @@ zs_set_gen6_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs, const struct ilo_state_zs_info *info) { uint16_t width, height, depth, array_base, view_extent; - enum gen_surface_type type; - enum gen_depth_format format; uint32_t dw1, dw2, dw3, dw4; ILO_DEV_ASSERT(dev, 6, 6); @@ -420,37 +396,15 @@ zs_set_gen6_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs, &view_extent)) return false; - type = (info->is_cube_map) ? GEN6_SURFTYPE_CUBE : - (info->z_img) ? get_gen6_surface_type(dev, info->z_img) : - get_gen6_surface_type(dev, info->s_img); - - format = (info->z_img) ? get_gen6_depth_format(dev, info->z_img) : - GEN6_ZFORMAT_D32_FLOAT; - - /* - * From the Ironlake PRM, volume 2 part 1, page 330: - * - * "If this field (Separate Stencil Buffer Enable) is disabled, the - * Surface Format of the depth buffer cannot be D24_UNORM_X8_UINT." - * - * From the Sandy Bridge PRM, volume 2 part 1, page 321: - * - * "[DevSNB]: This field (Separate Stencil Buffer Enable) must be set - * to the same value (enabled or disabled) as Hierarchical Depth - * Buffer Enable." 
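On Gen6 the two quotes above tie the D24 formats to the HiZ decision: HiZ implies separate stencil, which forbids D24_UNORM_S8_UINT, and no HiZ forbids D24_UNORM_X8_UINT. A tiny illustrative helper capturing what the asserts enforce (not part of the patch):

/* Gen6 only: the D24 variant the asserts above permit */
static enum gen_depth_format
gen6_d24_format_for_hiz(bool has_hiz)
{
   return has_hiz ? GEN6_ZFORMAT_D24_UNORM_X8_UINT :
                    GEN6_ZFORMAT_D24_UNORM_S8_UINT;
}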
- */ - if (!info->hiz_enable && format == GEN6_ZFORMAT_D24_UNORM_X8_UINT) - format = GEN6_ZFORMAT_D24_UNORM_S8_UINT; - /* info->z_readonly and info->s_readonly are ignored on Gen6 */ - dw1 = type << GEN6_DEPTH_DW1_TYPE__SHIFT | + dw1 = info->type << GEN6_DEPTH_DW1_TYPE__SHIFT | GEN6_TILING_Y << GEN6_DEPTH_DW1_TILING__SHIFT | - format << GEN6_DEPTH_DW1_FORMAT__SHIFT; + info->format << GEN6_DEPTH_DW1_FORMAT__SHIFT; if (info->z_img) dw1 |= (info->z_img->bo_stride - 1) << GEN6_DEPTH_DW1_PITCH__SHIFT; - if (info->hiz_enable || !info->z_img) { + if (info->hiz_vma || !info->z_img) { dw1 |= GEN6_DEPTH_DW1_HIZ_ENABLE | GEN6_DEPTH_DW1_SEPARATE_STENCIL; } @@ -471,8 +425,6 @@ zs_set_gen6_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs, zs->depth[3] = dw4; zs->depth[4] = 0; - zs->depth_format = format; - return true; } @@ -481,8 +433,6 @@ zs_set_gen7_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs, const struct ilo_dev *dev, const struct ilo_state_zs_info *info) { - enum gen_surface_type type; - enum gen_depth_format format; uint16_t width, height, depth; uint16_t array_base, view_extent; uint32_t dw1, dw2, dw3, dw4, dw6; @@ -495,20 +445,13 @@ zs_set_gen7_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs, &view_extent)) return false; - type = (info->is_cube_map) ? GEN6_SURFTYPE_CUBE : - (info->z_img) ? get_gen6_surface_type(dev, info->z_img) : - get_gen6_surface_type(dev, info->s_img); - - format = (info->z_img) ? get_gen6_depth_format(dev, info->z_img) : - GEN6_ZFORMAT_D32_FLOAT; - - dw1 = type << GEN7_DEPTH_DW1_TYPE__SHIFT | - format << GEN7_DEPTH_DW1_FORMAT__SHIFT; + dw1 = info->type << GEN7_DEPTH_DW1_TYPE__SHIFT | + info->format << GEN7_DEPTH_DW1_FORMAT__SHIFT; if (info->z_img) { if (!info->z_readonly) dw1 |= GEN7_DEPTH_DW1_DEPTH_WRITE_ENABLE; - if (info->hiz_enable) + if (info->hiz_vma) dw1 |= GEN7_DEPTH_DW1_HIZ_ENABLE; dw1 |= (info->z_img->bo_stride - 1) << GEN7_DEPTH_DW1_PITCH__SHIFT; @@ -539,8 +482,6 @@ zs_set_gen7_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs, zs->depth[3] = dw4; zs->depth[4] = dw6; - zs->depth_format = format; - return true; } @@ -683,11 +624,15 @@ ilo_state_zs_init(struct ilo_state_zs *zs, const struct ilo_dev *dev, else ret &= zs_set_gen6_null_3DSTATE_STENCIL_BUFFER(zs, dev); - if (info->z_img && info->hiz_enable) + if (info->z_img && info->hiz_vma) ret &= zs_set_gen6_3DSTATE_HIER_DEPTH_BUFFER(zs, dev, info); else ret &= zs_set_gen6_null_3DSTATE_HIER_DEPTH_BUFFER(zs, dev); + zs->z_vma = info->z_vma; + zs->s_vma = info->s_vma; + zs->hiz_vma = info->hiz_vma; + zs->z_readonly = info->z_readonly; zs->s_readonly = info->s_readonly; @@ -703,6 +648,8 @@ ilo_state_zs_init_for_null(struct ilo_state_zs *zs, struct ilo_state_zs_info info; memset(&info, 0, sizeof(info)); + info.type = GEN6_SURFTYPE_NULL; + info.format = GEN6_ZFORMAT_D32_FLOAT; return ilo_state_zs_init(zs, dev, &info); } @@ -720,8 +667,11 @@ ilo_state_zs_disable_hiz(struct ilo_state_zs *zs, */ assert(ilo_dev_gen(dev) >= ILO_GEN(7)); - zs->depth[0] &= ~GEN7_DEPTH_DW1_HIZ_ENABLE; - zs_set_gen6_null_3DSTATE_HIER_DEPTH_BUFFER(zs, dev); + if (zs->hiz_vma) { + zs->depth[0] &= ~GEN7_DEPTH_DW1_HIZ_ENABLE; + zs_set_gen6_null_3DSTATE_HIER_DEPTH_BUFFER(zs, dev); + zs->hiz_vma = NULL; + } return true; } diff --git a/src/gallium/drivers/ilo/core/ilo_state_zs.h b/src/gallium/drivers/ilo/core/ilo_state_zs.h index 98212daf74f..6a25a873897 100644 --- a/src/gallium/drivers/ilo/core/ilo_state_zs.h +++ b/src/gallium/drivers/ilo/core/ilo_state_zs.h @@ -29,28 +29,31 @@ #define ILO_STATE_ZS_H #include "genhw/genhw.h" -#include "intel_winsys.h" #include 
"ilo_core.h" #include "ilo_dev.h" +struct ilo_vma; struct ilo_image; struct ilo_state_zs_info { - /* both are optional */ + /* both optional */ const struct ilo_image *z_img; const struct ilo_image *s_img; + uint8_t level; + uint16_t slice_base; + uint16_t slice_count; + + const struct ilo_vma *z_vma; + const struct ilo_vma *s_vma; + const struct ilo_vma *hiz_vma; + + enum gen_surface_type type; + enum gen_depth_format format; /* ignored prior to Gen7 */ bool z_readonly; bool s_readonly; - - bool hiz_enable; - bool is_cube_map; - - uint8_t level; - uint16_t slice_base; - uint16_t slice_count; }; struct ilo_state_zs { @@ -58,16 +61,12 @@ struct ilo_state_zs { uint32_t stencil[3]; uint32_t hiz[3]; - /* TODO move this to ilo_image */ - enum gen_depth_format depth_format; + const struct ilo_vma *z_vma; + const struct ilo_vma *s_vma; + const struct ilo_vma *hiz_vma; bool z_readonly; bool s_readonly; - - /* managed by users */ - struct intel_bo *depth_bo; - struct intel_bo *stencil_bo; - struct intel_bo *hiz_bo; }; bool @@ -83,11 +82,4 @@ bool ilo_state_zs_disable_hiz(struct ilo_state_zs *zs, const struct ilo_dev *dev); -static inline enum gen_depth_format -ilo_state_zs_get_depth_format(const struct ilo_state_zs *zs, - const struct ilo_dev *dev) -{ - return zs->depth_format; -} - #endif /* ILO_STATE_ZS_H */ diff --git a/src/gallium/drivers/ilo/core/ilo_buffer.h b/src/gallium/drivers/ilo/core/ilo_vma.h index ca3c61ff890..ad2a1d4b33e 100644 --- a/src/gallium/drivers/ilo/core/ilo_buffer.h +++ b/src/gallium/drivers/ilo/core/ilo_vma.h @@ -1,7 +1,7 @@ /* * Mesa 3-D graphics library * - * Copyright (C) 2012-2013 LunarG, Inc. + * Copyright (C) 2015 LunarG, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -25,40 +25,49 @@ * Chia-I Wu <[email protected]> */ -#ifndef ILO_BUFFER_H -#define ILO_BUFFER_H - -#include "intel_winsys.h" +#ifndef ILO_VMA_H +#define ILO_VMA_H #include "ilo_core.h" #include "ilo_debug.h" #include "ilo_dev.h" -struct ilo_buffer { - unsigned bo_size; +struct intel_bo; + +/** + * A virtual memory area. + */ +struct ilo_vma { + /* address space */ + uint32_t vm_size; + uint32_t vm_alignment; - /* managed by users */ + /* backing storage */ struct intel_bo *bo; + uint32_t bo_offset; }; -static inline void -ilo_buffer_init(struct ilo_buffer *buf, const struct ilo_dev *dev, - unsigned size, uint32_t bind, uint32_t flags) +static inline bool +ilo_vma_init(struct ilo_vma *vma, const struct ilo_dev *dev, + uint32_t size, uint32_t alignment) { - assert(ilo_is_zeroed(buf, sizeof(*buf))); + assert(ilo_is_zeroed(vma, sizeof(*vma))); + assert(size && alignment); + + vma->vm_alignment = alignment; + vma->vm_size = size; - buf->bo_size = size; + return true; +} + +static inline void +ilo_vma_set_bo(struct ilo_vma *vma, const struct ilo_dev *dev, + struct intel_bo *bo, uint32_t offset) +{ + assert(offset % vma->vm_alignment == 0); - /* - * From the Sandy Bridge PRM, volume 1 part 1, page 118: - * - * "For buffers, which have no inherent "height," padding requirements - * are different. A buffer must be padded to the next multiple of 256 - * array elements, with an additional 16 bytes added beyond that to - * account for the L1 cache line." 
- */ - if (bind & PIPE_BIND_SAMPLER_VIEW) - buf->bo_size = align(buf->bo_size, 256) + 16; + vma->bo = bo; + vma->bo_offset = offset; } -#endif /* ILO_BUFFER_H */ +#endif /* ILO_VMA_H */ diff --git a/src/gallium/drivers/ilo/ilo_blitter_blt.c b/src/gallium/drivers/ilo/ilo_blitter_blt.c index d55dc35e360..66203e86137 100644 --- a/src/gallium/drivers/ilo/ilo_blitter_blt.c +++ b/src/gallium/drivers/ilo/ilo_blitter_blt.c @@ -127,7 +127,7 @@ ilo_blitter_blt_end(struct ilo_blitter *blitter, uint32_t swctrl) static bool buf_clear_region(struct ilo_blitter *blitter, - struct ilo_buffer *buf, unsigned offset, + struct ilo_buffer_resource *buf, unsigned offset, uint32_t val, unsigned size, enum gen6_blt_mask value_mask, enum gen6_blt_mask write_mask) @@ -140,8 +140,8 @@ buf_clear_region(struct ilo_blitter *blitter, if (offset % cpp || size % cpp) return false; - dst.bo = buf->bo; - dst.offset = offset; + dst.bo = buf->vma.bo; + dst.offset = buf->vma.bo_offset + offset; ilo_blitter_blt_begin(blitter, GEN6_COLOR_BLT__SIZE * (1 + size / 32764 / gen6_blt_max_scanlines), @@ -179,25 +179,26 @@ buf_clear_region(struct ilo_blitter *blitter, static bool buf_copy_region(struct ilo_blitter *blitter, - struct ilo_buffer *dst_buf, unsigned dst_offset, - struct ilo_buffer *src_buf, unsigned src_offset, + struct ilo_buffer_resource *dst_buf, unsigned dst_offset, + struct ilo_buffer_resource *src_buf, unsigned src_offset, unsigned size) { const uint8_t rop = 0xcc; /* SRCCOPY */ struct ilo_builder *builder = &blitter->ilo->cp->builder; struct gen6_blt_bo dst, src; - dst.bo = dst_buf->bo; - dst.offset = dst_offset; + dst.bo = dst_buf->vma.bo; + dst.offset = dst_buf->vma.bo_offset + dst_offset; dst.pitch = 0; - src.bo = src_buf->bo; - src.offset = src_offset; + src.bo = src_buf->vma.bo; + src.offset = src_buf->vma.bo_offset + src_offset; src.pitch = 0; ilo_blitter_blt_begin(blitter, GEN6_SRC_COPY_BLT__SIZE * (1 + size / 32764 / gen6_blt_max_scanlines), - dst_buf->bo, GEN6_TILING_NONE, src_buf->bo, GEN6_TILING_NONE); + dst_buf->vma.bo, GEN6_TILING_NONE, + src_buf->vma.bo, GEN6_TILING_NONE); while (size) { unsigned width, height; @@ -258,14 +259,14 @@ tex_clear_region(struct ilo_blitter *blitter, if (dst_box->width * cpp > gen6_blt_max_bytes_per_scanline) return false; - dst.bo = dst_tex->image.bo; - dst.offset = 0; + dst.bo = dst_tex->vma.bo; + dst.offset = dst_tex->vma.bo_offset; dst.pitch = dst_tex->image.bo_stride; dst.tiling = dst_tex->image.tiling; swctrl = ilo_blitter_blt_begin(blitter, GEN6_XY_COLOR_BLT__SIZE * dst_box->depth, - dst_tex->image.bo, dst_tex->image.tiling, NULL, GEN6_TILING_NONE); + dst_tex->vma.bo, dst_tex->image.tiling, NULL, GEN6_TILING_NONE); for (slice = 0; slice < dst_box->depth; slice++) { unsigned x, y; @@ -299,7 +300,7 @@ tex_copy_region(struct ilo_blitter *blitter, const struct pipe_box *src_box) { const struct util_format_description *desc = - util_format_description(dst_tex->image.format); + util_format_description(dst_tex->image_format); const unsigned max_extent = 32767; /* INT16_MAX */ const uint8_t rop = 0xcc; /* SRCCOPY */ struct ilo_builder *builder = &blitter->ilo->cp->builder; @@ -347,13 +348,13 @@ tex_copy_region(struct ilo_blitter *blitter, break; } - dst.bo = dst_tex->image.bo; - dst.offset = 0; + dst.bo = dst_tex->vma.bo; + dst.offset = dst_tex->vma.bo_offset; dst.pitch = dst_tex->image.bo_stride; dst.tiling = dst_tex->image.tiling; - src.bo = src_tex->image.bo; - src.offset = 0; + src.bo = src_tex->vma.bo; + src.offset = src_tex->vma.bo_offset; src.pitch = 
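The point of the new struct is to separate an address range from its backing storage, so one bo can back several areas at distinct offsets; every consumer then biases hardware offsets by bo_offset, as the blitter hunks above now do. A sketch under that assumption (sizes are arbitrary):

```c
#include <string.h>

static void
example_shared_bo(const struct ilo_dev *dev, struct intel_winsys *winsys)
{
   struct ilo_vma a, b;
   struct intel_bo *bo;

   memset(&a, 0, sizeof(a));   /* ilo_vma_init() asserts a zeroed struct */
   memset(&b, 0, sizeof(b));
   ilo_vma_init(&a, dev, 64 * 1024, 4096);
   ilo_vma_init(&b, dev, 64 * 1024, 4096);

   /* one 128KB bo backs both areas */
   bo = intel_winsys_alloc_bo(winsys, "shared", 128 * 1024, false);
   ilo_vma_set_bo(&a, dev, bo, 0);
   ilo_vma_set_bo(&b, dev, bo, 64 * 1024); /* offset must respect vm_alignment */

   /* a write at byte off within b lands at bo offset b.bo_offset + off */
}
```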
src_tex->image.bo_stride; src.tiling = src_tex->image.tiling; @@ -423,8 +424,8 @@ ilo_blitter_blt_copy_resource(struct ilo_blitter *blitter, src_box->height == 1 && src_box->depth == 1); - success = buf_copy_region(blitter, - ilo_buffer(dst), dst_offset, ilo_buffer(src), src_offset, size); + success = buf_copy_region(blitter, ilo_buffer_resource(dst), dst_offset, + ilo_buffer_resource(src), src_offset, size); } else if (dst->target != PIPE_BUFFER && src->target != PIPE_BUFFER) { success = tex_copy_region(blitter, @@ -488,7 +489,7 @@ ilo_blitter_blt_clear_rt(struct ilo_blitter *blitter, if (offset + size > end) size = end - offset; - success = buf_clear_region(blitter, ilo_buffer(rt->texture), + success = buf_clear_region(blitter, ilo_buffer_resource(rt->texture), offset, packed.ui[0], size, mask, mask); } else { diff --git a/src/gallium/drivers/ilo/ilo_blitter_rectlist.c b/src/gallium/drivers/ilo/ilo_blitter_rectlist.c index 13c8f500680..86e67084d6e 100644 --- a/src/gallium/drivers/ilo/ilo_blitter_rectlist.c +++ b/src/gallium/drivers/ilo/ilo_blitter_rectlist.c @@ -318,7 +318,7 @@ hiz_can_clear_zs(const struct ilo_blitter *blitter, * The truth is when HiZ is enabled, separate stencil is also enabled on * all GENs. The depth buffer format cannot be combined depth/stencil. */ - switch (tex->image.format) { + switch (tex->image_format) { case PIPE_FORMAT_Z16_UNORM: if (ilo_dev_gen(blitter->ilo->dev) == ILO_GEN(6) && tex->base.width0 % 16) @@ -355,7 +355,7 @@ ilo_blitter_rectlist_clear_zs(struct ilo_blitter *blitter, if (ilo_dev_gen(blitter->ilo->dev) >= ILO_GEN(8)) clear_value = fui(depth); else - clear_value = util_pack_z(tex->image.format, depth); + clear_value = util_pack_z(tex->image_format, depth); ilo_blit_resolve_surface(blitter->ilo, zs, ILO_TEXTURE_RENDER_WRITE | ILO_TEXTURE_CLEAR); diff --git a/src/gallium/drivers/ilo/ilo_common.h b/src/gallium/drivers/ilo/ilo_common.h index 9ebbf76e81e..3dbe79fb872 100644 --- a/src/gallium/drivers/ilo/ilo_common.h +++ b/src/gallium/drivers/ilo/ilo_common.h @@ -28,6 +28,14 @@ #ifndef ILO_COMMON_H #define ILO_COMMON_H +#include "pipe/p_format.h" +#include "pipe/p_defines.h" + +#include "util/list.h" +#include "util/u_format.h" +#include "util/u_inlines.h" +#include "util/u_pointer.h" + #include "core/ilo_core.h" #include "core/ilo_debug.h" #include "core/ilo_dev.h" diff --git a/src/gallium/drivers/ilo/ilo_context.c b/src/gallium/drivers/ilo/ilo_context.c index 3d5c7b636a8..b9a16aab81d 100644 --- a/src/gallium/drivers/ilo/ilo_context.c +++ b/src/gallium/drivers/ilo/ilo_context.c @@ -62,6 +62,8 @@ ilo_flush(struct pipe_context *pipe, (flags & PIPE_FLUSH_END_OF_FRAME) ? 
"frame end" : "user request"); if (f) { + struct pipe_screen *screen = pipe->screen; + screen->fence_reference(screen, f, NULL); *f = ilo_screen_fence_create(pipe->screen, ilo->cp->last_submitted_bo); } } diff --git a/src/gallium/drivers/ilo/ilo_draw.c b/src/gallium/drivers/ilo/ilo_draw.c index e8e1a4cd14c..433348d9326 100644 --- a/src/gallium/drivers/ilo/ilo_draw.c +++ b/src/gallium/drivers/ilo/ilo_draw.c @@ -444,6 +444,7 @@ draw_vbo_with_sw_restart(struct ilo_context *ilo, const struct pipe_draw_info *info) { const struct ilo_ib_state *ib = &ilo->state_vector.ib; + const struct ilo_vma *vma; union { const void *ptr; const uint8_t *u8; @@ -453,10 +454,12 @@ draw_vbo_with_sw_restart(struct ilo_context *ilo, /* we will draw with IB mapped */ if (ib->state.buffer) { - u.ptr = intel_bo_map(ilo_buffer(ib->state.buffer)->bo, false); + vma = ilo_resource_get_vma(ib->state.buffer); + u.ptr = intel_bo_map(vma->bo, false); if (u.ptr) - u.u8 += ib->state.offset; + u.u8 += vma->bo_offset + ib->state.offset; } else { + vma = NULL; u.ptr = ib->state.user_buffer; } @@ -500,8 +503,8 @@ draw_vbo_with_sw_restart(struct ilo_context *ilo, #undef DRAW_VBO_WITH_SW_RESTART - if (ib->state.buffer) - intel_bo_unmap(ilo_buffer(ib->state.buffer)->bo); + if (vma) + intel_bo_unmap(vma->bo); } static bool diff --git a/src/gallium/drivers/ilo/ilo_format.h b/src/gallium/drivers/ilo/ilo_format.h index 4e955c09c14..0a19c02659e 100644 --- a/src/gallium/drivers/ilo/ilo_format.h +++ b/src/gallium/drivers/ilo/ilo_format.h @@ -165,4 +165,39 @@ ilo_format_translate_vertex(const struct ilo_dev *dev, return ilo_format_translate(dev, format, PIPE_BIND_VERTEX_BUFFER); } +static inline enum gen_depth_format +ilo_format_translate_depth(const struct ilo_dev *dev, + enum pipe_format format) +{ + if (ilo_dev_gen(dev) >= ILO_GEN(7)) { + switch (format) { + case PIPE_FORMAT_Z32_FLOAT: + return GEN6_ZFORMAT_D32_FLOAT; + case PIPE_FORMAT_Z24X8_UNORM: + return GEN6_ZFORMAT_D24_UNORM_X8_UINT; + case PIPE_FORMAT_Z16_UNORM: + return GEN6_ZFORMAT_D16_UNORM; + default: + assert(!"unknown depth format"); + return GEN6_ZFORMAT_D32_FLOAT; + } + } else { + switch (format) { + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + return GEN6_ZFORMAT_D32_FLOAT_S8X24_UINT; + case PIPE_FORMAT_Z32_FLOAT: + return GEN6_ZFORMAT_D32_FLOAT; + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + return GEN6_ZFORMAT_D24_UNORM_S8_UINT; + case PIPE_FORMAT_Z24X8_UNORM: + return GEN6_ZFORMAT_D24_UNORM_X8_UINT; + case PIPE_FORMAT_Z16_UNORM: + return GEN6_ZFORMAT_D16_UNORM; + default: + assert(!"unknown depth format"); + return GEN6_ZFORMAT_D32_FLOAT; + } + } +} + #endif /* ILO_FORMAT_H */ diff --git a/src/gallium/drivers/ilo/ilo_render_surface.c b/src/gallium/drivers/ilo/ilo_render_surface.c index ad053564294..3bf8646b344 100644 --- a/src/gallium/drivers/ilo/ilo_render_surface.c +++ b/src/gallium/drivers/ilo/ilo_render_surface.c @@ -42,14 +42,17 @@ gen6_so_SURFACE_STATE(struct ilo_builder *builder, const struct pipe_stream_output_info *so_info, int so_index) { - struct ilo_buffer *buf = ilo_buffer(so->buffer); struct ilo_state_surface_buffer_info info; struct ilo_state_surface surf; ILO_DEV_ASSERT(builder->dev, 6, 6); memset(&info, 0, sizeof(info)); - info.buf = buf; + + info.vma = ilo_resource_get_vma(so->buffer); + info.offset = so->buffer_offset + so_info->output[so_index].dst_offset * 4; + info.size = so->buffer_size - so_info->output[so_index].dst_offset * 4; + info.access = ILO_STATE_SURFACE_ACCESS_DP_SVB; switch (so_info->output[so_index].num_components) { @@ -78,12 +81,9 @@ 
gen6_so_SURFACE_STATE(struct ilo_builder *builder, info.struct_size = so_info->stride[so_info->output[so_index].output_buffer] * 4; - info.offset = so->buffer_offset + so_info->output[so_index].dst_offset * 4; - info.size = so->buffer_size - so_info->output[so_index].dst_offset * 4; memset(&surf, 0, sizeof(surf)); ilo_state_surface_init_for_buffer(&surf, builder->dev, &info); - surf.bo = info.buf->bo; return gen6_SURFACE_STATE(builder, &surf); } @@ -482,18 +482,19 @@ gen6_emit_launch_grid_surface_const(struct ilo_render *r, return; memset(&info, 0, sizeof(info)); - info.buf = ilo_buffer(session->input->buffer); + + info.vma = ilo_resource_get_vma(session->input->buffer); + info.offset = session->input->buffer_offset; + info.size = session->input->buffer_size; + info.access = ILO_STATE_SURFACE_ACCESS_DP_UNTYPED; info.format = GEN6_FORMAT_RAW; info.format_size = 1; info.struct_size = 1; info.readonly = true; - info.offset = session->input->buffer_offset; - info.size = session->input->buffer_size; memset(&surf, 0, sizeof(surf)); ilo_state_surface_init_for_buffer(&surf, r->dev, &info); - surf.bo = info.buf->bo; assert(count == 1 && session->input->buffer); surface_state[base] = gen6_SURFACE_STATE(r->builder, &surf); @@ -538,23 +539,23 @@ gen6_emit_launch_grid_surface_global(struct ilo_render *r, surface_state += base; for (i = 0; i < count; i++) { if (i < vec->global_binding.count && bindings[i].resource) { - const struct ilo_buffer *buf = ilo_buffer(bindings[i].resource); struct ilo_state_surface_buffer_info info; struct ilo_state_surface surf; assert(bindings[i].resource->target == PIPE_BUFFER); memset(&info, 0, sizeof(info)); - info.buf = buf; + + info.vma = ilo_resource_get_vma(bindings[i].resource); + info.size = info.vma->vm_size; + info.access = ILO_STATE_SURFACE_ACCESS_DP_UNTYPED; info.format = GEN6_FORMAT_RAW; info.format_size = 1; info.struct_size = 1; - info.size = buf->bo_size; memset(&surf, 0, sizeof(surf)); ilo_state_surface_init_for_buffer(&surf, r->dev, &info); - surf.bo = info.buf->bo; surface_state[i] = gen6_SURFACE_STATE(r->builder, &surf); } else { diff --git a/src/gallium/drivers/ilo/ilo_resource.c b/src/gallium/drivers/ilo/ilo_resource.c index be9fd10a84c..9026ba9a983 100644 --- a/src/gallium/drivers/ilo/ilo_resource.c +++ b/src/gallium/drivers/ilo/ilo_resource.c @@ -25,7 +25,12 @@ * Chia-I Wu <[email protected]> */ +#include "core/ilo_state_vf.h" +#include "core/ilo_state_sol.h" +#include "core/ilo_state_surface.h" + #include "ilo_screen.h" +#include "ilo_format.h" #include "ilo_resource.h" /* @@ -83,6 +88,134 @@ resource_get_cpu_init(const struct pipe_resource *templ) PIPE_BIND_STREAM_OUTPUT)) ? 
false : true; } +static enum gen_surface_type +get_surface_type(enum pipe_texture_target target) +{ + switch (target) { + case PIPE_TEXTURE_1D: + case PIPE_TEXTURE_1D_ARRAY: + return GEN6_SURFTYPE_1D; + case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_RECT: + case PIPE_TEXTURE_2D_ARRAY: + return GEN6_SURFTYPE_2D; + case PIPE_TEXTURE_3D: + return GEN6_SURFTYPE_3D; + case PIPE_TEXTURE_CUBE: + case PIPE_TEXTURE_CUBE_ARRAY: + return GEN6_SURFTYPE_CUBE; + default: + assert(!"unknown texture target"); + return GEN6_SURFTYPE_NULL; + } +} + +static enum pipe_format +resource_get_image_format(const struct pipe_resource *templ, + const struct ilo_dev *dev, + bool *separate_stencil_ret) +{ + enum pipe_format format = templ->format; + bool separate_stencil; + + /* silently promote ETC1 */ + if (templ->format == PIPE_FORMAT_ETC1_RGB8) + format = PIPE_FORMAT_R8G8B8X8_UNORM; + + /* separate stencil buffers */ + separate_stencil = false; + if ((templ->bind & PIPE_BIND_DEPTH_STENCIL) && + util_format_is_depth_and_stencil(templ->format)) { + switch (templ->format) { + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + /* Gen6 requires HiZ to be available for all levels */ + if (ilo_dev_gen(dev) >= ILO_GEN(7) || templ->last_level == 0) { + format = PIPE_FORMAT_Z32_FLOAT; + separate_stencil = true; + } + break; + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + format = PIPE_FORMAT_Z24X8_UNORM; + separate_stencil = true; + break; + default: + break; + } + } + + if (separate_stencil_ret) + *separate_stencil_ret = separate_stencil; + + return format; +} + +static inline enum gen_surface_format +pipe_to_surface_format(const struct ilo_dev *dev, enum pipe_format format) +{ + switch (format) { + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + return GEN6_FORMAT_R32_FLOAT_X8X24_TYPELESS; + case PIPE_FORMAT_Z32_FLOAT: + return GEN6_FORMAT_R32_FLOAT; + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + case PIPE_FORMAT_Z24X8_UNORM: + return GEN6_FORMAT_R24_UNORM_X8_TYPELESS; + case PIPE_FORMAT_Z16_UNORM: + return GEN6_FORMAT_R16_UNORM; + case PIPE_FORMAT_S8_UINT: + return GEN6_FORMAT_R8_UINT; + default: + return ilo_format_translate_color(dev, format); + } +} + +static void +resource_get_image_info(const struct pipe_resource *templ, + const struct ilo_dev *dev, + enum pipe_format image_format, + struct ilo_image_info *info) +{ + memset(info, 0, sizeof(*info)); + + info->type = get_surface_type(templ->target); + + info->format = pipe_to_surface_format(dev, image_format); + info->interleaved_stencil = util_format_is_depth_and_stencil(image_format); + info->is_integer = util_format_is_pure_integer(image_format); + info->compressed = util_format_is_compressed(image_format); + info->block_width = util_format_get_blockwidth(image_format); + info->block_height = util_format_get_blockheight(image_format); + info->block_size = util_format_get_blocksize(image_format); + + info->width = templ->width0; + info->height = templ->height0; + info->depth = templ->depth0; + info->array_size = templ->array_size; + info->level_count = templ->last_level + 1; + info->sample_count = (templ->nr_samples) ? templ->nr_samples : 1; + + info->aux_disable = (templ->usage == PIPE_USAGE_STAGING); + + if (templ->bind & PIPE_BIND_LINEAR) + info->valid_tilings = 1 << GEN6_TILING_NONE; + + /* + * Tiled images must be mapped via GTT to get a linear view. Prefer linear + * images when the image size is greater than one-fourth of the mappable + * aperture. 
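A sketch of what resource_get_image_format() decides for the common depth/stencil templates (it is a static function; the calls below are for illustration only, and ETC1 is likewise silently promoted to R8G8B8X8):

```c
static void
check_image_format_split(const struct ilo_dev *dev)
{
   struct pipe_resource templ;
   bool sep = false;

   memset(&templ, 0, sizeof(templ));
   templ.bind = PIPE_BIND_DEPTH_STENCIL;

   /* Z24S8 always splits into a Z24X8 image plus a separate S8 texture */
   templ.format = PIPE_FORMAT_Z24_UNORM_S8_UINT;
   assert(resource_get_image_format(&templ, dev, &sep) ==
          PIPE_FORMAT_Z24X8_UNORM && sep);

   /* mipmapped Z32F_S8X24 splits on Gen7+, but stays interleaved on Gen6,
    * where HiZ (and thus separate stencil) cannot cover all levels */
   templ.format = PIPE_FORMAT_Z32_FLOAT_S8X24_UINT;
   templ.last_level = 3;
   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
      assert(resource_get_image_format(&templ, dev, &sep) ==
             PIPE_FORMAT_Z32_FLOAT && sep);
   } else {
      assert(resource_get_image_format(&templ, dev, &sep) ==
             PIPE_FORMAT_Z32_FLOAT_S8X24_UINT && !sep);
   }
}
```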
+ */ + if (templ->bind & (PIPE_BIND_TRANSFER_WRITE | PIPE_BIND_TRANSFER_READ)) + info->prefer_linear_threshold = dev->aperture_mappable / 4; + + info->bind_surface_sampler = (templ->bind & PIPE_BIND_SAMPLER_VIEW); + info->bind_surface_dp_render = (templ->bind & PIPE_BIND_RENDER_TARGET); + info->bind_surface_dp_typed = (templ->bind & + (PIPE_BIND_SHADER_IMAGE | PIPE_BIND_COMPUTE_RESOURCE)); + info->bind_zs = (templ->bind & PIPE_BIND_DEPTH_STENCIL); + info->bind_scanout = (templ->bind & PIPE_BIND_SCANOUT); + info->bind_cursor = (templ->bind & PIPE_BIND_CURSOR); +} + static enum gen_surface_tiling winsys_to_surface_tiling(enum intel_tiling_mode tiling) { @@ -178,8 +311,8 @@ tex_create_bo(struct ilo_texture *tex) if (!bo) return false; - intel_bo_unref(tex->image.bo); - tex->image.bo = bo; + intel_bo_unref(tex->vma.bo); + ilo_vma_set_bo(&tex->vma, &is->dev, bo, 0); return true; } @@ -206,7 +339,7 @@ tex_create_separate_stencil(struct ilo_texture *tex) tex->separate_s8 = ilo_texture(s8); - assert(tex->separate_s8->image.format == PIPE_FORMAT_S8_UINT); + assert(tex->separate_s8->image_format == PIPE_FORMAT_S8_UINT); return true; } @@ -215,15 +348,16 @@ static bool tex_create_hiz(struct ilo_texture *tex) { const struct pipe_resource *templ = &tex->base; + const uint32_t size = tex->image.aux.bo_stride * tex->image.aux.bo_height; struct ilo_screen *is = ilo_screen(tex->base.screen); struct intel_bo *bo; - bo = intel_winsys_alloc_bo(is->dev.winsys, "hiz texture", - tex->image.aux.bo_stride * tex->image.aux.bo_height, false); + bo = intel_winsys_alloc_bo(is->dev.winsys, "hiz texture", size, false); if (!bo) return false; - tex->image.aux.bo = bo; + ilo_vma_init(&tex->aux_vma, &is->dev, size, 4096); + ilo_vma_set_bo(&tex->aux_vma, &is->dev, bo, 0); if (tex->imported) { unsigned lv; @@ -246,17 +380,18 @@ tex_create_hiz(struct ilo_texture *tex) static bool tex_create_mcs(struct ilo_texture *tex) { + const uint32_t size = tex->image.aux.bo_stride * tex->image.aux.bo_height; struct ilo_screen *is = ilo_screen(tex->base.screen); struct intel_bo *bo; assert(tex->image.aux.enables == (1 << (tex->base.last_level + 1)) - 1); - bo = intel_winsys_alloc_bo(is->dev.winsys, "mcs texture", - tex->image.aux.bo_stride * tex->image.aux.bo_height, false); + bo = intel_winsys_alloc_bo(is->dev.winsys, "mcs texture", size, false); if (!bo) return false; - tex->image.aux.bo = bo; + ilo_vma_init(&tex->aux_vma, &is->dev, size, 4096); + ilo_vma_set_bo(&tex->aux_vma, &is->dev, bo, 0); return true; } @@ -267,8 +402,8 @@ tex_destroy(struct ilo_texture *tex) if (tex->separate_s8) tex_destroy(tex->separate_s8); - intel_bo_unref(tex->image.bo); - intel_bo_unref(tex->image.aux.bo); + intel_bo_unref(tex->vma.bo); + intel_bo_unref(tex->aux_vma.bo); tex_free_slices(tex); FREE(tex); @@ -277,24 +412,16 @@ tex_destroy(struct ilo_texture *tex) static bool tex_alloc_bos(struct ilo_texture *tex) { - struct ilo_screen *is = ilo_screen(tex->base.screen); - if (!tex->imported && !tex_create_bo(tex)) return false; - /* allocate separate stencil resource */ - if (tex->image.separate_stencil && !tex_create_separate_stencil(tex)) - return false; - switch (tex->image.aux.type) { case ILO_IMAGE_AUX_HIZ: - if (!tex_create_hiz(tex) && - !ilo_image_disable_aux(&tex->image, &is->dev)) + if (!tex_create_hiz(tex)) return false; break; case ILO_IMAGE_AUX_MCS: - if (!tex_create_mcs(tex) && - !ilo_image_disable_aux(&tex->image, &is->dev)) + if (!tex_create_mcs(tex)) return false; break; default: @@ -304,9 +431,10 @@ tex_alloc_bos(struct ilo_texture *tex) 
return true; } -static bool +static struct intel_bo * tex_import_handle(struct ilo_texture *tex, - const struct winsys_handle *handle) + const struct winsys_handle *handle, + struct ilo_image_info *info) { struct ilo_screen *is = ilo_screen(tex->base.screen); const struct pipe_resource *templ = &tex->base; @@ -317,45 +445,94 @@ tex_import_handle(struct ilo_texture *tex, bo = intel_winsys_import_handle(is->dev.winsys, name, handle, tex->image.bo_height, &tiling, &pitch); - if (!bo) - return false; + /* modify image info */ + if (bo) { + const uint8_t valid_tilings = 1 << winsys_to_surface_tiling(tiling); - if (!ilo_image_init_for_imported(&tex->image, &is->dev, templ, - winsys_to_surface_tiling(tiling), pitch)) { - ilo_err("failed to import handle for texture\n"); - intel_bo_unref(bo); - return false; - } + if (info->valid_tilings && !(info->valid_tilings & valid_tilings)) { + intel_bo_unref(bo); + return NULL; + } - tex->image.bo = bo; + info->valid_tilings = valid_tilings; + info->force_bo_stride = pitch; - tex->imported = true; + /* assume imported RTs are also scanouts */ + if (!info->bind_scanout) + info->bind_scanout = (templ->usage & PIPE_BIND_RENDER_TARGET); + } - return true; + return bo; } static bool tex_init_image(struct ilo_texture *tex, - const struct winsys_handle *handle) + const struct winsys_handle *handle, + bool *separate_stencil) { struct ilo_screen *is = ilo_screen(tex->base.screen); const struct pipe_resource *templ = &tex->base; struct ilo_image *img = &tex->image; + struct intel_bo *imported_bo = NULL;; + struct ilo_image_info info; + + tex->image_format = resource_get_image_format(templ, + &is->dev, separate_stencil); + resource_get_image_info(templ, &is->dev, tex->image_format, &info); if (handle) { - if (!tex_import_handle(tex, handle)) + imported_bo = tex_import_handle(tex, handle, &info); + if (!imported_bo) return false; - } else { - ilo_image_init(img, &is->dev, templ); } - if (img->bo_height > ilo_max_resource_size / img->bo_stride) + if (!ilo_image_init(img, &is->dev, &info)) { + intel_bo_unref(imported_bo); return false; + } + + /* + * HiZ requires 8x4 alignment and some levels might need HiZ disabled. It + * is generally fine except on Gen6, where HiZ and separate stencil must be + * enabled together. For PIPE_FORMAT_Z24X8_UNORM with separate stencil, we + * can live with stencil values being interleaved for levels where HiZ is + * disabled. But it is not the case for PIPE_FORMAT_Z32_FLOAT with + * separate stencil. If HiZ was disabled for a level, we had to change the + * format to PIPE_FORMAT_Z32_FLOAT_S8X24_UINT for the level and that format + * had a different bpp. In other words, HiZ has to be available for all + * levels. 
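The "different bpp" the comment refers to is concrete: the split and interleaved Z32 formats differ by a factor of two, so flipping a single level's format would change that level's size and shift every later level's offset. A minimal check:

```c
#include <assert.h>
#include "util/u_format.h"

static void
check_z32f_blocksizes(void)
{
   assert(util_format_get_blocksize(PIPE_FORMAT_Z32_FLOAT) == 4);
   assert(util_format_get_blocksize(PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) == 8);
}
```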
+ */ + if (ilo_dev_gen(&is->dev) == ILO_GEN(6) && + templ->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT && + tex->image_format == PIPE_FORMAT_Z32_FLOAT && + img->aux.enables != (1 << templ->last_level)) { + tex->image_format = templ->format; + info.format = pipe_to_surface_format(&is->dev, tex->image_format); + info.interleaved_stencil = true; + + memset(img, 0, sizeof(*img)); + if (!ilo_image_init(img, &is->dev, &info)) { + intel_bo_unref(imported_bo); + return false; + } + } + + if (img->bo_height > ilo_max_resource_size / img->bo_stride || + !ilo_vma_init(&tex->vma, &is->dev, img->bo_stride * img->bo_height, + 4096)) { + intel_bo_unref(imported_bo); + return false; + } + + if (imported_bo) { + ilo_vma_set_bo(&tex->vma, &is->dev, imported_bo, 0); + tex->imported = true; + } if (templ->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) { /* require on-the-fly tiling/untiling or format conversion */ - if (img->tiling == GEN8_TILING_W || img->separate_stencil || - img->format != templ->format) + if (img->tiling == GEN8_TILING_W || *separate_stencil || + tex->image_format != templ->format) return false; } @@ -371,6 +548,7 @@ tex_create(struct pipe_screen *screen, const struct winsys_handle *handle) { struct ilo_texture *tex; + bool separate_stencil; tex = CALLOC_STRUCT(ilo_texture); if (!tex) @@ -380,12 +558,13 @@ tex_create(struct pipe_screen *screen, tex->base.screen = screen; pipe_reference_init(&tex->base.reference, 1); - if (!tex_init_image(tex, handle)) { + if (!tex_init_image(tex, handle, &separate_stencil)) { FREE(tex); return NULL; } - if (!tex_alloc_bos(tex)) { + if (!tex_alloc_bos(tex) || + (separate_stencil && !tex_create_separate_stencil(tex))) { tex_destroy(tex); return NULL; } @@ -406,7 +585,7 @@ tex_get_handle(struct ilo_texture *tex, struct winsys_handle *handle) else tiling = surface_to_winsys_tiling(tex->image.tiling); - err = intel_winsys_export_handle(is->dev.winsys, tex->image.bo, tiling, + err = intel_winsys_export_handle(is->dev.winsys, tex->vma.bo, tiling, tex->image.bo_stride, tex->image.bo_height, handle); return !err; @@ -420,13 +599,12 @@ buf_create_bo(struct ilo_buffer_resource *buf) const bool cpu_init = resource_get_cpu_init(&buf->base); struct intel_bo *bo; - bo = intel_winsys_alloc_bo(is->dev.winsys, name, - buf->buffer.bo_size, cpu_init); + bo = intel_winsys_alloc_bo(is->dev.winsys, name, buf->bo_size, cpu_init); if (!bo) return false; - intel_bo_unref(buf->buffer.bo); - buf->buffer.bo = bo; + intel_bo_unref(buf->vma.bo); + ilo_vma_set_bo(&buf->vma, &is->dev, bo, 0); return true; } @@ -434,7 +612,7 @@ buf_create_bo(struct ilo_buffer_resource *buf) static void buf_destroy(struct ilo_buffer_resource *buf) { - intel_bo_unref(buf->buffer.bo); + intel_bo_unref(buf->vma.bo); FREE(buf); } @@ -443,6 +621,7 @@ buf_create(struct pipe_screen *screen, const struct pipe_resource *templ) { const struct ilo_screen *is = ilo_screen(screen); struct ilo_buffer_resource *buf; + uint32_t alignment; unsigned size; buf = CALLOC_STRUCT(ilo_buffer_resource); @@ -471,10 +650,17 @@ buf_create(struct pipe_screen *screen, const struct pipe_resource *templ) ilo_dev_gen(&is->dev) < ILO_GEN(7.5)) size = align(size, 4096); - ilo_buffer_init(&buf->buffer, &is->dev, size, templ->bind, templ->flags); + if (templ->bind & PIPE_BIND_VERTEX_BUFFER) + size = ilo_state_vertex_buffer_size(&is->dev, size, &alignment); + if (templ->bind & PIPE_BIND_INDEX_BUFFER) + size = ilo_state_index_buffer_size(&is->dev, size, &alignment); + if (templ->bind & PIPE_BIND_STREAM_OUTPUT) + size = 
ilo_state_sol_buffer_size(&is->dev, size, &alignment); + + buf->bo_size = size; + ilo_vma_init(&buf->vma, &is->dev, buf->bo_size, 4096); - if (buf->buffer.bo_size < templ->width0 || - buf->buffer.bo_size > ilo_max_resource_size || + if (buf->bo_size < templ->width0 || buf->bo_size > ilo_max_resource_size || !buf_create_bo(buf)) { FREE(buf); return NULL; @@ -487,13 +673,30 @@ static boolean ilo_can_create_resource(struct pipe_screen *screen, const struct pipe_resource *templ) { + struct ilo_screen *is = ilo_screen(screen); + enum pipe_format image_format; + struct ilo_image_info info; struct ilo_image img; if (templ->target == PIPE_BUFFER) return (templ->width0 <= ilo_max_resource_size); + image_format = resource_get_image_format(templ, &is->dev, NULL); + resource_get_image_info(templ, &is->dev, image_format, &info); + memset(&img, 0, sizeof(img)); - ilo_image_init(&img, &ilo_screen(screen)->dev, templ); + ilo_image_init(&img, &ilo_screen(screen)->dev, &info); + + /* as in tex_init_image() */ + if (ilo_dev_gen(&is->dev) == ILO_GEN(6) && + templ->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT && + image_format == PIPE_FORMAT_Z32_FLOAT && + img.aux.enables != (1 << templ->last_level)) { + info.format = pipe_to_surface_format(&is->dev, templ->format); + info.interleaved_stencil = true; + memset(&img, 0, sizeof(img)); + ilo_image_init(&img, &ilo_screen(screen)->dev, &info); + } return (img.bo_height <= ilo_max_resource_size / img.bo_stride); } diff --git a/src/gallium/drivers/ilo/ilo_resource.h b/src/gallium/drivers/ilo/ilo_resource.h index d602e0cbf70..8378af54741 100644 --- a/src/gallium/drivers/ilo/ilo_resource.h +++ b/src/gallium/drivers/ilo/ilo_resource.h @@ -29,8 +29,8 @@ #define ILO_RESOURCE_H #include "core/intel_winsys.h" -#include "core/ilo_buffer.h" #include "core/ilo_image.h" +#include "core/ilo_vma.h" #include "ilo_common.h" #include "ilo_screen.h" @@ -92,7 +92,10 @@ struct ilo_texture { bool imported; + enum pipe_format image_format; struct ilo_image image; + struct ilo_vma vma; + struct ilo_vma aux_vma; /* XXX thread-safety */ struct ilo_texture_slice *slices[PIPE_MAX_TEXTURE_LEVELS]; @@ -103,14 +106,15 @@ struct ilo_texture { struct ilo_buffer_resource { struct pipe_resource base; - struct ilo_buffer buffer; + uint32_t bo_size; + struct ilo_vma vma; }; -static inline struct ilo_buffer * -ilo_buffer(struct pipe_resource *res) +static inline struct ilo_buffer_resource * +ilo_buffer_resource(struct pipe_resource *res) { - return (res && res->target == PIPE_BUFFER) ? - &((struct ilo_buffer_resource *) res)->buffer : NULL; + return (struct ilo_buffer_resource *) + ((res && res->target == PIPE_BUFFER) ? res : NULL); } static inline struct ilo_texture * @@ -127,13 +131,14 @@ bool ilo_resource_rename_bo(struct pipe_resource *res); /** - * Return the bo of the resource. + * Return the VMA of the resource. */ -static inline struct intel_bo * -ilo_resource_get_bo(struct pipe_resource *res) +static inline const struct ilo_vma * +ilo_resource_get_vma(struct pipe_resource *res) { return (res->target == PIPE_BUFFER) ? 
- ilo_buffer(res)->bo : ilo_texture(res)->image.bo; + &((struct ilo_buffer_resource *) res)->vma : + &((struct ilo_texture *) res)->vma; } static inline struct ilo_texture_slice * diff --git a/src/gallium/drivers/ilo/ilo_screen.c b/src/gallium/drivers/ilo/ilo_screen.c index 94105559b80..ab4d1377c9f 100644 --- a/src/gallium/drivers/ilo/ilo_screen.c +++ b/src/gallium/drivers/ilo/ilo_screen.c @@ -193,6 +193,7 @@ ilo_get_compute_param(struct pipe_screen *screen, uint32_t max_clock_frequency; uint32_t max_compute_units; uint32_t images_supported; + uint32_t subgroup_size; } val; const void *ptr; int size; @@ -284,6 +285,13 @@ ilo_get_compute_param(struct pipe_screen *screen, ptr = &val.images_supported; size = sizeof(val.images_supported); break; + case PIPE_COMPUTE_CAP_SUBGROUP_SIZE: + /* best case is actually SIMD32 */ + val.subgroup_size = 16; + + ptr = &val.subgroup_size; + size = sizeof(val.subgroup_size); + break; default: ptr = NULL; size = 0; @@ -443,6 +451,8 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_TEXTURE_GATHER_SM5: return 0; case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT: + case PIPE_CAP_TEXTURE_FLOAT_LINEAR: + case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: return true; case PIPE_CAP_FAKE_SW_MSAA: case PIPE_CAP_TEXTURE_QUERY_LOD: @@ -457,6 +467,8 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: + case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: + case PIPE_CAP_DEPTH_BOUNDS_TEST: return 0; case PIPE_CAP_VENDOR_ID: @@ -665,13 +677,6 @@ ilo_screen_fence_finish(struct pipe_screen *screen, return signaled; } -static boolean -ilo_screen_fence_signalled(struct pipe_screen *screen, - struct pipe_fence_handle *fence) -{ - return ilo_screen_fence_finish(screen, fence, 0); -} - /** * Create a fence for \p bo. When \p bo is not NULL, it must be submitted * before waited on or checked. 
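With fence_signalled removed just above (it was only a zero-timeout fence_finish, as the deleted wrapper shows), callers can poll as sketched below; the ilo_context.c hunk earlier shows the matching reference discipline of releasing the old fence through fence_reference() instead of silently overwriting it:

```c
/* what pipe_screen::fence_signalled amounted to: a zero-timeout wait */
static boolean
fence_signalled(struct pipe_screen *screen, struct pipe_fence_handle *fence)
{
   return screen->fence_finish(screen, fence, 0);
}
```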
@@ -738,7 +743,6 @@ ilo_screen_create(struct intel_winsys *ws) is->base.flush_frontbuffer = NULL; is->base.fence_reference = ilo_screen_fence_reference; - is->base.fence_signalled = ilo_screen_fence_signalled; is->base.fence_finish = ilo_screen_fence_finish; is->base.get_driver_query_info = NULL; diff --git a/src/gallium/drivers/ilo/ilo_state.c b/src/gallium/drivers/ilo/ilo_state.c index 63534f33fa7..d89765a9d23 100644 --- a/src/gallium/drivers/ilo/ilo_state.c +++ b/src/gallium/drivers/ilo/ilo_state.c @@ -379,13 +379,12 @@ finalize_cbuf_state(struct ilo_context *ilo, u_upload_data(ilo->uploader, 0, cbuf->cso[i].info.size, cbuf->cso[i].user_buffer, &offset, &cbuf->cso[i].resource); - cbuf->cso[i].info.buf = ilo_buffer(cbuf->cso[i].resource); + cbuf->cso[i].info.vma = ilo_resource_get_vma(cbuf->cso[i].resource); cbuf->cso[i].info.offset = offset; memset(&cbuf->cso[i].surface, 0, sizeof(cbuf->cso[i].surface)); ilo_state_surface_init_for_buffer(&cbuf->cso[i].surface, ilo->dev, &cbuf->cso[i].info); - cbuf->cso[i].surface.bo = cbuf->cso[i].info.buf->bo; ilo->state_vector.dirty |= ILO_DIRTY_CBUF; } @@ -466,11 +465,9 @@ finalize_index_buffer(struct ilo_context *ilo) memset(&info, 0, sizeof(info)); if (vec->ib.hw_resource) { - info.buf = ilo_buffer(vec->ib.hw_resource); - info.size = info.buf->bo_size; + info.vma = ilo_resource_get_vma(vec->ib.hw_resource); + info.size = info.vma->vm_size; info.format = ilo_translate_index_size(vec->ib.hw_index_size); - - vec->ib.ib.bo = info.buf->bo; } ilo_state_index_buffer_set_info(&vec->ib.ib, dev, &info); @@ -532,13 +529,11 @@ finalize_vertex_buffers(struct ilo_context *ilo) const struct pipe_vertex_buffer *cso = &vec->vb.states[pipe_idx]; if (cso->buffer) { - info.buf = ilo_buffer(cso->buffer); + info.vma = ilo_resource_get_vma(cso->buffer); info.offset = cso->buffer_offset; - info.size = info.buf->bo_size; + info.size = info.vma->vm_size - cso->buffer_offset; info.stride = cso->stride; - - vec->vb.vb[i].bo = info.buf->bo; } else { memset(&info, 0, sizeof(info)); } @@ -1566,24 +1561,23 @@ ilo_set_constant_buffer(struct pipe_context *pipe, cso->info.size = buf[i].buffer_size; if (buf[i].buffer) { - cso->info.buf = ilo_buffer(buf[i].buffer); + cso->info.vma = ilo_resource_get_vma(buf[i].buffer); cso->info.offset = buf[i].buffer_offset; memset(&cso->surface, 0, sizeof(cso->surface)); ilo_state_surface_init_for_buffer(&cso->surface, dev, &cso->info); - cso->surface.bo = cso->info.buf->bo; cso->user_buffer = NULL; cbuf->enabled_mask |= 1 << (index + i); } else if (buf[i].user_buffer) { - cso->info.buf = NULL; + cso->info.vma = NULL; /* buffer_offset does not apply for user buffer */ cso->user_buffer = buf[i].user_buffer; cbuf->enabled_mask |= 1 << (index + i); } else { - cso->info.buf = NULL; + cso->info.vma = NULL; cso->info.size = 0; cso->user_buffer = NULL; @@ -1596,7 +1590,7 @@ ilo_set_constant_buffer(struct pipe_context *pipe, pipe_resource_reference(&cso->resource, NULL); - cso->info.buf = NULL; + cso->info.vma = NULL; cso->info.size = 0; cso->user_buffer = NULL; @@ -1705,10 +1699,11 @@ ilo_set_framebuffer_state(struct pipe_context *pipe, if (state->zsbuf) { const struct ilo_surface_cso *cso = (const struct ilo_surface_cso *) state->zsbuf; + const struct ilo_texture *tex = ilo_texture(cso->base.texture); - fb->has_hiz = cso->u.zs.hiz_bo; + fb->has_hiz = cso->u.zs.hiz_vma; fb->depth_offset_format = - ilo_state_zs_get_depth_format(&cso->u.zs, dev); + ilo_format_translate_depth(dev, tex->image_format); } else { fb->has_hiz = false; fb->depth_offset_format = 
GEN6_ZFORMAT_D32_FLOAT; @@ -1854,10 +1849,11 @@ ilo_set_sampler_views(struct pipe_context *pipe, unsigned shader, } static void -ilo_set_shader_resources(struct pipe_context *pipe, - unsigned start, unsigned count, - struct pipe_surface **surfaces) +ilo_set_shader_images(struct pipe_context *pipe, unsigned shader, + unsigned start, unsigned count, + struct pipe_image_view **views) { +#if 0 struct ilo_state_vector *vec = &ilo_context(pipe)->state_vector; struct ilo_resource_state *dst = &vec->resource; unsigned i; @@ -1886,6 +1882,7 @@ ilo_set_shader_resources(struct pipe_context *pipe, } vec->dirty |= ILO_DIRTY_RESOURCE; +#endif } static void @@ -1945,12 +1942,11 @@ ilo_create_stream_output_target(struct pipe_context *pipe, target->base.buffer_size = buffer_size; memset(&info, 0, sizeof(info)); - info.buf = ilo_buffer(res); + info.vma = ilo_resource_get_vma(res); info.offset = buffer_offset; info.size = buffer_size; ilo_state_sol_buffer_init(&target->sb, dev, &info); - target->sb.bo = info.buf->bo; return &target->base; } @@ -2018,18 +2014,17 @@ ilo_create_sampler_view(struct pipe_context *pipe, struct ilo_state_surface_buffer_info info; memset(&info, 0, sizeof(info)); - info.buf = ilo_buffer(res); + info.vma = ilo_resource_get_vma(res); + info.offset = templ->u.buf.first_element * info.struct_size; + info.size = (templ->u.buf.last_element - + templ->u.buf.first_element + 1) * info.struct_size; info.access = ILO_STATE_SURFACE_ACCESS_SAMPLER; info.format = ilo_format_translate_color(dev, templ->format); info.format_size = util_format_get_blocksize(templ->format); info.struct_size = info.format_size; info.readonly = true; - info.offset = templ->u.buf.first_element * info.struct_size; - info.size = (templ->u.buf.last_element - - templ->u.buf.first_element + 1) * info.struct_size; ilo_state_surface_init_for_buffer(&view->surface, dev, &info); - view->surface.bo = info.buf->bo; } else { struct ilo_texture *tex = ilo_texture(res); struct ilo_state_surface_image_info info; @@ -2042,32 +2037,31 @@ ilo_create_sampler_view(struct pipe_context *pipe, } memset(&info, 0, sizeof(info)); + info.img = &tex->image; + info.level_base = templ->u.tex.first_level; + info.level_count = templ->u.tex.last_level - + templ->u.tex.first_level + 1; + info.slice_base = templ->u.tex.first_layer; + info.slice_count = templ->u.tex.last_layer - + templ->u.tex.first_layer + 1; + info.vma = &tex->vma; info.access = ILO_STATE_SURFACE_ACCESS_SAMPLER; + info.type = tex->image.type; if (templ->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT && - tex->image.separate_stencil) { + tex->separate_s8) { info.format = ilo_format_translate_texture(dev, PIPE_FORMAT_Z32_FLOAT); } else { info.format = ilo_format_translate_texture(dev, templ->format); } - info.is_cube_map = (tex->image.target == PIPE_TEXTURE_CUBE || - tex->image.target == PIPE_TEXTURE_CUBE_ARRAY); info.is_array = util_resource_is_array_texture(&tex->base); info.readonly = true; - info.level_base = templ->u.tex.first_level; - info.level_count = templ->u.tex.last_level - - templ->u.tex.first_level + 1; - info.slice_base = templ->u.tex.first_layer; - info.slice_count = templ->u.tex.last_layer - - templ->u.tex.first_layer + 1; - ilo_state_surface_init_for_image(&view->surface, dev, &info); - view->surface.bo = info.img->bo; } return &view->base; @@ -2111,18 +2105,27 @@ ilo_create_surface(struct pipe_context *pipe, assert(tex->base.target != PIPE_BUFFER); memset(&info, 0, sizeof(info)); + info.img = &tex->image; - info.access = ILO_STATE_SURFACE_ACCESS_DP_RENDER; - info.format = 
ilo_format_translate_render(dev, templ->format); - info.is_array = util_resource_is_array_texture(&tex->base); info.level_base = templ->u.tex.level; info.level_count = 1; info.slice_base = templ->u.tex.first_layer; info.slice_count = templ->u.tex.last_layer - templ->u.tex.first_layer + 1; + info.vma = &tex->vma; + if (ilo_image_can_enable_aux(&tex->image, templ->u.tex.level)) + info.aux_vma = &tex->aux_vma; + + info.access = ILO_STATE_SURFACE_ACCESS_DP_RENDER; + + info.type = (tex->image.type == GEN6_SURFTYPE_CUBE) ? + GEN6_SURFTYPE_2D : tex->image.type; + + info.format = ilo_format_translate_render(dev, templ->format); + info.is_array = util_resource_is_array_texture(&tex->base); + ilo_state_surface_init_for_image(&surf->u.rt, dev, &info); - surf->u.rt.bo = info.img->bo; } else { struct ilo_state_zs_info info; @@ -2131,13 +2134,19 @@ ilo_create_surface(struct pipe_context *pipe, memset(&info, 0, sizeof(info)); if (templ->format == PIPE_FORMAT_S8_UINT) { + info.s_vma = &tex->vma; info.s_img = &tex->image; } else { + info.z_vma = &tex->vma; info.z_img = &tex->image; - info.s_img = (tex->separate_s8) ? &tex->separate_s8->image : NULL; - info.hiz_enable = - ilo_image_can_enable_aux(&tex->image, templ->u.tex.level); + if (tex->separate_s8) { + info.s_vma = &tex->separate_s8->vma; + info.s_img = &tex->separate_s8->image; + } + + if (ilo_image_can_enable_aux(&tex->image, templ->u.tex.level)) + info.hiz_vma = &tex->aux_vma; } info.level = templ->u.tex.level; @@ -2145,16 +2154,15 @@ ilo_create_surface(struct pipe_context *pipe, info.slice_count = templ->u.tex.last_layer - templ->u.tex.first_layer + 1; - ilo_state_zs_init(&surf->u.zs, dev, &info); + info.type = (tex->image.type == GEN6_SURFTYPE_CUBE) ? + GEN6_SURFTYPE_2D : tex->image.type; - if (info.z_img) { - surf->u.zs.depth_bo = info.z_img->bo; - if (info.hiz_enable) - surf->u.zs.hiz_bo = info.z_img->aux.bo; - } + info.format = ilo_format_translate_depth(dev, tex->image_format); + if (ilo_dev_gen(dev) == ILO_GEN(6) && !info.hiz_vma && + tex->image_format == PIPE_FORMAT_Z24X8_UNORM) + info.format = GEN6_ZFORMAT_D24_UNORM_S8_UINT; - if (info.s_img) - surf->u.zs.stencil_bo = info.s_img->bo; + ilo_state_zs_init(&surf->u.zs, dev, &info); } return &surf->base; @@ -2339,7 +2347,7 @@ ilo_init_state_functions(struct ilo_context *ilo) ilo->base.set_scissor_states = ilo_set_scissor_states; ilo->base.set_viewport_states = ilo_set_viewport_states; ilo->base.set_sampler_views = ilo_set_sampler_views; - ilo->base.set_shader_resources = ilo_set_shader_resources; + ilo->base.set_shader_images = ilo_set_shader_images; ilo->base.set_vertex_buffers = ilo_set_vertex_buffers; ilo->base.set_index_buffer = ilo_set_index_buffer; @@ -2451,7 +2459,6 @@ void ilo_state_vector_resource_renamed(struct ilo_state_vector *vec, struct pipe_resource *res) { - struct intel_bo *bo = ilo_resource_get_bo(res); uint32_t states = 0; unsigned sh, i; @@ -2482,10 +2489,6 @@ ilo_state_vector_resource_renamed(struct ilo_state_vector *vec, for (i = 0; i < vec->so.count; i++) { if (vec->so.states[i]->buffer == res) { - struct ilo_stream_output_target *target = - (struct ilo_stream_output_target *) vec->so.states[i]; - - target->sb.bo = ilo_buffer(res)->bo; states |= ILO_DIRTY_SO; break; } @@ -2503,7 +2506,6 @@ ilo_state_vector_resource_renamed(struct ilo_state_vector *vec, [PIPE_SHADER_GEOMETRY] = ILO_DIRTY_VIEW_GS, [PIPE_SHADER_COMPUTE] = ILO_DIRTY_VIEW_CS, }; - cso->surface.bo = bo; states |= view_dirty_bits[sh]; break; @@ -2515,7 +2517,6 @@ ilo_state_vector_resource_renamed(struct 
ilo_state_vector *vec, struct ilo_cbuf_cso *cbuf = &vec->cbuf[sh].cso[i]; if (cbuf->resource == res) { - cbuf->surface.bo = bo; states |= ILO_DIRTY_CBUF; break; } @@ -2528,7 +2529,6 @@ ilo_state_vector_resource_renamed(struct ilo_state_vector *vec, (struct ilo_surface_cso *) vec->resource.states[i]; if (cso->base.texture == res) { - cso->u.rt.bo = bo; states |= ILO_DIRTY_RESOURCE; break; } @@ -2540,27 +2540,19 @@ ilo_state_vector_resource_renamed(struct ilo_state_vector *vec, struct ilo_surface_cso *cso = (struct ilo_surface_cso *) vec->fb.state.cbufs[i]; if (cso && cso->base.texture == res) { - cso->u.rt.bo = bo; states |= ILO_DIRTY_FB; break; } } - if (vec->fb.state.zsbuf && vec->fb.state.zsbuf->texture == res) { - struct ilo_surface_cso *cso = - (struct ilo_surface_cso *) vec->fb.state.zsbuf; - - cso->u.zs.depth_bo = bo; - + if (vec->fb.state.zsbuf && vec->fb.state.zsbuf->texture == res) states |= ILO_DIRTY_FB; - } } for (i = 0; i < vec->cs_resource.count; i++) { struct ilo_surface_cso *cso = (struct ilo_surface_cso *) vec->cs_resource.states[i]; if (cso->base.texture == res) { - cso->u.rt.bo = bo; states |= ILO_DIRTY_CS_RESOURCE; break; } diff --git a/src/gallium/drivers/ilo/ilo_state.h b/src/gallium/drivers/ilo/ilo_state.h index 3e6fd8a2554..66c93007eb1 100644 --- a/src/gallium/drivers/ilo/ilo_state.h +++ b/src/gallium/drivers/ilo/ilo_state.h @@ -202,7 +202,7 @@ struct ilo_cbuf_state { }; struct ilo_resource_state { - struct pipe_surface *states[PIPE_MAX_SHADER_RESOURCES]; + struct pipe_surface *states[PIPE_MAX_SHADER_IMAGES]; unsigned count; }; diff --git a/src/gallium/drivers/ilo/ilo_transfer.c b/src/gallium/drivers/ilo/ilo_transfer.c index ec41473f94a..5abd3bebf68 100644 --- a/src/gallium/drivers/ilo/ilo_transfer.c +++ b/src/gallium/drivers/ilo/ilo_transfer.c @@ -100,7 +100,7 @@ resource_get_transfer_method(struct pipe_resource *res, m = ILO_TRANSFER_MAP_SW_ZS; need_convert = true; } - } else if (tex->image.format != tex->base.format) { + } else if (tex->image_format != tex->base.format) { m = ILO_TRANSFER_MAP_SW_CONVERT; need_convert = true; } @@ -268,23 +268,27 @@ xfer_alloc_staging_sys(struct ilo_transfer *xfer) static void * xfer_map(struct ilo_transfer *xfer) { + const struct ilo_vma *vma; void *ptr; switch (xfer->method) { case ILO_TRANSFER_MAP_CPU: - ptr = intel_bo_map(ilo_resource_get_bo(xfer->base.resource), - xfer->base.usage & PIPE_TRANSFER_WRITE); + vma = ilo_resource_get_vma(xfer->base.resource); + ptr = intel_bo_map(vma->bo, xfer->base.usage & PIPE_TRANSFER_WRITE); break; case ILO_TRANSFER_MAP_GTT: - ptr = intel_bo_map_gtt(ilo_resource_get_bo(xfer->base.resource)); + vma = ilo_resource_get_vma(xfer->base.resource); + ptr = intel_bo_map_gtt(vma->bo); break; case ILO_TRANSFER_MAP_GTT_ASYNC: - ptr = intel_bo_map_gtt_async(ilo_resource_get_bo(xfer->base.resource)); + vma = ilo_resource_get_vma(xfer->base.resource); + ptr = intel_bo_map_gtt_async(vma->bo); break; case ILO_TRANSFER_MAP_STAGING: { const struct ilo_screen *is = ilo_screen(xfer->staging.res->screen); - struct intel_bo *bo = ilo_resource_get_bo(xfer->staging.res); + + vma = ilo_resource_get_vma(xfer->staging.res); /* * We want a writable, optionally persistent and coherent, mapping @@ -292,25 +296,29 @@ xfer_map(struct ilo_transfer *xfer) * this turns out to be fairly simple. 
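The transfer hunks here all follow one pattern: intel_bo_map*() returns a bo-relative pointer, so each successful mapping is biased by the VMA's byte offset before use. In miniature:

```c
static void *
map_vma(const struct ilo_vma *vma, bool for_write)
{
   void *ptr = intel_bo_map(vma->bo, for_write);

   /* step past other suballocations sharing the bo */
   if (ptr)
      ptr = (void *) ((char *) ptr + vma->bo_offset);

   return ptr;
}
```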
*/ if (is->dev.has_llc) - ptr = intel_bo_map(bo, true); + ptr = intel_bo_map(vma->bo, true); else - ptr = intel_bo_map_gtt(bo); + ptr = intel_bo_map_gtt(vma->bo); if (ptr && xfer->staging.res->target == PIPE_BUFFER) ptr += (xfer->base.box.x % ILO_TRANSFER_MAP_BUFFER_ALIGNMENT); - } break; case ILO_TRANSFER_MAP_SW_CONVERT: case ILO_TRANSFER_MAP_SW_ZS: + vma = NULL; ptr = xfer->staging.sys; break; default: assert(!"unknown mapping method"); + vma = NULL; ptr = NULL; break; } + if (ptr && vma) + ptr = (void *) ((char *) ptr + vma->bo_offset); + return ptr; } @@ -324,10 +332,10 @@ xfer_unmap(struct ilo_transfer *xfer) case ILO_TRANSFER_MAP_CPU: case ILO_TRANSFER_MAP_GTT: case ILO_TRANSFER_MAP_GTT_ASYNC: - intel_bo_unmap(ilo_resource_get_bo(xfer->base.resource)); + intel_bo_unmap(ilo_resource_get_vma(xfer->base.resource)->bo); break; case ILO_TRANSFER_MAP_STAGING: - intel_bo_unmap(ilo_resource_get_bo(xfer->staging.res)); + intel_bo_unmap(ilo_resource_get_vma(xfer->staging.res)->bo); break; default: break; @@ -541,9 +549,12 @@ tex_staging_sys_map_bo(struct ilo_texture *tex, if (prefer_cpu && (tex->image.tiling == GEN6_TILING_NONE || !linear_view)) - ptr = intel_bo_map(tex->image.bo, !for_read_back); + ptr = intel_bo_map(tex->vma.bo, !for_read_back); else - ptr = intel_bo_map_gtt(tex->image.bo); + ptr = intel_bo_map_gtt(tex->vma.bo); + + if (ptr) + ptr = (void *) ((char *) ptr + tex->vma.bo_offset); return ptr; } @@ -551,7 +562,7 @@ tex_staging_sys_map_bo(struct ilo_texture *tex, static void tex_staging_sys_unmap_bo(struct ilo_texture *tex) { - intel_bo_unmap(tex->image.bo); + intel_bo_unmap(tex->vma.bo); } static bool @@ -590,7 +601,7 @@ tex_staging_sys_zs_read(struct ilo_texture *tex, s8_tile_offset = tex_tile_choose_offset_func(s8_tex, &s8_tiles_per_row); if (tex->base.format == PIPE_FORMAT_Z24_UNORM_S8_UINT) { - assert(tex->image.format == PIPE_FORMAT_Z24X8_UNORM); + assert(tex->image_format == PIPE_FORMAT_Z24X8_UNORM); dst_cpp = 4; dst_s8_pos = 3; @@ -598,7 +609,7 @@ tex_staging_sys_zs_read(struct ilo_texture *tex, } else { assert(tex->base.format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT); - assert(tex->image.format == PIPE_FORMAT_Z32_FLOAT); + assert(tex->image_format == PIPE_FORMAT_Z32_FLOAT); dst_cpp = 8; dst_s8_pos = 4; @@ -644,7 +655,7 @@ tex_staging_sys_zs_read(struct ilo_texture *tex, tex_staging_sys_unmap_bo(s8_tex); } else { - assert(tex->image.format == PIPE_FORMAT_S8_UINT); + assert(tex->image_format == PIPE_FORMAT_S8_UINT); for (slice = 0; slice < box->depth; slice++) { unsigned mem_x, mem_y; @@ -717,7 +728,7 @@ tex_staging_sys_zs_write(struct ilo_texture *tex, s8_tile_offset = tex_tile_choose_offset_func(s8_tex, &s8_tiles_per_row); if (tex->base.format == PIPE_FORMAT_Z24_UNORM_S8_UINT) { - assert(tex->image.format == PIPE_FORMAT_Z24X8_UNORM); + assert(tex->image_format == PIPE_FORMAT_Z24X8_UNORM); src_cpp = 4; src_s8_pos = 3; @@ -725,7 +736,7 @@ tex_staging_sys_zs_write(struct ilo_texture *tex, } else { assert(tex->base.format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT); - assert(tex->image.format == PIPE_FORMAT_Z32_FLOAT); + assert(tex->image_format == PIPE_FORMAT_Z32_FLOAT); src_cpp = 8; src_s8_pos = 4; @@ -771,7 +782,7 @@ tex_staging_sys_zs_write(struct ilo_texture *tex, tex_staging_sys_unmap_bo(s8_tex); } else { - assert(tex->image.format == PIPE_FORMAT_S8_UINT); + assert(tex->image_format == PIPE_FORMAT_S8_UINT); for (slice = 0; slice < box->depth; slice++) { unsigned mem_x, mem_y; @@ -829,8 +840,8 @@ tex_staging_sys_convert_write(struct ilo_texture *tex, else dst_slice_stride = 0; - 
if (unlikely(tex->image.format == tex->base.format)) { - util_copy_box(dst, tex->image.format, tex->image.bo_stride, + if (unlikely(tex->image_format == tex->base.format)) { + util_copy_box(dst, tex->image_format, tex->image.bo_stride, dst_slice_stride, 0, 0, 0, box->width, box->height, box->depth, xfer->staging.sys, xfer->base.stride, xfer->base.layer_stride, 0, 0, 0); @@ -842,7 +853,7 @@ tex_staging_sys_convert_write(struct ilo_texture *tex, switch (tex->base.format) { case PIPE_FORMAT_ETC1_RGB8: - assert(tex->image.format == PIPE_FORMAT_R8G8B8X8_UNORM); + assert(tex->image_format == PIPE_FORMAT_R8G8B8X8_UNORM); for (slice = 0; slice < box->depth; slice++) { const void *src = @@ -1055,7 +1066,7 @@ choose_transfer_method(struct ilo_context *ilo, struct ilo_transfer *xfer) return false; /* see if we can avoid blocking */ - if (is_bo_busy(ilo, ilo_resource_get_bo(res), &need_submit)) { + if (is_bo_busy(ilo, ilo_resource_get_vma(res)->bo, &need_submit)) { bool resource_renamed; if (!xfer_unblock(xfer, &resource_renamed)) { @@ -1078,11 +1089,11 @@ static void buf_pwrite(struct ilo_context *ilo, struct pipe_resource *res, unsigned usage, int offset, int size, const void *data) { - struct ilo_buffer *buf = ilo_buffer(res); + struct ilo_buffer_resource *buf = ilo_buffer_resource(res); bool need_submit; /* see if we can avoid blocking */ - if (is_bo_busy(ilo, buf->bo, &need_submit)) { + if (is_bo_busy(ilo, buf->vma.bo, &need_submit)) { bool unblocked = false; if ((usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) && @@ -1103,9 +1114,12 @@ buf_pwrite(struct ilo_context *ilo, struct pipe_resource *res, templ.bind = PIPE_BIND_TRANSFER_WRITE; staging = ilo->base.screen->resource_create(ilo->base.screen, &templ); if (staging) { + const struct ilo_vma *staging_vma = ilo_resource_get_vma(staging); struct pipe_box staging_box; - intel_bo_pwrite(ilo_buffer(staging)->bo, 0, size, data); + /* offset by staging_vma->bo_offset for pwrite */ + intel_bo_pwrite(staging_vma->bo, staging_vma->bo_offset, + size, data); u_box_1d(0, size, &staging_box); ilo_blitter_blt_copy_resource(ilo->blitter, @@ -1123,7 +1137,8 @@ buf_pwrite(struct ilo_context *ilo, struct pipe_resource *res, ilo_cp_submit(ilo->cp, "syncing for pwrites"); } - intel_bo_pwrite(buf->bo, offset, size, data); + /* offset by buf->vma.bo_offset for pwrite */ + intel_bo_pwrite(buf->vma.bo, buf->vma.bo_offset + offset, size, data); } static void diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend.c b/src/gallium/drivers/llvmpipe/lp_bld_blend.c index 1de43f77ee0..1feb415c9e5 100644 --- a/src/gallium/drivers/llvmpipe/lp_bld_blend.c +++ b/src/gallium/drivers/llvmpipe/lp_bld_blend.c @@ -78,7 +78,7 @@ lp_build_blend_func_reverse(unsigned rgb_func, unsigned alpha_func) /** * Whether the blending factors are complementary of each other. 
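The trick below works because gallium numbers each inverted blend factor as its plain counterpart plus 0x10 (PIPE_BLENDFACTOR_SRC_ALPHA is 0x03, PIPE_BLENDFACTOR_INV_SRC_ALPHA is 0x13), so complementary pairs differ exactly in bit 4:

```c
static void
check_complementary_factors(void)
{
   /* classic alpha blending: SRC_ALPHA vs INV_SRC_ALPHA */
   assert(lp_build_blend_factor_complementary(PIPE_BLENDFACTOR_SRC_ALPHA,
                                              PIPE_BLENDFACTOR_INV_SRC_ALPHA));

   /* DST_ALPHA (0x04) is not SRC_ALPHA's complement */
   assert(!lp_build_blend_factor_complementary(PIPE_BLENDFACTOR_SRC_ALPHA,
                                               PIPE_BLENDFACTOR_DST_ALPHA));
}
```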
*/ -static INLINE boolean +static inline boolean lp_build_blend_factor_complementary(unsigned src_factor, unsigned dst_factor) { return dst_factor == (src_factor ^ 0x10); diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h index 0d47c0d517c..c273b25f096 100644 --- a/src/gallium/drivers/llvmpipe/lp_context.h +++ b/src/gallium/drivers/llvmpipe/lp_context.h @@ -169,7 +169,7 @@ llvmpipe_user_buffer_create(struct pipe_screen *screen, unsigned bind_flags); -static INLINE struct llvmpipe_context * +static inline struct llvmpipe_context * llvmpipe_context( struct pipe_context *pipe ) { return (struct llvmpipe_context *)pipe; diff --git a/src/gallium/drivers/llvmpipe/lp_debug.h b/src/gallium/drivers/llvmpipe/lp_debug.h index e0f7d8e1bc3..1038c5fe151 100644 --- a/src/gallium/drivers/llvmpipe/lp_debug.h +++ b/src/gallium/drivers/llvmpipe/lp_debug.h @@ -71,7 +71,7 @@ extern int LP_DEBUG; void st_debug_init( void ); -static INLINE void +static inline void LP_DBG( unsigned flag, const char *fmt, ... ) { if (LP_DEBUG & flag) diff --git a/src/gallium/drivers/llvmpipe/lp_fence.h b/src/gallium/drivers/llvmpipe/lp_fence.h index 3c591187801..d7f0c153ec8 100644 --- a/src/gallium/drivers/llvmpipe/lp_fence.h +++ b/src/gallium/drivers/llvmpipe/lp_fence.h @@ -72,7 +72,7 @@ llvmpipe_init_screen_fence_funcs(struct pipe_screen *screen); void lp_fence_destroy(struct lp_fence *fence); -static INLINE void +static inline void lp_fence_reference(struct lp_fence **ptr, struct lp_fence *f) { @@ -85,7 +85,7 @@ lp_fence_reference(struct lp_fence **ptr, *ptr = f; } -static INLINE boolean +static inline boolean lp_fence_issued(const struct lp_fence *fence) { return fence->issued; diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h index c209f47f0f5..c19f9318006 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast.h +++ b/src/gallium/drivers/llvmpipe/lp_rast.h @@ -184,7 +184,7 @@ union lp_rast_cmd_arg { /* Cast wrappers. Hopefully these compile to noops! */ -static INLINE union lp_rast_cmd_arg +static inline union lp_rast_cmd_arg lp_rast_arg_inputs( const struct lp_rast_shader_inputs *shade_tile ) { union lp_rast_cmd_arg arg; @@ -192,7 +192,7 @@ lp_rast_arg_inputs( const struct lp_rast_shader_inputs *shade_tile ) return arg; } -static INLINE union lp_rast_cmd_arg +static inline union lp_rast_cmd_arg lp_rast_arg_triangle( const struct lp_rast_triangle *triangle, unsigned plane_mask) { @@ -208,7 +208,7 @@ lp_rast_arg_triangle( const struct lp_rast_triangle *triangle, * All planes are enabled, so instead of the plane mask we pass the upper * left coordinates of the a block that fully encloses the triangle. 
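The wrappers' purpose in miniature: call sites hand over typed pointers, the binner stores a single by-value union, and with inlining the conversion compiles away. Member names in this sketch are illustrative, not lp_rast's actual ones:

```c
#include <stdint.h>

union cmd_arg {
   const void *ptr;
   struct { uint64_t value; uint64_t mask; } clear_zs;
};

static inline union cmd_arg
cmd_arg_ptr(const void *p)
{
   union cmd_arg arg;
   arg.ptr = p;
   return arg;   /* ideally a no-op copy after inlining */
}
```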
*/ -static INLINE union lp_rast_cmd_arg +static inline union lp_rast_cmd_arg lp_rast_arg_triangle_contained( const struct lp_rast_triangle *triangle, unsigned x, unsigned y) { @@ -218,7 +218,7 @@ lp_rast_arg_triangle_contained( const struct lp_rast_triangle *triangle, return arg; } -static INLINE union lp_rast_cmd_arg +static inline union lp_rast_cmd_arg lp_rast_arg_state( const struct lp_rast_state *state ) { union lp_rast_cmd_arg arg; @@ -226,7 +226,7 @@ lp_rast_arg_state( const struct lp_rast_state *state ) return arg; } -static INLINE union lp_rast_cmd_arg +static inline union lp_rast_cmd_arg lp_rast_arg_fence( struct lp_fence *fence ) { union lp_rast_cmd_arg arg; @@ -235,7 +235,7 @@ lp_rast_arg_fence( struct lp_fence *fence ) } -static INLINE union lp_rast_cmd_arg +static inline union lp_rast_cmd_arg lp_rast_arg_clearzs( uint64_t value, uint64_t mask ) { union lp_rast_cmd_arg arg; @@ -245,7 +245,7 @@ lp_rast_arg_clearzs( uint64_t value, uint64_t mask ) } -static INLINE union lp_rast_cmd_arg +static inline union lp_rast_cmd_arg lp_rast_arg_query( struct llvmpipe_query *pq ) { union lp_rast_cmd_arg arg; @@ -253,7 +253,7 @@ lp_rast_arg_query( struct llvmpipe_query *pq ) return arg; } -static INLINE union lp_rast_cmd_arg +static inline union lp_rast_cmd_arg lp_rast_arg_null( void ) { union lp_rast_cmd_arg arg; @@ -312,7 +312,7 @@ lp_debug_draw_bins_by_coverage( struct lp_scene *scene ); #include <emmintrin.h> #include "util/u_sse.h" -static INLINE __m128i +static inline __m128i lp_plane_to_m128i(const struct lp_rast_plane *plane) { return _mm_setr_epi32((int32_t)plane->c, (int32_t)plane->dcdx, diff --git a/src/gallium/drivers/llvmpipe/lp_rast_priv.h b/src/gallium/drivers/llvmpipe/lp_rast_priv.h index e6ebbcd526d..9aa7e874657 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast_priv.h +++ b/src/gallium/drivers/llvmpipe/lp_rast_priv.h @@ -145,7 +145,7 @@ lp_rast_shade_quads_mask(struct lp_rasterizer_task *task, * Get the pointer to a 4x4 color block (within a 64x64 tile). * \param x, y location of 4x4 block in window coords */ -static INLINE uint8_t * +static inline uint8_t * lp_rast_get_color_block_pointer(struct lp_rasterizer_task *task, unsigned buf, unsigned x, unsigned y, unsigned layer) @@ -186,7 +186,7 @@ lp_rast_get_color_block_pointer(struct lp_rasterizer_task *task, * Get the pointer to a 4x4 depth block (within a 64x64 tile). * \param x, y location of 4x4 block in window coords */ -static INLINE uint8_t * +static inline uint8_t * lp_rast_get_depth_block_pointer(struct lp_rasterizer_task *task, unsigned x, unsigned y, unsigned layer) { @@ -222,7 +222,7 @@ lp_rast_get_depth_block_pointer(struct lp_rasterizer_task *task, * triangle in/out tests. 
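 *
 * (Most of the llvmpipe churn in this patch is the mechanical
 * INLINE -> inline rename: INLINE was a portability macro from when MSVC
 * only accepted __inline, and with C99 assumed tree-wide the keyword is
 * used directly. Roughly -- the old definition is quoted from memory and
 * may differ in detail:)
 */

#if defined(_MSC_VER)
#  define INLINE __inline /* pre-C99 MSVC spelling */
#else
#  define INLINE inline
#endif

static INLINE int add_one_old(int x) { return x + 1; }
static inline int add_one_new(int x) { return x + 1; }

/*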
* \param x, y location of 4x4 block in window coords */ -static INLINE void +static inline void lp_rast_shade_quads_all( struct lp_rasterizer_task *task, const struct lp_rast_shader_inputs *inputs, unsigned x, unsigned y ) diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c index 41f6fbfa059..c9b9221d87c 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c @@ -63,7 +63,7 @@ block_full_16(struct lp_rasterizer_task *task, block_full_4(task, tri, x + ix, y + iy); } -static INLINE unsigned +static inline unsigned build_mask_linear(int64_t c, int64_t dcdx, int64_t dcdy) { unsigned mask = 0; @@ -94,7 +94,7 @@ build_mask_linear(int64_t c, int64_t dcdx, int64_t dcdy) } -static INLINE void +static inline void build_masks(int64_t c, int64_t cdiff, int64_t dcdx, @@ -167,7 +167,7 @@ lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task, #include "util/u_sse.h" -static INLINE void +static inline void build_masks_32(int c, int cdiff, int dcdx, @@ -213,7 +213,7 @@ build_masks_32(int c, } -static INLINE unsigned +static inline unsigned build_mask_linear_32(int c, int dcdx, int dcdy) { __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3); @@ -239,7 +239,7 @@ build_mask_linear_32(int c, int dcdx, int dcdy) return _mm_movemask_epi8(result); } -static INLINE unsigned +static inline unsigned sign_bits4(const __m128i *cstep, int cdiff) { diff --git a/src/gallium/drivers/llvmpipe/lp_scene.h b/src/gallium/drivers/llvmpipe/lp_scene.h index a226ff0c485..b1464bb54c4 100644 --- a/src/gallium/drivers/llvmpipe/lp_scene.h +++ b/src/gallium/drivers/llvmpipe/lp_scene.h @@ -207,7 +207,7 @@ boolean lp_scene_is_resource_referenced(const struct lp_scene *scene, * Allocate space for a command/data in the bin's data buffer. * Grow the block list if needed. */ -static INLINE void * +static inline void * lp_scene_alloc( struct lp_scene *scene, unsigned size) { struct data_block_list *list = &scene->data; @@ -240,7 +240,7 @@ lp_scene_alloc( struct lp_scene *scene, unsigned size) /** * As above, but with specific alignment. */ -static INLINE void * +static inline void * lp_scene_alloc_aligned( struct lp_scene *scene, unsigned size, unsigned alignment ) { @@ -272,7 +272,7 @@ lp_scene_alloc_aligned( struct lp_scene *scene, unsigned size, /* Put back data if we decide not to use it, eg. culled triangles. */ -static INLINE void +static inline void lp_scene_putback_data( struct lp_scene *scene, unsigned size) { struct data_block_list *list = &scene->data; @@ -282,7 +282,7 @@ lp_scene_putback_data( struct lp_scene *scene, unsigned size) /** Return pointer to a particular tile's bin. */ -static INLINE struct cmd_bin * +static inline struct cmd_bin * lp_scene_get_bin(struct lp_scene *scene, unsigned x, unsigned y) { return &scene->tile[x][y]; @@ -296,7 +296,7 @@ lp_scene_bin_reset(struct lp_scene *scene, unsigned x, unsigned y); /* Add a command to bin[x][y]. */ -static INLINE boolean +static inline boolean lp_scene_bin_command( struct lp_scene *scene, unsigned x, unsigned y, unsigned cmd, @@ -328,7 +328,7 @@ lp_scene_bin_command( struct lp_scene *scene, } -static INLINE boolean +static inline boolean lp_scene_bin_cmd_with_state( struct lp_scene *scene, unsigned x, unsigned y, const struct lp_rast_state *state, @@ -354,7 +354,7 @@ lp_scene_bin_cmd_with_state( struct lp_scene *scene, /* Add a command to all active bins. 
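 *
 * (From the lp_rast_tri.c hunks above: build_mask_linear() steps a linear
 * edge function c + x*dcdx + y*dcdy across a 4x4 block to build a 16-bit
 * coverage mask. A plain scalar rendition -- the real code keys off the
 * sign bit and has 32-bit and SSE variants, so the sign convention and
 * bit order here are illustrative:)
 */

static unsigned
coverage_mask_4x4_sketch(int64_t c, int64_t dcdx, int64_t dcdy)
{
   unsigned mask = 0;
   for (int y = 0; y < 4; y++) {
      int64_t cx = c + y * dcdy;
      for (int x = 0; x < 4; x++, cx += dcdx) {
         if (cx > 0) /* "inside" the edge */
            mask |= 1u << (y * 4 + x);
      }
   }
   return mask;
}

/*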
*/ -static INLINE boolean +static inline boolean lp_scene_bin_everywhere( struct lp_scene *scene, unsigned cmd, const union lp_rast_cmd_arg arg ) @@ -371,7 +371,7 @@ lp_scene_bin_everywhere( struct lp_scene *scene, } -static INLINE unsigned +static inline unsigned lp_scene_get_num_bins( const struct lp_scene *scene ) { return scene->tiles_x * scene->tiles_y; diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c index 47f1897c732..14eeab03387 100644 --- a/src/gallium/drivers/llvmpipe/lp_screen.c +++ b/src/gallium/drivers/llvmpipe/lp_screen.c @@ -288,10 +288,14 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_VERTEXID_NOBASE: return 0; case PIPE_CAP_POLYGON_OFFSET_CLAMP: + case PIPE_CAP_TEXTURE_FLOAT_LINEAR: + case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: return 1; case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: + case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: + case PIPE_CAP_DEPTH_BOUNDS_TEST: return 0; } /* should only get here on unhandled cases */ @@ -529,18 +533,6 @@ llvmpipe_fence_reference(struct pipe_screen *screen, /** - * Has the fence been executed/finished? - */ -static boolean -llvmpipe_fence_signalled(struct pipe_screen *screen, - struct pipe_fence_handle *fence) -{ - struct lp_fence *f = (struct lp_fence *) fence; - return lp_fence_signalled(f); -} - - -/** * Wait for the fence to finish. */ static boolean @@ -550,6 +542,9 @@ llvmpipe_fence_finish(struct pipe_screen *screen, { struct lp_fence *f = (struct lp_fence *) fence_handle; + if (!timeout) + return lp_fence_signalled(f); + lp_fence_wait(f); return TRUE; } @@ -601,7 +596,6 @@ llvmpipe_create_screen(struct sw_winsys *winsys) screen->base.context_create = llvmpipe_create_context; screen->base.flush_frontbuffer = llvmpipe_flush_frontbuffer; screen->base.fence_reference = llvmpipe_fence_reference; - screen->base.fence_signalled = llvmpipe_fence_signalled; screen->base.fence_finish = llvmpipe_fence_finish; screen->base.get_timestamp = llvmpipe_get_timestamp; diff --git a/src/gallium/drivers/llvmpipe/lp_screen.h b/src/gallium/drivers/llvmpipe/lp_screen.h index 8b8ea1afac9..00bf20c8c5f 100644 --- a/src/gallium/drivers/llvmpipe/lp_screen.h +++ b/src/gallium/drivers/llvmpipe/lp_screen.h @@ -62,7 +62,7 @@ struct llvmpipe_screen -static INLINE struct llvmpipe_screen * +static inline struct llvmpipe_screen * llvmpipe_screen( struct pipe_screen *pipe ) { return (struct llvmpipe_screen *)pipe; diff --git a/src/gallium/drivers/llvmpipe/lp_setup.h b/src/gallium/drivers/llvmpipe/lp_setup.h index c944ad26756..a42df2dc9e0 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup.h +++ b/src/gallium/drivers/llvmpipe/lp_setup.h @@ -159,7 +159,7 @@ void lp_setup_end_query(struct lp_setup_context *setup, struct llvmpipe_query *pq); -static INLINE unsigned +static inline unsigned lp_clamp_viewport_idx(int idx) { return (PIPE_MAX_VIEWPORTS > idx && idx >= 0) ? 
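/* (On the lp_screen.c hunk above: the separate fence_signalled() hook is
 * dropped; a non-blocking query is now spelled as fence_finish() with a
 * zero timeout, e.g.
 *
 *    done = screen->fence_finish(screen, fence, 0);               poll
 *    screen->fence_finish(screen, fence, PIPE_TIMEOUT_INFINITE);  wait
 *
 * With timeout == 0 the new code returns lp_fence_signalled() right away;
 * any other value still falls through to lp_fence_wait(). The infinite-
 * timeout macro name is an assumption from gallium's p_defines.h.) */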
idx : 0; diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c b/src/gallium/drivers/llvmpipe/lp_setup_line.c index 6c05b90e64a..a190254d9df 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_line.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c @@ -233,7 +233,7 @@ static void setup_line_coefficients( struct lp_setup_context *setup, -static INLINE int subpixel_snap( float a ) +static inline int subpixel_snap( float a ) { return util_iround(FIXED_ONE * a); } @@ -262,14 +262,14 @@ print_line(struct lp_setup_context *setup, } -static INLINE boolean sign(float x){ +static inline boolean sign(float x){ return x >= 0; } /* Used on positive floats only: */ -static INLINE float fracf(float f) +static inline float fracf(float f) { return f - floorf(f); } diff --git a/src/gallium/drivers/llvmpipe/lp_setup_point.c b/src/gallium/drivers/llvmpipe/lp_setup_point.c index f065676a7fb..75544b52493 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_point.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_point.c @@ -296,7 +296,7 @@ setup_point_coefficients( struct lp_setup_context *setup, } -static INLINE int +static inline int subpixel_snap(float a) { return util_iround(FIXED_ONE * a); diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c index a2f55ed3a1e..98a9d4bc28b 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c @@ -48,13 +48,13 @@ #include <emmintrin.h> #endif -static INLINE int +static inline int subpixel_snap(float a) { return util_iround(FIXED_ONE * a); } -static INLINE float +static inline float fixed_to_float(int a) { return a * (1.0f / FIXED_ONE); @@ -579,7 +579,7 @@ do_triangle_ccw(struct lp_setup_context *setup, * * Undefined if no bit set exists, so code should check against 0 first. 
*/ -static INLINE uint32_t +static inline uint32_t floor_pot(uint32_t n) { #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86) @@ -841,7 +841,7 @@ static void retry_triangle_ccw( struct lp_setup_context *setup, /** * Calculate fixed position data for a triangle */ -static INLINE void +static inline void calc_fixed_position( struct lp_setup_context *setup, struct fixed_position* position, const float (*v0)[4], @@ -873,7 +873,7 @@ calc_fixed_position( struct lp_setup_context *setup, * Rotate a triangle, flipping its clockwise direction, * Swaps values for xy[0] and xy[1] */ -static INLINE void +static inline void rotate_fixed_position_01( struct fixed_position* position ) { int x, y; @@ -898,7 +898,7 @@ rotate_fixed_position_01( struct fixed_position* position ) * Rotate a triangle, flipping its clockwise direction, * Swaps values for xy[1] and xy[2] */ -static INLINE void +static inline void rotate_fixed_position_12( struct fixed_position* position ) { int x, y; diff --git a/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c b/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c index 89992007849..534c5f48a64 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c @@ -122,7 +122,7 @@ lp_setup_set_primitive(struct vbuf_render *vbr, unsigned prim) typedef const float (*const_float4_ptr)[4]; -static INLINE const_float4_ptr get_vert( const void *vertex_buffer, +static inline const_float4_ptr get_vert( const void *vertex_buffer, int index, int stride ) { diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c index b5ce8683f1a..fd6c49aacd8 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_fs.c +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c @@ -840,7 +840,7 @@ store_unswizzled_block(struct gallivm_state *gallivm, * * A format which has irregular channel sizes such as R3_G3_B2 or R5_G6_B5. */ -static INLINE boolean +static inline boolean is_arithmetic_format(const struct util_format_description *format_desc) { boolean arith = false; @@ -860,7 +860,7 @@ is_arithmetic_format(const struct util_format_description *format_desc) * to floats for blending, and furthermore has "natural" packed AoS -> unpacked * SoA conversion. */ -static INLINE boolean +static inline boolean format_expands_to_float_soa(const struct util_format_description *format_desc) { if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT || @@ -876,7 +876,7 @@ format_expands_to_float_soa(const struct util_format_description *format_desc) * * e.g. RGBA16F = 4x half-float and R3G3B2 = 1x byte */ -static INLINE void +static inline void lp_mem_type_from_format_desc(const struct util_format_description *format_desc, struct lp_type* type) { @@ -924,7 +924,7 @@ lp_mem_type_from_format_desc(const struct util_format_description *format_desc, * * e.g. RGBA16F = 4x float, R3G3B2 = 3x byte */ -static INLINE void +static inline void lp_blend_type_from_format_desc(const struct util_format_description *format_desc, struct lp_type* type) { @@ -996,7 +996,7 @@ lp_blend_type_from_format_desc(const struct util_format_description *format_desc * * but we try to avoid division and multiplication through shifts. 
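 *
 * (For the common widening case the shift trick is plain bit replication;
 * a scalar rendition of what scale_bits() emits as LLVM IR, e.g. for a
 * 5-bit channel of B5G6R5 going to 8 bits:)
 */

static inline uint8_t
scale_5_to_8(uint8_t x)
{
   /* replicate the top bits into the new low bits: 0x1f -> 0xff, 0 -> 0;
    * this matches round(x * 255.0 / 31.0) for every 5-bit input */
   return (uint8_t)((x << 3) | (x >> 2));
}

/*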
*/ -static INLINE LLVMValueRef +static inline LLVMValueRef scale_bits(struct gallivm_state *gallivm, int src_bits, int dst_bits, @@ -1108,7 +1108,7 @@ scale_bits(struct gallivm_state *gallivm, /** * If RT is a smallfloat (needing denorms) format */ -static INLINE int +static inline int have_smallfloat_format(struct lp_type dst_type, enum pipe_format format) { @@ -2880,7 +2880,7 @@ llvmpipe_set_constant_buffer(struct pipe_context *pipe, /** * Return the blend factor equivalent to a destination alpha of one. */ -static INLINE unsigned +static inline unsigned force_dst_alpha_one(unsigned factor, boolean clamped_zero) { switch(factor) { diff --git a/src/gallium/drivers/llvmpipe/lp_test.h b/src/gallium/drivers/llvmpipe/lp_test.h index 4b6c8a7a6a5..e1b51c9c9a6 100644 --- a/src/gallium/drivers/llvmpipe/lp_test.h +++ b/src/gallium/drivers/llvmpipe/lp_test.h @@ -77,7 +77,7 @@ unsigned __int64 __rdtsc(); #elif defined(PIPE_CC_GCC) && (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)) -static INLINE uint64_t +static inline uint64_t rdtsc(void) { uint32_t hi, lo; diff --git a/src/gallium/drivers/llvmpipe/lp_texture.h b/src/gallium/drivers/llvmpipe/lp_texture.h index 9fbd3a21648..3d315bb9a73 100644 --- a/src/gallium/drivers/llvmpipe/lp_texture.h +++ b/src/gallium/drivers/llvmpipe/lp_texture.h @@ -106,21 +106,21 @@ struct llvmpipe_transfer /** cast wrappers */ -static INLINE struct llvmpipe_resource * +static inline struct llvmpipe_resource * llvmpipe_resource(struct pipe_resource *pt) { return (struct llvmpipe_resource *) pt; } -static INLINE const struct llvmpipe_resource * +static inline const struct llvmpipe_resource * llvmpipe_resource_const(const struct pipe_resource *pt) { return (const struct llvmpipe_resource *) pt; } -static INLINE struct llvmpipe_transfer * +static inline struct llvmpipe_transfer * llvmpipe_transfer(struct pipe_transfer *pt) { return (struct llvmpipe_transfer *) pt; @@ -131,7 +131,7 @@ void llvmpipe_init_screen_resource_funcs(struct pipe_screen *screen); void llvmpipe_init_context_resource_funcs(struct pipe_context *pipe); -static INLINE boolean +static inline boolean llvmpipe_resource_is_texture(const struct pipe_resource *resource) { switch (resource->target) { @@ -153,7 +153,7 @@ llvmpipe_resource_is_texture(const struct pipe_resource *resource) } -static INLINE boolean +static inline boolean llvmpipe_resource_is_1d(const struct pipe_resource *resource) { switch (resource->target) { @@ -175,7 +175,7 @@ llvmpipe_resource_is_1d(const struct pipe_resource *resource) } -static INLINE unsigned +static inline unsigned llvmpipe_layer_stride(struct pipe_resource *resource, unsigned level) { @@ -185,7 +185,7 @@ llvmpipe_layer_stride(struct pipe_resource *resource, } -static INLINE unsigned +static inline unsigned llvmpipe_resource_stride(struct pipe_resource *resource, unsigned level) { diff --git a/src/gallium/drivers/nouveau/Makefile.am b/src/gallium/drivers/nouveau/Makefile.am index d05f0a17ab4..c52d62e54a2 100644 --- a/src/gallium/drivers/nouveau/Makefile.am +++ b/src/gallium/drivers/nouveau/Makefile.am @@ -20,8 +20,6 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. 
-AUTOMAKE_OPTIONS = subdir-objects - include Makefile.sources include $(top_srcdir)/src/gallium/Automake.inc diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp index ca3c806e92f..cce60550ae5 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp @@ -1153,8 +1153,8 @@ nv50_ir_generate_code(struct nv50_ir_prog_info *info) switch (info->type) { PROG_TYPE_CASE(VERTEX, VERTEX); -// PROG_TYPE_CASE(HULL, TESSELLATION_CONTROL); -// PROG_TYPE_CASE(DOMAIN, TESSELLATION_EVAL); + PROG_TYPE_CASE(TESS_CTRL, TESSELLATION_CONTROL); + PROG_TYPE_CASE(TESS_EVAL, TESSELLATION_EVAL); PROG_TYPE_CASE(GEOMETRY, GEOMETRY); PROG_TYPE_CASE(FRAGMENT, FRAGMENT); PROG_TYPE_CASE(COMPUTE, COMPUTE); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h b/src/gallium/drivers/nouveau/codegen/nv50_ir.h index 529dcb9bdc2..3ddaeafebbd 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h @@ -106,6 +106,7 @@ enum operation OP_MEMBAR, // memory barrier (mfence, lfence, sfence) OP_VFETCH, // indirection 0 in attribute space, indirection 1 is vertex base OP_PFETCH, // fetch base address of vertex src0 (immediate) [+ src1] + OP_AFETCH, // fetch base address of shader input (a[%r1+0x10]) OP_EXPORT, OP_LINTERP, OP_PINTERP, @@ -372,7 +373,8 @@ enum SVSemantic SV_SAMPLE_INDEX, SV_SAMPLE_POS, SV_SAMPLE_MASK, - SV_TESS_FACTOR, + SV_TESS_OUTER, + SV_TESS_INNER, SV_TESS_COORD, SV_TID, SV_CTAID, diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp index 51b9225156b..fa8ee072a92 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp @@ -332,6 +332,9 @@ BasicBlock::splitBefore(Instruction *insn, bool attach) BasicBlock *bb = new BasicBlock(func); assert(!insn || insn->op != OP_PHI); + bb->joinAt = joinAt; + joinAt = NULL; + splitCommon(insn, bb, attach); return bb; } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp index 708c5b322ee..19418c0e0f1 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp @@ -428,8 +428,7 @@ BuildUtil::mkSysVal(SVSemantic svName, uint32_t svIndex) { Symbol *sym = new_Symbol(prog, FILE_SYSTEM_VALUE, 0); - assert(svIndex < 4 || - (svName == SV_CLIP_DISTANCE || svName == SV_TESS_FACTOR)); + assert(svIndex < 4 || svName == SV_CLIP_DISTANCE); switch (svName) { case SV_POSITION: @@ -438,7 +437,9 @@ BuildUtil::mkSysVal(SVSemantic svName, uint32_t svIndex) case SV_POINT_SIZE: case SV_POINT_COORD: case SV_CLIP_DISTANCE: - case SV_TESS_FACTOR: + case SV_TESS_OUTER: + case SV_TESS_INNER: + case SV_TESS_COORD: sym->reg.type = TYPE_F32; break; default: diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h index dba56bf2716..2b9edcf9172 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h @@ -69,18 +69,6 @@ struct nv50_ir_varying # define NV50_IR_DEBUG_REG_ALLOC 0 #endif -#define NV50_SEMANTIC_CLIPDISTANCE (TGSI_SEMANTIC_COUNT + 0) -#define NV50_SEMANTIC_TESSFACTOR (TGSI_SEMANTIC_COUNT + 7) -#define NV50_SEMANTIC_TESSCOORD (TGSI_SEMANTIC_COUNT + 8) -#define NV50_SEMANTIC_COUNT (TGSI_SEMANTIC_COUNT + 10) - -#define 
NV50_TESS_PART_FRACT_ODD 0 -#define NV50_TESS_PART_FRACT_EVEN 1 -#define NV50_TESS_PART_POW2 2 -#define NV50_TESS_PART_INTEGER 3 - -#define NV50_PRIM_PATCHES PIPE_PRIM_MAX - struct nv50_ir_prog_symbol { uint32_t label; @@ -151,10 +139,10 @@ struct nv50_ir_prog_info } gp; struct { unsigned numColourResults; - boolean writesDepth; - boolean earlyFragTests; - boolean separateFragData; - boolean usesDiscard; + bool writesDepth; + bool earlyFragTests; + bool separateFragData; + bool usesDiscard; } fp; struct { uint32_t inputOffset; /* base address for user args */ @@ -180,11 +168,11 @@ struct nv50_ir_prog_info int8_t viewportId; /* output index of ViewportIndex */ uint8_t fragDepth; /* output index of FragDepth */ uint8_t sampleMask; /* output index of SampleMask */ - boolean sampleInterp; /* perform sample interp on all fp inputs */ + bool sampleInterp; /* perform sample interp on all fp inputs */ uint8_t backFaceColor[2]; /* input/output indices of back face colour */ uint8_t globalAccess; /* 1 for read, 2 for wr, 3 for rw */ - boolean fp64; /* program uses fp64 math */ - boolean nv50styleSurfaces; /* generate gX[] access for raw buffers */ + bool fp64; /* program uses fp64 math */ + bool nv50styleSurfaces; /* generate gX[] access for raw buffers */ uint8_t resInfoCBSlot; /* cX[] used for tex handles, surface info */ uint16_t texBindBase; /* base address for tex handles (nve4) */ uint16_t suInfoBase; /* base address for surface info (nve4) */ diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp index ab8bf2e5504..f06056f8f17 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp @@ -77,6 +77,7 @@ private: void emitMOV(const Instruction *); void emitINTERP(const Instruction *); + void emitAFETCH(const Instruction *); void emitPFETCH(const Instruction *); void emitVFETCH(const Instruction *); void emitEXPORT(const Instruction *); @@ -120,6 +121,8 @@ private: void emitPIXLD(const Instruction *); + void emitBAR(const Instruction *); + void emitFlow(const Instruction *); inline void defId(const ValueDef&, const int pos); @@ -1250,6 +1253,13 @@ CodeEmitterGK110::emitPIXLD(const Instruction *i) } void +CodeEmitterGK110::emitBAR(const Instruction *i) +{ + /* TODO */ + emitNOP(i); +} + +void CodeEmitterGK110::emitFlow(const Instruction *i) { const FlowInstruction *f = i->asFlow(); @@ -1330,6 +1340,23 @@ CodeEmitterGK110::emitFlow(const Instruction *i) } void +CodeEmitterGK110::emitAFETCH(const Instruction *i) +{ + uint32_t offset = i->src(0).get()->reg.data.offset & 0x7ff; + + code[0] = 0x00000002 | (offset << 23); + code[1] = 0x7d000000 | (offset >> 9); + + if (i->getSrc(0)->reg.file == FILE_SHADER_OUTPUT) + code[1] |= 0x8; + + emitPredicate(i); + + defId(i->def(0), 2); + srcId(i->src(0).getIndirect(0), 10); +} + +void CodeEmitterGK110::emitPFETCH(const Instruction *i) { uint32_t prim = i->src(0).get()->reg.data.u32; @@ -1698,6 +1725,9 @@ CodeEmitterGK110::emitInstruction(Instruction *insn) case OP_EXPORT: emitEXPORT(insn); break; + case OP_AFETCH: + emitAFETCH(insn); + break; case OP_PFETCH: emitPFETCH(insn); break; @@ -1856,6 +1886,9 @@ CodeEmitterGK110::emitInstruction(Instruction *insn) emitNOP(insn); insn->join = 1; break; + case OP_BAR: + emitBAR(insn); + break; case OP_PHI: case OP_UNION: case OP_CONSTRAINT: diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp index 399a6f1db13..ef5c87d0437 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp @@ -174,6 +174,7 @@ private: void emitALD(); void emitAST(); void emitISBERD(); + void emitAL2P(); void emitIPA(); void emitPIXLD(); @@ -2204,6 +2205,17 @@ CodeEmitterGM107::emitISBERD() } void +CodeEmitterGM107::emitAL2P() +{ + emitInsn (0xefa00000); + emitField(0x2f, 2, (insn->getDef(0)->reg.size / 4) - 1); + emitO (0x20); + emitField(0x14, 11, insn->src(0).get()->reg.data.offset); + emitGPR (0x08, insn->src(0).getIndirect(0)); + emitGPR (0x00, insn->def(0)); +} + +void CodeEmitterGM107::emitIPA() { int ipam = 0, ipas = 0; @@ -2441,8 +2453,14 @@ CodeEmitterGM107::emitTXQ() break; } - emitInsn (0xdf4a0000); - emitField(0x24, 13, insn->tex.r); + if (insn->tex.rIndirectSrc >= 0) { + emitInsn (0xdf500000); + } else { + emitInsn (0xdf480000); + emitField(0x24, 13, insn->tex.r); + } + + emitField(0x31, 1, insn->tex.liveOnly); emitField(0x1f, 4, insn->tex.mask); emitField(0x16, 6, type); emitGPR (0x08, insn->src(0)); @@ -2753,6 +2771,9 @@ CodeEmitterGM107::emitInstruction(Instruction *i) case OP_PFETCH: emitISBERD(); break; + case OP_AFETCH: + emitAL2P(); + break; case OP_LINTERP: case OP_PINTERP: emitIPA(); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp index 1bfc8e32e84..67ea6df773c 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp @@ -499,10 +499,14 @@ CodeEmitterNV50::emitForm_MAD(const Instruction *i) setSrc(i, 2, 2); if (i->getIndirect(0, 0)) { - assert(!i->getIndirect(1, 0)); + assert(!i->srcExists(1) || !i->getIndirect(1, 0)); + assert(!i->srcExists(2) || !i->getIndirect(2, 0)); setAReg16(i, 0); - } else { + } else if (i->srcExists(1) && i->getIndirect(1, 0)) { + assert(!i->srcExists(2) || !i->getIndirect(2, 0)); setAReg16(i, 1); + } else { + setAReg16(i, 2); } } @@ -546,7 +550,7 @@ CodeEmitterNV50::emitForm_MUL(const Instruction *i) } // usual immediate form -// - 1 to 3 sources where last is immediate (rir, gir) +// - 1 to 3 sources where second is immediate (rir, gir) // - no address or predicate possible void CodeEmitterNV50::emitForm_IMM(const Instruction *i) @@ -562,7 +566,7 @@ CodeEmitterNV50::emitForm_IMM(const Instruction *i) if (Target::operationSrcNr[i->op] > 1) { setSrc(i, 0, 0); setImmediate(i, 1); - setSrc(i, 2, 1); + // If there is another source, it has to be the same as the dest reg. 
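/* (The emitForm_MAD change above enforces the same family of constraint:
 * at most one of a MAD's sources may be indirectly addressed, and the
 * single address-register field must come from whichever source that is,
 * hence the new src0 -> src1 -> src2 cascade with asserts that the other
 * sources are direct.) */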
} else { setImmediate(i, 0); } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp index 472e3a84119..f607f3ba3ec 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp @@ -85,6 +85,7 @@ private: void emitCCTL(const Instruction *); void emitINTERP(const Instruction *); + void emitAFETCH(const Instruction *); void emitPFETCH(const Instruction *); void emitVFETCH(const Instruction *); void emitEXPORT(const Instruction *); @@ -1450,6 +1451,7 @@ CodeEmitterNVC0::emitBAR(const Instruction *i) ImmediateValue *imm = i->getSrc(0)->asImm(); assert(imm); code[0] |= imm->reg.data.u32 << 20; + code[1] |= 0x8000; } // thread count @@ -1460,6 +1462,7 @@ CodeEmitterNVC0::emitBAR(const Instruction *i) assert(imm); code[0] |= imm->reg.data.u32 << 26; code[1] |= imm->reg.data.u32 >> 6; + code[1] |= 0x4000; } if (i->srcExists(2) && (i->predSrc != 2)) { @@ -1494,6 +1497,21 @@ CodeEmitterNVC0::emitBAR(const Instruction *i) } void +CodeEmitterNVC0::emitAFETCH(const Instruction *i) +{ + code[0] = 0x00000006; + code[1] = 0x0c000000 | (i->src(0).get()->reg.data.offset & 0x7ff); + + if (i->getSrc(0)->reg.file == FILE_SHADER_OUTPUT) + code[0] |= 0x200; + + emitPredicate(i); + + defId(i->def(0), 14); + srcId(i->src(0).getIndirect(0), 20); +} + +void CodeEmitterNVC0::emitPFETCH(const Instruction *i) { uint32_t prim = i->src(0).get()->reg.data.u32; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp index ecd115f9807..4847a0f3355 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp @@ -372,6 +372,10 @@ static nv50_ir::SVSemantic translateSysVal(uint sysval) case TGSI_SEMANTIC_SAMPLEPOS: return nv50_ir::SV_SAMPLE_POS; case TGSI_SEMANTIC_SAMPLEMASK: return nv50_ir::SV_SAMPLE_MASK; case TGSI_SEMANTIC_INVOCATIONID: return nv50_ir::SV_INVOCATION_ID; + case TGSI_SEMANTIC_TESSCOORD: return nv50_ir::SV_TESS_COORD; + case TGSI_SEMANTIC_TESSOUTER: return nv50_ir::SV_TESS_OUTER; + case TGSI_SEMANTIC_TESSINNER: return nv50_ir::SV_TESS_INNER; + case TGSI_SEMANTIC_VERTICESIN: return nv50_ir::SV_VERTEX_COUNT; default: assert(0); return nv50_ir::SV_CLOCK; @@ -434,7 +438,6 @@ nv50_ir::DataType Instruction::inferSrcType() const case TGSI_OPCODE_USLT: case TGSI_OPCODE_USNE: case TGSI_OPCODE_USHR: - case TGSI_OPCODE_UCMP: case TGSI_OPCODE_ATOMUADD: case TGSI_OPCODE_ATOMXCHG: case TGSI_OPCODE_ATOMCAS: @@ -827,7 +830,7 @@ Source::Source(struct nv50_ir_prog_info *prog) : info(prog) if (prog->dbgFlags & NV50_IR_DEBUG_BASIC) tgsi_dump(tokens, 0); - mainTempsInLMem = FALSE; + mainTempsInLMem = false; } Source::~Source() @@ -938,7 +941,7 @@ void Source::scanProperty(const struct tgsi_full_property *prop) info->prop.gp.instanceCount = prop->u[0].Data; break; case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS: - info->prop.fp.separateFragData = TRUE; + info->prop.fp.separateFragData = true; break; case TGSI_PROPERTY_FS_COORD_ORIGIN: case TGSI_PROPERTY_FS_COORD_PIXEL_CENTER: @@ -947,6 +950,24 @@ void Source::scanProperty(const struct tgsi_full_property *prop) case TGSI_PROPERTY_VS_PROHIBIT_UCPS: info->io.genUserClip = -1; break; + case TGSI_PROPERTY_TCS_VERTICES_OUT: + info->prop.tp.outputPatchSize = prop->u[0].Data; + break; + case TGSI_PROPERTY_TES_PRIM_MODE: + info->prop.tp.domain = prop->u[0].Data; + break; + case 
TGSI_PROPERTY_TES_SPACING: + info->prop.tp.partitioning = prop->u[0].Data; + break; + case TGSI_PROPERTY_TES_VERTEX_ORDER_CW: + info->prop.tp.winding = prop->u[0].Data; + break; + case TGSI_PROPERTY_TES_POINT_MODE: + if (prop->u[0].Data) + info->prop.tp.outputPrim = PIPE_PRIM_POINTS; + else + info->prop.tp.outputPrim = PIPE_PRIM_TRIANGLES; /* anything but points */ + break; default: INFO("unhandled TGSI property %d\n", prop->Property.PropertyName); break; @@ -1035,6 +1056,11 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl) if (decl->Interp.Location || info->io.sampleInterp) info->in[i].centroid = 1; } + + if (sn == TGSI_SEMANTIC_PATCH) + info->in[i].patch = 1; + if (sn == TGSI_SEMANTIC_PATCH) + info->numPatchConstants = MAX2(info->numPatchConstants, si + 1); } } break; @@ -1069,6 +1095,13 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl) case TGSI_SEMANTIC_VIEWPORT_INDEX: info->io.viewportId = i; break; + case TGSI_SEMANTIC_PATCH: + info->numPatchConstants = MAX2(info->numPatchConstants, si + 1); + /* fallthrough */ + case TGSI_SEMANTIC_TESSOUTER: + case TGSI_SEMANTIC_TESSINNER: + info->out[i].patch = 1; + break; default: break; } @@ -1092,6 +1125,13 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl) info->sv[i].sn = sn; info->sv[i].si = si; info->sv[i].input = inferSysValDirection(sn); + + switch (sn) { + case TGSI_SEMANTIC_TESSOUTER: + case TGSI_SEMANTIC_TESSINNER: + info->sv[i].patch = 1; + break; + } } break; case TGSI_FILE_RESOURCE: @@ -1156,7 +1196,7 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst) } else if (insn.getDst(0).getFile() == TGSI_FILE_TEMPORARY) { if (insn.getDst(0).isIndirect(0)) - mainTempsInLMem = TRUE; + mainTempsInLMem = true; } } @@ -1164,12 +1204,22 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst) Instruction::SrcRegister src = insn.getSrc(s); if (src.getFile() == TGSI_FILE_TEMPORARY) { if (src.isIndirect(0)) - mainTempsInLMem = TRUE; + mainTempsInLMem = true; } else if (src.getFile() == TGSI_FILE_RESOURCE) { if (src.getIndex(0) == TGSI_RESOURCE_GLOBAL) info->io.globalAccess |= (insn.getOpcode() == TGSI_OPCODE_LOAD) ? 0x1 : 0x2; + } else + if (src.getFile() == TGSI_FILE_OUTPUT) { + if (src.isIndirect(0)) { + // We don't know which one is accessed, just mark everything for + // reading. This is an extremely unlikely occurrence. 
+ for (unsigned i = 0; i < info->numOutputs; ++i) + info->out[i].oread = 1; + } else { + info->out[src.getIndex(0)].oread = 1; + } } if (src.getFile() != TGSI_FILE_INPUT) continue; @@ -1246,6 +1296,7 @@ private: Value *shiftAddress(Value *); Value *getVertexBase(int s); + Value *getOutputBase(int s); DataArray *getArrayForFile(unsigned file, int idx); Value *fetchSrc(int s, int c); Value *acquireDst(int d, int c); @@ -1343,6 +1394,8 @@ private: Value *vtxBase[5]; // base address of vertex in primitive (for TP/GP) uint8_t vtxBaseValid; + Value *outBase; // base address of vertex out patch (for TCP) + Stack condBBs; // fork BB, then else clause BB Stack joinBBs; // fork BB, for inserting join ops on ENDIF Stack loopBBs; // loop headers @@ -1476,6 +1529,22 @@ Converter::getVertexBase(int s) } Value * +Converter::getOutputBase(int s) +{ + assert(s < 5); + if (!(vtxBaseValid & (1 << s))) { + Value *offset = loadImm(NULL, tgsi.getSrc(s).getIndex(1)); + if (tgsi.getSrc(s).isIndirect(1)) + offset = mkOp2v(OP_ADD, TYPE_U32, getSSA(), + fetchSrc(tgsi.getSrc(s).getIndirect(1), 0, NULL), + offset); + vtxBaseValid |= 1 << s; + vtxBase[s] = mkOp2v(OP_ADD, TYPE_U32, getSSA(), outBase, offset); + } + return vtxBase[s]; +} + +Value * Converter::fetchSrc(int s, int c) { Value *res; @@ -1488,6 +1557,9 @@ Converter::fetchSrc(int s, int c) if (src.is2D()) { switch (src.getFile()) { + case TGSI_FILE_OUTPUT: + dimRel = getOutputBase(s); + break; case TGSI_FILE_INPUT: dimRel = getVertexBase(s); break; @@ -1542,6 +1614,7 @@ Converter::fetchSrc(tgsi::Instruction::SrcRegister src, int c, Value *ptr) const int idx2d = src.is2D() ? src.getIndex(1) : 0; const int idx = src.getIndex(0); const int swz = src.getSwizzle(c); + Instruction *ld; switch (src.getFile()) { case TGSI_FILE_IMMEDIATE: @@ -1569,13 +1642,19 @@ Converter::fetchSrc(tgsi::Instruction::SrcRegister src, int c, Value *ptr) if (ptr) return mkLoadv(TYPE_U32, srcToSym(src, c), ptr); } - return mkLoadv(TYPE_U32, srcToSym(src, c), shiftAddress(ptr)); + ld = mkLoad(TYPE_U32, getSSA(), srcToSym(src, c), shiftAddress(ptr)); + ld->perPatch = info->in[idx].patch; + return ld->getDef(0); case TGSI_FILE_OUTPUT: - assert(!"load from output file"); - return NULL; + assert(prog->getType() == Program::TYPE_TESSELLATION_CONTROL); + ld = mkLoad(TYPE_U32, getSSA(), srcToSym(src, c), shiftAddress(ptr)); + ld->perPatch = info->out[idx].patch; + return ld->getDef(0); case TGSI_FILE_SYSTEM_VALUE: assert(!ptr); - return mkOp1v(OP_RDSV, TYPE_U32, getSSA(), srcToSym(src, c)); + ld = mkOp1(OP_RDSV, TYPE_U32, getSSA(), srcToSym(src, c)); + ld->perPatch = info->sv[idx].patch; + return ld->getDef(0); default: return getArrayForFile(src.getFile(), idx2d)->load( sub.cur->values, idx, swz, shiftAddress(ptr)); @@ -1645,7 +1724,8 @@ Converter::storeDst(const tgsi::Instruction::DstRegister dst, int c, viewport != NULL) mkOp1(OP_MOV, TYPE_U32, viewport, val); else - mkStore(OP_EXPORT, TYPE_U32, dstToSym(dst, c), ptr, val); + mkStore(OP_EXPORT, TYPE_U32, dstToSym(dst, c), ptr, val)->perPatch = + info->out[idx].patch; } } else if (f == TGSI_FILE_TEMPORARY || @@ -1687,6 +1767,7 @@ Converter::insertConvergenceOps(BasicBlock *conv, BasicBlock *fork) join->fixed = 1; conv->insertHead(join); + assert(!fork->joinAt); fork->joinAt = new_FlowInstruction(func, OP_JOINAT, conv); fork->insertBefore(fork->getExit(), fork->joinAt); } @@ -1728,7 +1809,7 @@ Converter::handleTXQ(Value *dst0[4], enum TexQuery query) } tex->setSrc((c = 0), fetchSrc(0, 0)); // mip level - setTexRS(tex, c, 1, -1); + setTexRS(tex, 
++c, 1, -1); bb->insertTail(tex); } @@ -2569,6 +2650,8 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn) } break; case TGSI_OPCODE_UCMP: + srcTy = TYPE_U32; + /* fallthrough */ case TGSI_OPCODE_CMP: FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { src0 = fetchSrc(0, c); @@ -3282,10 +3365,21 @@ Converter::run() clipVtx[c] = getScratch(); } - if (prog->getType() == Program::TYPE_FRAGMENT) { + switch (prog->getType()) { + case Program::TYPE_TESSELLATION_CONTROL: + outBase = mkOp2v( + OP_SUB, TYPE_U32, getSSA(), + mkOp1v(OP_RDSV, TYPE_U32, getSSA(), mkSysVal(SV_LANEID, 0)), + mkOp1v(OP_RDSV, TYPE_U32, getSSA(), mkSysVal(SV_INVOCATION_ID, 0))); + break; + case Program::TYPE_FRAGMENT: { Symbol *sv = mkSysVal(SV_POSITION, 3); fragCoord[3] = mkOp1v(OP_RDSV, TYPE_F32, getSSA(), sv); mkOp1(OP_RCP, TYPE_F32, fragCoord[3], fragCoord[3]); + break; + } + default: + break; } if (info->io.viewportId >= 0) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp index 596ac95d489..1f3fce2bb9a 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp @@ -176,7 +176,7 @@ GM107LoweringPass::handlePOPCNT(Instruction *i) i->getSrc(0), i->getSrc(1)); i->setSrc(0, tmp); i->setSrc(1, NULL); - return TRUE; + return true; } // diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp index 2c7f7e326b2..bea293bac99 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp @@ -871,6 +871,7 @@ NV50LoweringPreSSA::handleTXL(TexInstruction *i) BasicBlock *joinBB = i->bb->splitAfter(i); bld.setPosition(currBB, true); + assert(!currBB->joinAt); currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL); for (int l = 0; l <= 3; ++l) { diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp index 7a5d1ce0299..c3c302da5c8 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp @@ -559,6 +559,12 @@ NVC0LegalizePostRA::visit(BasicBlock *bb) } else if (i->isNop()) { bb->remove(i); + } else + if (i->op == OP_BAR && i->subOp == NV50_IR_SUBOP_BAR_SYNC && + prog->getType() != Program::TYPE_COMPUTE) { + // It seems like barriers are never required for tessellation since + // the warp size is 32, and there are always at most 32 tcs threads. + bb->remove(i); } else { // TODO: Move this to before register allocation for operations that // need the $c register ! 
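/* (Why the bar.sync removal above is safe: an NVC0 warp is 32 threads
 * wide and a tessellation-control patch has at most 32 invocations, so
 * every thread that would meet at the barrier already runs in lock-step
 * inside one warp. Compute stays excluded because a work-group can span
 * several warps.) */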
@@ -956,7 +962,43 @@ NVC0LoweringPass::handleTXD(TexInstruction *txd) bool NVC0LoweringPass::handleTXQ(TexInstruction *txq) { - // TODO: indirect resource/sampler index + if (txq->tex.rIndirectSrc < 0) + return true; + + Value *ticRel = txq->getIndirectR(); + const int chipset = prog->getTarget()->getChipset(); + + txq->setIndirectS(NULL); + txq->tex.sIndirectSrc = -1; + + assert(ticRel); + + if (chipset < NVISA_GK104_CHIPSET) { + LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa + + txq->setSrc(txq->tex.rIndirectSrc, NULL); + if (txq->tex.r) + ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(), + ticRel, bld.mkImm(txq->tex.r)); + + bld.mkOp2(OP_SHL, TYPE_U32, src, ticRel, bld.mkImm(0x17)); + + txq->moveSources(0, 1); + txq->setSrc(0, src); + } else { + Value *hnd = loadTexHandle( + bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), + txq->getIndirectR(), bld.mkImm(2)), + txq->tex.r); + txq->tex.r = 0xff; + txq->tex.s = 0x1f; + + txq->setIndirectR(NULL); + txq->moveSources(0, 1); + txq->setSrc(0, hnd); + txq->tex.rIndirectSrc = 0; + } + return true; } @@ -1485,6 +1527,10 @@ NVC0LoweringPass::handleRDSV(Instruction *i) i->op = OP_MOV; i->setSrc(0, bld.mkImm((sv == SV_NTID || sv == SV_NCTAID) ? 1 : 0)); } + if (sv == SV_VERTEX_COUNT) { + bld.setPosition(i, true); + bld.mkOp2(OP_EXTBF, TYPE_U32, i->getDef(0), i->getDef(0), bld.mkImm(0x808)); + } return true; } @@ -1554,7 +1600,7 @@ NVC0LoweringPass::handleRDSV(Instruction *i) ld->subOp = NV50_IR_SUBOP_PIXLD_COVMASK; break; default: - if (prog->getType() == Program::TYPE_TESSELLATION_EVAL) + if (prog->getType() == Program::TYPE_TESSELLATION_EVAL && !i->perPatch) vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0)); ld = bld.mkFetch(i->getDef(0), i->dType, FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), vtx); @@ -1705,6 +1751,7 @@ NVC0LoweringPass::checkPredicate(Instruction *insn) bool NVC0LoweringPass::visit(Instruction *i) { + bool ret = true; bld.setPosition(i, false); if (i->cc != CC_ALWAYS) @@ -1736,7 +1783,8 @@ NVC0LoweringPass::visit(Instruction *i) case OP_SQRT: return handleSQRT(i); case OP_EXPORT: - return handleEXPORT(i); + ret = handleEXPORT(i); + break; case OP_EMIT: case OP_RESTART: return handleOUT(i); @@ -1775,6 +1823,9 @@ NVC0LoweringPass::visit(Instruction *i) i->setIndirect(0, 0, ptr); i->subOp = NV50_IR_SUBOP_LDC_IS; } + } else if (i->src(0).getFile() == FILE_SHADER_OUTPUT) { + assert(prog->getType() == Program::TYPE_TESSELLATION_CONTROL); + i->op = OP_VFETCH; } break; case OP_ATOM: @@ -1796,7 +1847,20 @@ NVC0LoweringPass::visit(Instruction *i) default: break; } - return true; + + /* Kepler+ has a special opcode to compute a new base address to be used + * for indirect loads. 
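 * Conceptually, the rewrite emitted below turns
 *
 *    val  = VFETCH a[0x80 + %r]
 * into
 *    base = AFETCH a[0x80 + %r]    (hw resolves the attribute address)
 *    val  = VFETCH a[0x00 + base]  (original offset folded into base)
 *
 * which is why the cloned source's reg offset is zeroed and the load's
 * indirect is re-pointed at the AFETCH result.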
+ */ + if (targ->getChipset() >= NVISA_GK104_CHIPSET && !i->perPatch && + (i->op == OP_VFETCH || i->op == OP_EXPORT) && i->src(0).isIndirect(0)) { + Instruction *afetch = bld.mkOp1(OP_AFETCH, TYPE_U32, bld.getSSA(), + cloneShallow(func, i->getSrc(0))); + afetch->setIndirect(0, 0, i->getIndirect(0, 0)); + i->src(0).get()->reg.data.offset = 0; + i->setIndirect(0, 0, afetch->getDef(0)); + } + + return ret; } bool diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index ae739eeda83..cea96dcdfc5 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -608,9 +608,12 @@ ConstantFolding::expr(Instruction *i, case OP_FMA: { i->op = OP_ADD; + /* Move the immediate to the second arg, otherwise the ADD operation + * won't be emittable + */ i->setSrc(1, i->getSrc(0)); - i->src(1).mod = i->src(2).mod; i->setSrc(0, i->getSrc(2)); + i->src(0).mod = i->src(2).mod; i->setSrc(2, NULL); ImmediateValue src0; @@ -2082,6 +2085,8 @@ MemoryOpt::runOpt(BasicBlock *bb) } if (ldst->getPredicate()) // TODO: handle predicated ld/st continue; + if (ldst->perPatch) // TODO: create separate per-patch lists + continue; if (isLoad) { DataFile file = ldst->src(0).getFile(); @@ -2515,6 +2520,8 @@ Instruction::isResultEqual(const Instruction *that) const case FILE_MEMORY_CONST: case FILE_SHADER_INPUT: return true; + case FILE_SHADER_OUTPUT: + return bb->getProgram()->getType() == Program::TYPE_TESSELLATION_EVAL; default: return false; } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp index ef3de6ff92a..9ebdc6586db 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp @@ -135,6 +135,7 @@ const char *operationStr[OP_LAST + 1] = "membar", "vfetch", "pfetch", + "afetch", "export", "linterp", "pinterp", @@ -258,7 +259,8 @@ static const char *SemanticStr[SV_LAST + 1] = "SAMPLE_INDEX", "SAMPLE_POS", "SAMPLE_MASK", - "TESS_FACTOR", + "TESS_OUTER", + "TESS_INNER", "TESS_COORD", "TID", "CTAID", diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp index 898653c9953..78bc97f4397 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp @@ -2066,6 +2066,8 @@ RegAlloc::InsertConstraintsPass::visit(BasicBlock *bb) condenseDefs(i); if (i->src(0).isIndirect(0) && typeSizeof(i->dType) >= 8) addHazard(i, i->src(0).getIndirect(0)); + if (i->src(0).isIndirect(1) && typeSizeof(i->dType) >= 8) + addHazard(i, i->src(0).getIndirect(1)); } else if (i->op == OP_UNION || i->op == OP_MERGE || diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp index 7992f539782..fe530c76b62 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp @@ -41,7 +41,7 @@ const uint8_t Target::operationSrcNr[] = 0, 0, 0, 0, 0, // BRA, CALL, RET, CONT, BREAK, 0, 0, 0, // PRERET,CONT,BREAK 0, 0, 0, 0, 0, 0, // BRKPT, JOINAT, JOIN, DISCARD, EXIT, MEMBAR - 1, 1, 2, 1, 2, // VFETCH, PFETCH, EXPORT, LINTERP, PINTERP + 1, 1, 1, 2, 1, 2, // VFETCH, PFETCH, AFETCH, EXPORT, LINTERP, PINTERP 1, 1, // EMIT, RESTART 1, 1, 1, // TEX, TXB, TXL, 1, 1, 1, 1, 1, 1, 2, // TXF, TXQ, TXD, TXG, TXLQ, TEXCSAA, TEXPREP @@ -96,8 +96,8 @@ 
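/* (On the ConstantFolding::expr() OP_FMA hunk above: when fma(a, b, c)
 * folds a*b into an immediate k, the replacement is add(c, k) and k must
 * land in source slot 1, since the immediate encodings only accept it
 * there. The fix also moves c's modifier along with c into slot 0; the
 * old code left it on slot 1, i.e. on the immediate.) */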
const OpClass Target::operationClass[] = OPCLASS_FLOW, OPCLASS_FLOW, // MEMBAR OPCLASS_CONTROL, - // VFETCH, PFETCH, EXPORT - OPCLASS_LOAD, OPCLASS_OTHER, OPCLASS_STORE, + // VFETCH, PFETCH, AFETCH, EXPORT + OPCLASS_LOAD, OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_STORE, // LINTERP, PINTERP OPCLASS_SFU, OPCLASS_SFU, // EMIT, RESTART diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp index ca545a6024a..f3ddcaa5199 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp @@ -118,7 +118,7 @@ void TargetNV50::initOpInfo() static const uint32_t shortForm[(OP_LAST + 31) / 32] = { // MOV,ADD,SUB,MUL,MAD,SAD,L/PINTERP,RCP,TEX,TXF - 0x00014e40, 0x00000040, 0x00000498, 0x00000000 + 0x00014e40, 0x00000040, 0x00000930, 0x00000000 }; static const operation noDestList[] = { diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp index 7d4a859dde4..27df0eba66b 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp @@ -286,7 +286,8 @@ TargetNVC0::getSVAddress(DataFile shaderFile, const Symbol *sym) const case SV_CLIP_DISTANCE: return 0x2c0 + idx * 4; case SV_POINT_COORD: return 0x2e0 + idx * 4; case SV_FACE: return 0x3fc; - case SV_TESS_FACTOR: return 0x000 + idx * 4; + case SV_TESS_OUTER: return 0x000 + idx * 4; + case SV_TESS_INNER: return 0x010 + idx * 4; case SV_TESS_COORD: return 0x2f0 + idx * 4; case SV_NTID: return kepler ? (0x00 + idx * 4) : ~0; case SV_NCTAID: return kepler ? (0x0c + idx * 4) : ~0; diff --git a/src/gallium/drivers/nouveau/nouveau_buffer.c b/src/gallium/drivers/nouveau/nouveau_buffer.c index 09cdbb53ecb..67e181e803a 100644 --- a/src/gallium/drivers/nouveau/nouveau_buffer.c +++ b/src/gallium/drivers/nouveau/nouveau_buffer.c @@ -22,13 +22,13 @@ struct nouveau_transfer { uint32_t offset; }; -static INLINE struct nouveau_transfer * +static inline struct nouveau_transfer * nouveau_transfer(struct pipe_transfer *transfer) { return (struct nouveau_transfer *)transfer; } -static INLINE boolean +static inline bool nouveau_buffer_malloc(struct nv04_resource *buf) { if (!buf->data) @@ -36,16 +36,11 @@ nouveau_buffer_malloc(struct nv04_resource *buf) return !!buf->data; } -static INLINE boolean +static inline bool nouveau_buffer_allocate(struct nouveau_screen *screen, struct nv04_resource *buf, unsigned domain) { - uint32_t size = buf->base.width0; - - if (buf->base.bind & (PIPE_BIND_CONSTANT_BUFFER | - PIPE_BIND_COMPUTE_RESOURCE | - PIPE_BIND_SHADER_RESOURCE)) - size = align(size, 0x100); + uint32_t size = align(buf->base.width0, 0x100); if (domain == NOUVEAU_BO_VRAM) { buf->mm = nouveau_mm_allocate(screen->mm_VRAM, size, @@ -58,12 +53,12 @@ nouveau_buffer_allocate(struct nouveau_screen *screen, buf->mm = nouveau_mm_allocate(screen->mm_GART, size, &buf->bo, &buf->offset); if (!buf->bo) - return FALSE; + return false; NOUVEAU_DRV_STAT(screen, buf_obj_current_bytes_sys, buf->base.width0); } else { assert(domain == 0); if (!nouveau_buffer_malloc(buf)) - return FALSE; + return false; } buf->domain = domain; if (buf->bo) @@ -71,10 +66,10 @@ nouveau_buffer_allocate(struct nouveau_screen *screen, util_range_set_empty(&buf->valid_buffer_range); - return TRUE; + return true; } -static INLINE void +static inline void release_allocation(struct nouveau_mm_allocation **mm, struct 
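/* (From here on the patch also migrates nouveau from gallium's legacy
 * boolean / TRUE / FALSE typedef to C99 <stdbool.h> bool / true / false.
 * The two are value-compatible, so expressions like "return !!buf->data;"
 * keep their meaning; the gain is real bool semantics and one less
 * portability typedef.) */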
nouveau_fence *fence) { @@ -82,7 +77,7 @@ release_allocation(struct nouveau_mm_allocation **mm, (*mm) = NULL; } -INLINE void +inline void nouveau_buffer_release_gpu_storage(struct nv04_resource *buf) { nouveau_bo_ref(NULL, &buf->bo); @@ -98,7 +93,7 @@ nouveau_buffer_release_gpu_storage(struct nv04_resource *buf) buf->domain = 0; } -static INLINE boolean +static inline bool nouveau_buffer_reallocate(struct nouveau_screen *screen, struct nv04_resource *buf, unsigned domain) { @@ -139,13 +134,13 @@ nouveau_buffer_destroy(struct pipe_screen *pscreen, */ static uint8_t * nouveau_transfer_staging(struct nouveau_context *nv, - struct nouveau_transfer *tx, boolean permit_pb) + struct nouveau_transfer *tx, bool permit_pb) { const unsigned adj = tx->base.box.x & NOUVEAU_MIN_BUFFER_MAP_ALIGN_MASK; const unsigned size = align(tx->base.box.width, 4) + adj; if (!nv->push_data) - permit_pb = FALSE; + permit_pb = false; if ((size <= NOUVEAU_TRANSFER_PUSHBUF_THRESHOLD) && permit_pb) { tx->map = align_malloc(size, NOUVEAU_MIN_BUFFER_MAP_ALIGN); @@ -167,7 +162,7 @@ nouveau_transfer_staging(struct nouveau_context *nv, * buffer. Also updates buf->data if present. * * Maybe just migrate to GART right away if we actually need to do this. */ -static boolean +static bool nouveau_transfer_read(struct nouveau_context *nv, struct nouveau_transfer *tx) { struct nv04_resource *buf = nv04_resource(tx->base.resource); @@ -180,12 +175,12 @@ nouveau_transfer_read(struct nouveau_context *nv, struct nouveau_transfer *tx) buf->bo, buf->offset + base, buf->domain, size); if (nouveau_bo_wait(tx->bo, NOUVEAU_BO_RD, nv->client)) - return FALSE; + return false; if (buf->data) memcpy(buf->data + base, tx->map, size); - return TRUE; + return true; } static void @@ -195,7 +190,7 @@ nouveau_transfer_write(struct nouveau_context *nv, struct nouveau_transfer *tx, struct nv04_resource *buf = nv04_resource(tx->base.resource); uint8_t *data = tx->map + offset; const unsigned base = tx->base.box.x + offset; - const boolean can_cb = !((base | size) & 3); + const bool can_cb = !((base | size) & 3); if (buf->data) memcpy(data, buf->data + base, size); @@ -224,32 +219,32 @@ nouveau_transfer_write(struct nouveau_context *nv, struct nouveau_transfer *tx, /* Does a CPU wait for the buffer's backing data to become reliably accessible * for write/read by waiting on the buffer's relevant fences. 
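 *
 * (Distilled: a reader only has to wait out pending GPU writes, while a
 * writer must wait for every outstanding GPU access. A sketch over the
 * two fences the resource keeps -- fence_wr tracks the last write, fence
 * the last use of any kind:)
 */

static bool
buffer_wait_sketch(struct nouveau_fence *fence,    /* last use */
                   struct nouveau_fence *fence_wr, /* last write */
                   bool will_write)
{
   if (!will_write)
      return !fence_wr || nouveau_fence_wait(fence_wr);
   return !fence || nouveau_fence_wait(fence);
}

/*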
*/ -static INLINE boolean +static inline bool nouveau_buffer_sync(struct nv04_resource *buf, unsigned rw) { if (rw == PIPE_TRANSFER_READ) { if (!buf->fence_wr) - return TRUE; + return true; NOUVEAU_DRV_STAT_RES(buf, buf_non_kernel_fence_sync_count, !nouveau_fence_signalled(buf->fence_wr)); if (!nouveau_fence_wait(buf->fence_wr)) - return FALSE; + return false; } else { if (!buf->fence) - return TRUE; + return true; NOUVEAU_DRV_STAT_RES(buf, buf_non_kernel_fence_sync_count, !nouveau_fence_signalled(buf->fence)); if (!nouveau_fence_wait(buf->fence)) - return FALSE; + return false; nouveau_fence_ref(NULL, &buf->fence); } nouveau_fence_ref(NULL, &buf->fence_wr); - return TRUE; + return true; } -static INLINE boolean +static inline bool nouveau_buffer_busy(struct nv04_resource *buf, unsigned rw) { if (rw == PIPE_TRANSFER_READ) @@ -258,7 +253,7 @@ nouveau_buffer_busy(struct nv04_resource *buf, unsigned rw) return (buf->fence && !nouveau_fence_signalled(buf->fence)); } -static INLINE void +static inline void nouveau_buffer_transfer_init(struct nouveau_transfer *tx, struct pipe_resource *resource, const struct pipe_box *box, @@ -280,7 +275,7 @@ nouveau_buffer_transfer_init(struct nouveau_transfer *tx, tx->map = NULL; } -static INLINE void +static inline void nouveau_buffer_transfer_del(struct nouveau_context *nv, struct nouveau_transfer *tx) { @@ -297,11 +292,11 @@ nouveau_buffer_transfer_del(struct nouveau_context *nv, } /* Creates a cache in system memory of the buffer data. */ -static boolean +static bool nouveau_buffer_cache(struct nouveau_context *nv, struct nv04_resource *buf) { struct nouveau_transfer tx; - boolean ret; + bool ret; tx.base.resource = &buf->base; tx.base.box.x = 0; tx.base.box.width = buf->base.width0; @@ -310,13 +305,13 @@ nouveau_buffer_cache(struct nouveau_context *nv, struct nv04_resource *buf) if (!buf->data) if (!nouveau_buffer_malloc(buf)) - return FALSE; + return false; if (!(buf->status & NOUVEAU_BUFFER_STATUS_DIRTY)) - return TRUE; + return true; nv->stats.buf_cache_count++; - if (!nouveau_transfer_staging(nv, &tx, FALSE)) - return FALSE; + if (!nouveau_transfer_staging(nv, &tx, false)) + return false; ret = nouveau_transfer_read(nv, &tx); if (ret) { @@ -335,15 +330,15 @@ nouveau_buffer_cache(struct nouveau_context *nv, struct nv04_resource *buf) * resource. This can be useful if we would otherwise have to wait for a read * operation to complete on this data. */ -static INLINE boolean +static inline bool nouveau_buffer_should_discard(struct nv04_resource *buf, unsigned usage) { if (!(usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE)) - return FALSE; + return false; if (unlikely(buf->base.bind & PIPE_BIND_SHARED)) - return FALSE; + return false; if (unlikely(usage & PIPE_TRANSFER_PERSISTENT)) - return FALSE; + return false; return buf->mm && nouveau_buffer_busy(buf, PIPE_TRANSFER_WRITE); } @@ -413,7 +408,7 @@ nouveau_buffer_transfer_map(struct pipe_context *pipe, * back into VRAM on unmap. */ if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) buf->status &= NOUVEAU_BUFFER_STATUS_REALLOC_MASK; - nouveau_transfer_staging(nv, tx, TRUE); + nouveau_transfer_staging(nv, tx, true); } else { if (buf->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) { /* The GPU is currently writing to this buffer. 
Copy its current @@ -424,13 +419,13 @@ nouveau_buffer_transfer_map(struct pipe_context *pipe, align_free(buf->data); buf->data = NULL; } - nouveau_transfer_staging(nv, tx, FALSE); + nouveau_transfer_staging(nv, tx, false); nouveau_transfer_read(nv, tx); } else { /* The buffer is currently idle. Create a staging area for writes, * and make sure that the cached data is up-to-date. */ if (usage & PIPE_TRANSFER_WRITE) - nouveau_transfer_staging(nv, tx, TRUE); + nouveau_transfer_staging(nv, tx, true); if (!buf->data) nouveau_buffer_cache(nv, buf); } @@ -482,7 +477,7 @@ nouveau_buffer_transfer_map(struct pipe_context *pipe, if (usage & PIPE_TRANSFER_DISCARD_RANGE) { /* The whole range is being discarded, so it doesn't matter what was * there before. No need to copy anything over. */ - nouveau_transfer_staging(nv, tx, TRUE); + nouveau_transfer_staging(nv, tx, true); map = tx->map; } else if (nouveau_buffer_busy(buf, PIPE_TRANSFER_READ)) { @@ -493,7 +488,7 @@ nouveau_buffer_transfer_map(struct pipe_context *pipe, } else { /* It is expected that the returned buffer be a representation of the * data in question, so we must copy it over from the buffer. */ - nouveau_transfer_staging(nv, tx, TRUE); + nouveau_transfer_staging(nv, tx, true); if (tx->map) memcpy(tx->map, map, box->width); map = tx->map; @@ -544,7 +539,7 @@ nouveau_buffer_transfer_unmap(struct pipe_context *pipe, const uint8_t bind = buf->base.bind; /* make sure we invalidate dedicated caches */ if (bind & (PIPE_BIND_VERTEX_BUFFER | PIPE_BIND_INDEX_BUFFER)) - nv->vbo_dirty = TRUE; + nv->vbo_dirty = true; } util_range_add(&buf->valid_buffer_range, @@ -639,7 +634,7 @@ nouveau_buffer_create(struct pipe_screen *pscreen, { struct nouveau_screen *screen = nouveau_screen(pscreen); struct nv04_resource *buffer; - boolean ret; + bool ret; buffer = CALLOC_STRUCT(nv04_resource); if (!buffer) @@ -683,7 +678,7 @@ nouveau_buffer_create(struct pipe_screen *pscreen, } ret = nouveau_buffer_allocate(screen, buffer, buffer->domain); - if (ret == FALSE) + if (ret == false) goto fail; if (buffer->domain == NOUVEAU_BO_VRAM && screen->hint_buf_keep_sysmem_copy) @@ -730,20 +725,20 @@ nouveau_user_buffer_create(struct pipe_screen *pscreen, void *ptr, return &buffer->base; } -static INLINE boolean +static inline bool nouveau_buffer_data_fetch(struct nouveau_context *nv, struct nv04_resource *buf, struct nouveau_bo *bo, unsigned offset, unsigned size) { if (!nouveau_buffer_malloc(buf)) - return FALSE; + return false; if (nouveau_bo_map(bo, NOUVEAU_BO_RD, nv->client)) - return FALSE; + return false; memcpy(buf->data, (uint8_t *)bo->map + offset, size); - return TRUE; + return true; } /* Migrate a linear buffer (vertex, index, constants) USER -> GART -> VRAM. 
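 * The staged route exists because user memory is not GPU-visible: data is
 * first copied into GART (CPU-mappable and GPU-readable), and only then
 * can the GPU move it into VRAM; the direct USER -> VRAM case below goes
 * through a staging nouveau_transfer_write() instead.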
*/ -boolean +bool nouveau_buffer_migrate(struct nouveau_context *nv, struct nv04_resource *buf, const unsigned new_domain) { @@ -758,7 +753,7 @@ nouveau_buffer_migrate(struct nouveau_context *nv, if (new_domain == NOUVEAU_BO_GART && old_domain == 0) { if (!nouveau_buffer_allocate(screen, buf, new_domain)) - return FALSE; + return false; ret = nouveau_bo_map(buf->bo, 0, nv->client); if (ret) return ret; @@ -771,7 +766,7 @@ nouveau_buffer_migrate(struct nouveau_context *nv, if (new_domain == NOUVEAU_BO_VRAM) { /* keep a system memory copy of our data in case we hit a fallback */ if (!nouveau_buffer_data_fetch(nv, buf, buf->bo, buf->offset, size)) - return FALSE; + return false; if (nouveau_mesa_debug) debug_printf("migrating %u KiB to VRAM\n", size / 1024); } @@ -792,28 +787,28 @@ nouveau_buffer_migrate(struct nouveau_context *nv, if (new_domain == NOUVEAU_BO_VRAM && old_domain == 0) { struct nouveau_transfer tx; if (!nouveau_buffer_allocate(screen, buf, NOUVEAU_BO_VRAM)) - return FALSE; + return false; tx.base.resource = &buf->base; tx.base.box.x = 0; tx.base.box.width = buf->base.width0; tx.bo = NULL; tx.map = NULL; - if (!nouveau_transfer_staging(nv, &tx, FALSE)) - return FALSE; + if (!nouveau_transfer_staging(nv, &tx, false)) + return false; nouveau_transfer_write(nv, &tx, 0, tx.base.box.width); nouveau_buffer_transfer_del(nv, &tx); } else - return FALSE; + return false; assert(buf->domain == new_domain); - return TRUE; + return true; } /* Migrate data from glVertexAttribPointer(non-VBO) user buffers to GART. * We'd like to only allocate @size bytes here, but then we'd have to rebase * the vertex indices ... */ -boolean +bool nouveau_user_buffer_upload(struct nouveau_context *nv, struct nv04_resource *buf, unsigned base, unsigned size) @@ -825,20 +820,20 @@ nouveau_user_buffer_upload(struct nouveau_context *nv, buf->base.width0 = base + size; if (!nouveau_buffer_reallocate(screen, buf, NOUVEAU_BO_GART)) - return FALSE; + return false; ret = nouveau_bo_map(buf->bo, 0, nv->client); if (ret) - return FALSE; + return false; memcpy((uint8_t *)buf->bo->map + buf->offset + base, buf->data + base, size); - return TRUE; + return true; } /* Scratch data allocation. */ -static INLINE int +static inline int nouveau_scratch_bo_alloc(struct nouveau_context *nv, struct nouveau_bo **pbo, unsigned size) { @@ -875,7 +870,7 @@ nouveau_scratch_runout_release(struct nouveau_context *nv) /* Allocate an extra bo if we can't fit everything we need simultaneously. * (Could happen for very large user arrays.) */ -static INLINE boolean +static inline bool nouveau_scratch_runout(struct nouveau_context *nv, unsigned size) { int ret; @@ -909,7 +904,7 @@ nouveau_scratch_runout(struct nouveau_context *nv, unsigned size) /* Continue to next scratch buffer, if available (no wrapping, large enough). * Allocate it if it has not yet been created. 
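A minimal sketch of the ring-advance rule implemented by nouveau_scratch_next() below: move to the next slot unless the request exceeds one slab or the ring would collide with buffers still in flight at the wrap marker. The toy_* names and the slab count are hypothetical; the real code also lazily allocates the slot's bo.

#include <stdbool.h>

#define TOY_MAX_SCRATCH_BUFS 4

struct toy_scratch {
   unsigned id;      /* slot currently being filled */
   unsigned wrap;    /* oldest slot the GPU may still be using */
   unsigned bo_size; /* fixed size of every scratch slab */
};

bool
toy_scratch_next(struct toy_scratch *s, unsigned size)
{
   unsigned next = (s->id + 1) % TOY_MAX_SCRATCH_BUFS;

   /* Refuse, as the driver does, when the request cannot fit in one slab
    * or the ring would catch up with buffers still in flight. */
   if (size > s->bo_size || next == s->wrap)
      return false;

   s->id = next;
   return true;
}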
*/ -static INLINE boolean +static inline bool nouveau_scratch_next(struct nouveau_context *nv, unsigned size) { struct nouveau_bo *bo; @@ -917,14 +912,14 @@ nouveau_scratch_next(struct nouveau_context *nv, unsigned size) const unsigned i = (nv->scratch.id + 1) % NOUVEAU_MAX_SCRATCH_BUFS; if ((size > nv->scratch.bo_size) || (i == nv->scratch.wrap)) - return FALSE; + return false; nv->scratch.id = i; bo = nv->scratch.bo[i]; if (!bo) { ret = nouveau_scratch_bo_alloc(nv, &bo, nv->scratch.bo_size); if (ret) - return FALSE; + return false; nv->scratch.bo[i] = bo; } nv->scratch.current = bo; @@ -937,10 +932,10 @@ nouveau_scratch_next(struct nouveau_context *nv, unsigned size) return !ret; } -static boolean +static bool nouveau_scratch_more(struct nouveau_context *nv, unsigned min_size) { - boolean ret; + bool ret; ret = nouveau_scratch_next(nv, min_size); if (!ret) diff --git a/src/gallium/drivers/nouveau/nouveau_buffer.h b/src/gallium/drivers/nouveau/nouveau_buffer.h index de77f481da3..7e6a6cc804b 100644 --- a/src/gallium/drivers/nouveau/nouveau_buffer.h +++ b/src/gallium/drivers/nouveau/nouveau_buffer.h @@ -58,7 +58,7 @@ nouveau_copy_buffer(struct nouveau_context *, struct nv04_resource *dst, unsigned dst_pos, struct nv04_resource *src, unsigned src_pos, unsigned size); -boolean +bool nouveau_buffer_migrate(struct nouveau_context *, struct nv04_resource *, unsigned domain); @@ -66,20 +66,20 @@ void * nouveau_resource_map_offset(struct nouveau_context *, struct nv04_resource *, uint32_t offset, uint32_t flags); -static INLINE void +static inline void nouveau_resource_unmap(struct nv04_resource *res) { /* no-op */ } -static INLINE struct nv04_resource * +static inline struct nv04_resource * nv04_resource(struct pipe_resource *resource) { return (struct nv04_resource *)resource; } /* is resource mapped into the GPU's address space (i.e. VRAM or GART) ? */ -static INLINE boolean +static inline bool nouveau_resource_mapped_by_gpu(struct pipe_resource *resource) { return nv04_resource(resource)->domain != 0; @@ -93,7 +93,7 @@ struct pipe_resource * nouveau_user_buffer_create(struct pipe_screen *screen, void *ptr, unsigned bytes, unsigned usage); -boolean +bool nouveau_user_buffer_upload(struct nouveau_context *, struct nv04_resource *, unsigned base, unsigned size); diff --git a/src/gallium/drivers/nouveau/nouveau_context.h b/src/gallium/drivers/nouveau/nouveau_context.h index c2ba0159afe..24deb7ee4c0 100644 --- a/src/gallium/drivers/nouveau/nouveau_context.h +++ b/src/gallium/drivers/nouveau/nouveau_context.h @@ -13,7 +13,7 @@ struct nouveau_context { struct nouveau_client *client; struct nouveau_pushbuf *pushbuf; - boolean vbo_dirty; + bool vbo_dirty; void (*copy_data)(struct nouveau_context *, struct nouveau_bo *dst, unsigned, unsigned, @@ -53,7 +53,7 @@ struct nouveau_context { } stats; }; -static INLINE struct nouveau_context * +static inline struct nouveau_context * nouveau_context(struct pipe_context *pipe) { return (struct nouveau_context *)pipe; @@ -69,7 +69,7 @@ nouveau_scratch_runout_release(struct nouveau_context *); * because we don't want to un-bo_ref each allocation every time. This is less * work, and we need the wrap index anyway for extreme situations. 
*/ -static INLINE void +static inline void nouveau_scratch_done(struct nouveau_context *nv) { nv->scratch.wrap = nv->scratch.id; @@ -84,7 +84,7 @@ void * nouveau_scratch_get(struct nouveau_context *, unsigned size, uint64_t *gpu_addr, struct nouveau_bo **); -static INLINE void +static inline void nouveau_context_destroy(struct nouveau_context *ctx) { int i; @@ -96,7 +96,7 @@ nouveau_context_destroy(struct nouveau_context *ctx) FREE(ctx); } -static INLINE void +static inline void nouveau_context_update_frame_stats(struct nouveau_context *nv) { nv->stats.buf_cache_frame <<= 1; @@ -104,7 +104,7 @@ nouveau_context_update_frame_stats(struct nouveau_context *nv) nv->stats.buf_cache_count = 0; nv->stats.buf_cache_frame |= 1; if ((nv->stats.buf_cache_frame & 0xf) == 0xf) - nv->screen->hint_buf_keep_sysmem_copy = TRUE; + nv->screen->hint_buf_keep_sysmem_copy = true; } } diff --git a/src/gallium/drivers/nouveau/nouveau_fence.c b/src/gallium/drivers/nouveau/nouveau_fence.c index 17a5174594d..abcdb479954 100644 --- a/src/gallium/drivers/nouveau/nouveau_fence.c +++ b/src/gallium/drivers/nouveau/nouveau_fence.c @@ -28,13 +28,13 @@ #include <sched.h> #endif -boolean +bool nouveau_fence_new(struct nouveau_screen *screen, struct nouveau_fence **fence, - boolean emit) + bool emit) { *fence = CALLOC_STRUCT(nouveau_fence); if (!*fence) - return FALSE; + return false; (*fence)->screen = screen; (*fence)->ref = 1; @@ -43,7 +43,7 @@ nouveau_fence_new(struct nouveau_screen *screen, struct nouveau_fence **fence, if (emit) nouveau_fence_emit(*fence); - return TRUE; + return true; } static void @@ -58,7 +58,7 @@ nouveau_fence_trigger_work(struct nouveau_fence *fence) } } -boolean +bool nouveau_fence_work(struct nouveau_fence *fence, void (*func)(void *), void *data) { @@ -66,16 +66,16 @@ nouveau_fence_work(struct nouveau_fence *fence, if (!fence || fence->state == NOUVEAU_FENCE_STATE_SIGNALLED) { func(data); - return TRUE; + return true; } work = CALLOC_STRUCT(nouveau_fence_work); if (!work) - return FALSE; + return false; work->func = func; work->data = data; LIST_ADD(&work->list, &fence->work); - return TRUE; + return true; } void @@ -132,7 +132,7 @@ nouveau_fence_del(struct nouveau_fence *fence) } void -nouveau_fence_update(struct nouveau_screen *screen, boolean flushed) +nouveau_fence_update(struct nouveau_screen *screen, bool flushed) { struct nouveau_fence *fence; struct nouveau_fence *next = NULL; @@ -167,21 +167,21 @@ nouveau_fence_update(struct nouveau_screen *screen, boolean flushed) #define NOUVEAU_FENCE_MAX_SPINS (1 << 31) -boolean +bool nouveau_fence_signalled(struct nouveau_fence *fence) { struct nouveau_screen *screen = fence->screen; if (fence->state == NOUVEAU_FENCE_STATE_SIGNALLED) - return TRUE; + return true; if (fence->state >= NOUVEAU_FENCE_STATE_EMITTED) - nouveau_fence_update(screen, FALSE); + nouveau_fence_update(screen, false); return fence->state == NOUVEAU_FENCE_STATE_SIGNALLED; } -boolean +bool nouveau_fence_wait(struct nouveau_fence *fence) { struct nouveau_screen *screen = fence->screen; @@ -195,16 +195,16 @@ nouveau_fence_wait(struct nouveau_fence *fence) if (fence->state < NOUVEAU_FENCE_STATE_FLUSHED) if (nouveau_pushbuf_kick(screen->pushbuf, screen->pushbuf->channel)) - return FALSE; + return false; if (fence == screen->fence.current) nouveau_fence_next(screen); do { - nouveau_fence_update(screen, FALSE); + nouveau_fence_update(screen, false); if (fence->state == NOUVEAU_FENCE_STATE_SIGNALLED) - return TRUE; + return true; if (!spins) NOUVEAU_DRV_STAT(screen, 
any_non_kernel_fence_sync_count, 1); spins++; @@ -218,7 +218,7 @@ nouveau_fence_wait(struct nouveau_fence *fence) fence->sequence, screen->fence.sequence_ack, screen->fence.sequence); - return FALSE; + return false; } void @@ -229,5 +229,5 @@ nouveau_fence_next(struct nouveau_screen *screen) nouveau_fence_ref(NULL, &screen->fence.current); - nouveau_fence_new(screen, &screen->fence.current, FALSE); + nouveau_fence_new(screen, &screen->fence.current, false); } diff --git a/src/gallium/drivers/nouveau/nouveau_fence.h b/src/gallium/drivers/nouveau/nouveau_fence.h index 7bb132a5d15..a1587051b0f 100644 --- a/src/gallium/drivers/nouveau/nouveau_fence.h +++ b/src/gallium/drivers/nouveau/nouveau_fence.h @@ -29,15 +29,15 @@ struct nouveau_fence { void nouveau_fence_emit(struct nouveau_fence *); void nouveau_fence_del(struct nouveau_fence *); -boolean nouveau_fence_new(struct nouveau_screen *, struct nouveau_fence **, - boolean emit); -boolean nouveau_fence_work(struct nouveau_fence *, void (*)(void *), void *); -void nouveau_fence_update(struct nouveau_screen *, boolean flushed); -void nouveau_fence_next(struct nouveau_screen *); -boolean nouveau_fence_wait(struct nouveau_fence *); -boolean nouveau_fence_signalled(struct nouveau_fence *); - -static INLINE void +bool nouveau_fence_new(struct nouveau_screen *, struct nouveau_fence **, + bool emit); +bool nouveau_fence_work(struct nouveau_fence *, void (*)(void *), void *); +void nouveau_fence_update(struct nouveau_screen *, bool flushed); +void nouveau_fence_next(struct nouveau_screen *); +bool nouveau_fence_wait(struct nouveau_fence *); +bool nouveau_fence_signalled(struct nouveau_fence *); + +static inline void nouveau_fence_ref(struct nouveau_fence *fence, struct nouveau_fence **ref) { if (fence) @@ -51,7 +51,7 @@ nouveau_fence_ref(struct nouveau_fence *fence, struct nouveau_fence **ref) *ref = fence; } -static INLINE struct nouveau_fence * +static inline struct nouveau_fence * nouveau_fence(struct pipe_fence_handle *fence) { return (struct nouveau_fence *)fence; diff --git a/src/gallium/drivers/nouveau/nouveau_gldefs.h b/src/gallium/drivers/nouveau/nouveau_gldefs.h index ff97aaa9af0..1538c7b6e57 100644 --- a/src/gallium/drivers/nouveau/nouveau_gldefs.h +++ b/src/gallium/drivers/nouveau/nouveau_gldefs.h @@ -1,7 +1,7 @@ #ifndef __NOUVEAU_GLDEFS_H__ #define __NOUVEAU_GLDEFS_H__ -static INLINE unsigned +static inline unsigned nvgl_blend_func(unsigned factor) { switch (factor) { @@ -40,7 +40,7 @@ nvgl_blend_func(unsigned factor) } } -static INLINE unsigned +static inline unsigned nvgl_blend_eqn(unsigned func) { switch (func) { @@ -59,7 +59,7 @@ nvgl_blend_eqn(unsigned func) } } -static INLINE unsigned +static inline unsigned nvgl_logicop_func(unsigned func) { switch (func) { @@ -100,7 +100,7 @@ nvgl_logicop_func(unsigned func) } } -static INLINE unsigned +static inline unsigned nvgl_comparison_op(unsigned op) { switch (op) { @@ -125,7 +125,7 @@ nvgl_comparison_op(unsigned op) } } -static INLINE unsigned +static inline unsigned nvgl_polygon_mode(unsigned mode) { switch (mode) { @@ -140,7 +140,7 @@ nvgl_polygon_mode(unsigned mode) } } -static INLINE unsigned +static inline unsigned nvgl_stencil_op(unsigned op) { switch (op) { @@ -165,7 +165,7 @@ nvgl_stencil_op(unsigned op) } } -static INLINE unsigned +static inline unsigned nvgl_primitive(unsigned prim) { switch (prim) { case PIPE_PRIM_POINTS: diff --git a/src/gallium/drivers/nouveau/nouveau_mm.c b/src/gallium/drivers/nouveau/nouveau_mm.c index 9c454c56db0..43b3d99f48a 100644 --- 
a/src/gallium/drivers/nouveau/nouveau_mm.c +++ b/src/gallium/drivers/nouveau/nouveau_mm.c @@ -70,7 +70,7 @@ mm_slab_alloc(struct mm_slab *slab) return -1; } -static INLINE void +static inline void mm_slab_free(struct mm_slab *slab, int i) { assert(i < slab->count); @@ -79,7 +79,7 @@ mm_slab_free(struct mm_slab *slab, int i) assert(slab->free <= slab->count); } -static INLINE int +static inline int mm_get_order(uint32_t size) { int s = __builtin_clz(size) ^ 31; @@ -104,7 +104,7 @@ mm_bucket_by_size(struct nouveau_mman *cache, unsigned size) } /* size of bo allocation for slab with chunks of (1 << chunk_order) bytes */ -static INLINE uint32_t +static inline uint32_t mm_default_slab_size(unsigned chunk_order) { static const int8_t slab_order[MM_MAX_ORDER - MM_MIN_ORDER + 1] = @@ -263,7 +263,7 @@ nouveau_mm_create(struct nouveau_device *dev, uint32_t domain, return cache; } -static INLINE void +static inline void nouveau_mm_free_slabs(struct list_head *head) { struct mm_slab *slab, *next; diff --git a/src/gallium/drivers/nouveau/nouveau_screen.c b/src/gallium/drivers/nouveau/nouveau_screen.c index c6e5074db19..b2290e7e784 100644 --- a/src/gallium/drivers/nouveau/nouveau_screen.c +++ b/src/gallium/drivers/nouveau/nouveau_screen.c @@ -68,17 +68,13 @@ nouveau_screen_fence_ref(struct pipe_screen *pscreen, } static boolean -nouveau_screen_fence_signalled(struct pipe_screen *screen, - struct pipe_fence_handle *pfence) -{ - return nouveau_fence_signalled(nouveau_fence(pfence)); -} - -static boolean nouveau_screen_fence_finish(struct pipe_screen *screen, struct pipe_fence_handle *pfence, uint64_t timeout) { + if (!timeout) + return nouveau_fence_signalled(nouveau_fence(pfence)); + return nouveau_fence_wait(nouveau_fence(pfence)); } @@ -115,7 +111,7 @@ nouveau_screen_bo_from_handle(struct pipe_screen *pscreen, } -boolean +bool nouveau_screen_bo_get_handle(struct pipe_screen *pscreen, struct nouveau_bo *bo, unsigned stride, @@ -127,11 +123,11 @@ nouveau_screen_bo_get_handle(struct pipe_screen *pscreen, return nouveau_bo_name_get(bo, &whandle->handle) == 0; } else if (whandle->type == DRM_API_HANDLE_TYPE_KMS) { whandle->handle = bo->handle; - return TRUE; + return true; } else if (whandle->type == DRM_API_HANDLE_TYPE_FD) { return nouveau_bo_set_prime(bo, (int *)&whandle->handle) == 0; } else { - return FALSE; + return false; } } @@ -203,7 +199,6 @@ nouveau_screen_init(struct nouveau_screen *screen, struct nouveau_device *dev) pscreen->get_timestamp = nouveau_screen_get_timestamp; pscreen->fence_reference = nouveau_screen_fence_ref; - pscreen->fence_signalled = nouveau_screen_fence_signalled; pscreen->fence_finish = nouveau_screen_fence_finish; util_format_s3tc_init(); @@ -214,7 +209,8 @@ nouveau_screen_init(struct nouveau_screen *screen, struct nouveau_device *dev) PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT | PIPE_BIND_CURSOR | PIPE_BIND_SAMPLER_VIEW | - PIPE_BIND_SHADER_RESOURCE | PIPE_BIND_COMPUTE_RESOURCE | + PIPE_BIND_SHADER_BUFFER | PIPE_BIND_SHADER_IMAGE | + PIPE_BIND_COMPUTE_RESOURCE | PIPE_BIND_GLOBAL; screen->sysmem_bindings = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_STREAM_OUTPUT | diff --git a/src/gallium/drivers/nouveau/nouveau_screen.h b/src/gallium/drivers/nouveau/nouveau_screen.h index 30041b271c9..4fdde9fbf3d 100644 --- a/src/gallium/drivers/nouveau/nouveau_screen.h +++ b/src/gallium/drivers/nouveau/nouveau_screen.h @@ -49,7 +49,7 @@ struct nouveau_screen { int64_t cpu_gpu_time_delta; - boolean hint_buf_keep_sysmem_copy; + bool hint_buf_keep_sysmem_copy; unsigned vram_domain; @@ -112,15 
+112,15 @@ struct nouveau_screen { # define NOUVEAU_DRV_STAT_IFD(x) #endif -static INLINE struct nouveau_screen * +static inline struct nouveau_screen * nouveau_screen(struct pipe_screen *pscreen) { return (struct nouveau_screen *)pscreen; } -boolean nouveau_drm_screen_unref(struct nouveau_screen *screen); +bool nouveau_drm_screen_unref(struct nouveau_screen *screen); -boolean +bool nouveau_screen_bo_get_handle(struct pipe_screen *pscreen, struct nouveau_bo *bo, unsigned stride, diff --git a/src/gallium/drivers/nouveau/nouveau_statebuf.h b/src/gallium/drivers/nouveau/nouveau_statebuf.h index 4f8bd7bdf16..f38014091ba 100644 --- a/src/gallium/drivers/nouveau/nouveau_statebuf.h +++ b/src/gallium/drivers/nouveau/nouveau_statebuf.h @@ -20,7 +20,7 @@ struct nouveau_statebuf_builder #define sb_data(sb, v) *(sb).p++ = (v) #endif -static INLINE uint32_t sb_header(unsigned subc, unsigned mthd, unsigned size) +static inline uint32_t sb_header(unsigned subc, unsigned mthd, unsigned size) { return (size << 18) | (subc << 13) | mthd; } diff --git a/src/gallium/drivers/nouveau/nouveau_video.c b/src/gallium/drivers/nouveau/nouveau_video.c index d6330fa63a8..e414a534418 100644 --- a/src/gallium/drivers/nouveau/nouveau_video.c +++ b/src/gallium/drivers/nouveau/nouveau_video.c @@ -100,7 +100,7 @@ nouveau_vpe_fini(struct nouveau_decoder *dec) { dec->current = dec->future = dec->past = 8; } -static INLINE void +static inline void nouveau_vpe_mb_dct_blocks(struct nouveau_decoder *dec, const struct pipe_mpeg12_macroblock *mb) { int cbb; @@ -125,7 +125,7 @@ nouveau_vpe_mb_dct_blocks(struct nouveau_decoder *dec, const struct pipe_mpeg12_ } } -static INLINE void +static inline void nouveau_vpe_mb_data_blocks(struct nouveau_decoder *dec, const struct pipe_mpeg12_macroblock *mb) { int cbb; @@ -143,7 +143,7 @@ nouveau_vpe_mb_data_blocks(struct nouveau_decoder *dec, const struct pipe_mpeg12 } } -static INLINE void +static inline void nouveau_vpe_mb_dct_header(struct nouveau_decoder *dec, const struct pipe_mpeg12_macroblock *mb, bool luma) @@ -187,7 +187,7 @@ nouveau_vpe_mb_dct_header(struct nouveau_decoder *dec, x | (y << NV17_MPEG_CMD_MB_COORDS_Y__SHIFT)); } -static INLINE unsigned int +static inline unsigned int nouveau_vpe_mb_mv_flags(bool luma, int mv_h, int mv_v, bool forward, bool first, bool vert) { unsigned mc_header = 0; @@ -228,7 +228,7 @@ static int div_up(int val, int mult) { return val / mult; } -static INLINE void +static inline void nouveau_vpe_mb_mv(struct nouveau_decoder *dec, unsigned mc_header, bool luma, bool frame, bool forward, bool vert, int x, int y, const short motions[2], @@ -296,16 +296,16 @@ nouveau_vpe_mb_mv_header(struct nouveau_decoder *dec, case PIPE_MPEG12_MO_TYPE_DUAL_PRIME: { base = NV17_MPEG_CMD_CHROMA_MV_HEADER_COUNT_2; if (forward) { - nouveau_vpe_mb_mv(dec, base, luma, frame, TRUE, FALSE, - x, y, mb->PMV[0][0], dec->past, TRUE); - nouveau_vpe_mb_mv(dec, base, luma, frame, TRUE, TRUE, - x, y2, mb->PMV[0][0], dec->past, FALSE); + nouveau_vpe_mb_mv(dec, base, luma, frame, true, false, + x, y, mb->PMV[0][0], dec->past, true); + nouveau_vpe_mb_mv(dec, base, luma, frame, true, true, + x, y2, mb->PMV[0][0], dec->past, false); } if (backward && forward) { - nouveau_vpe_mb_mv(dec, base, luma, frame, !forward, TRUE, - x, y, mb->PMV[1][0], dec->future, TRUE); - nouveau_vpe_mb_mv(dec, base, luma, frame, !forward, FALSE, - x, y2, mb->PMV[1][1], dec->future, FALSE); + nouveau_vpe_mb_mv(dec, base, luma, frame, !forward, true, + x, y, mb->PMV[1][0], dec->future, true); + nouveau_vpe_mb_mv(dec, 
base, luma, frame, !forward, false, + x, y2, mb->PMV[1][1], dec->future, false); } else assert(!backward); break; } @@ -320,13 +320,13 @@ nouveau_vpe_mb_mv_header(struct nouveau_decoder *dec, if (frame) base |= NV17_MPEG_CMD_CHROMA_MV_HEADER_TYPE_FRAME; if (forward) - nouveau_vpe_mb_mv(dec, base, luma, frame, TRUE, + nouveau_vpe_mb_mv(dec, base, luma, frame, true, dec->picture_structure != PIPE_MPEG12_PICTURE_STRUCTURE_FIELD_TOP, - x, y, mb->PMV[0][0], dec->past, TRUE); + x, y, mb->PMV[0][0], dec->past, true); if (backward && forward) - nouveau_vpe_mb_mv(dec, base, luma, frame, FALSE, + nouveau_vpe_mb_mv(dec, base, luma, frame, false, dec->picture_structure == PIPE_MPEG12_PICTURE_STRUCTURE_FIELD_TOP, - x, y, mb->PMV[0][1], dec->future, TRUE); + x, y, mb->PMV[0][1], dec->future, true); else assert(!backward); break; } @@ -341,11 +341,11 @@ mv1: base |= NV17_MPEG_CMD_CHROMA_MV_HEADER_TYPE_FRAME; /* frame 16x16 */ if (forward) - nouveau_vpe_mb_mv(dec, base, luma, frame, TRUE, FALSE, - x, y, mb->PMV[0][0], dec->past, TRUE); + nouveau_vpe_mb_mv(dec, base, luma, frame, true, false, + x, y, mb->PMV[0][0], dec->past, true); if (backward) - nouveau_vpe_mb_mv(dec, base, luma, frame, !forward, FALSE, - x, y, mb->PMV[0][1], dec->future, TRUE); + nouveau_vpe_mb_mv(dec, base, luma, frame, !forward, false, + x, y, mb->PMV[0][1], dec->future, true); return; mv2: @@ -353,20 +353,20 @@ mv2: if (!frame) base |= NV17_MPEG_CMD_CHROMA_MV_HEADER_MV_SPLIT_HALF_MB; if (forward) { - nouveau_vpe_mb_mv(dec, base, luma, frame, TRUE, + nouveau_vpe_mb_mv(dec, base, luma, frame, true, mb->motion_vertical_field_select & PIPE_MPEG12_FS_FIRST_FORWARD, - x, y, mb->PMV[0][0], dec->past, TRUE); - nouveau_vpe_mb_mv(dec, base, luma, frame, TRUE, + x, y, mb->PMV[0][0], dec->past, true); + nouveau_vpe_mb_mv(dec, base, luma, frame, true, mb->motion_vertical_field_select & PIPE_MPEG12_FS_SECOND_FORWARD, - x, y2, mb->PMV[1][0], dec->past, FALSE); + x, y2, mb->PMV[1][0], dec->past, false); } if (backward) { nouveau_vpe_mb_mv(dec, base, luma, frame, !forward, mb->motion_vertical_field_select & PIPE_MPEG12_FS_FIRST_BACKWARD, - x, y, mb->PMV[0][1], dec->future, TRUE); + x, y, mb->PMV[0][1], dec->future, true); nouveau_vpe_mb_mv(dec, base, luma, frame, !forward, mb->motion_vertical_field_select & PIPE_MPEG12_FS_SECOND_BACKWARD, - x, y2, mb->PMV[1][1], dec->future, FALSE); + x, y2, mb->PMV[1][1], dec->future, false); } } @@ -438,14 +438,14 @@ nouveau_decoder_decode_macroblock(struct pipe_video_codec *decoder, mb = (const struct pipe_mpeg12_macroblock *)pipe_mb; for (i = 0; i < num_macroblocks; ++i, mb++) { if (mb->macroblock_type & PIPE_MPEG12_MB_TYPE_INTRA) { - nouveau_vpe_mb_dct_header(dec, mb, TRUE); - nouveau_vpe_mb_dct_header(dec, mb, FALSE); + nouveau_vpe_mb_dct_header(dec, mb, true); + nouveau_vpe_mb_dct_header(dec, mb, false); } else { - nouveau_vpe_mb_mv_header(dec, mb, TRUE); - nouveau_vpe_mb_dct_header(dec, mb, TRUE); + nouveau_vpe_mb_mv_header(dec, mb, true); + nouveau_vpe_mb_dct_header(dec, mb, true); - nouveau_vpe_mb_mv_header(dec, mb, FALSE); - nouveau_vpe_mb_dct_header(dec, mb, FALSE); + nouveau_vpe_mb_mv_header(dec, mb, false); + nouveau_vpe_mb_dct_header(dec, mb, false); } if (dec->base.entrypoint <= PIPE_VIDEO_ENTRYPOINT_IDCT) nouveau_vpe_mb_dct_blocks(dec, mb); diff --git a/src/gallium/drivers/nouveau/nouveau_video.h b/src/gallium/drivers/nouveau/nouveau_video.h index 08d48b371fd..fd1bd527deb 100644 --- a/src/gallium/drivers/nouveau/nouveau_video.h +++ b/src/gallium/drivers/nouveau/nouveau_video.h @@ -45,7 +45,7 @@ 
struct nouveau_decoder { #define NV31_VIDEO_BIND_CMD NV31_MPEG_IMAGE_Y_OFFSET__LEN #define NV31_VIDEO_BIND_COUNT (NV31_MPEG_IMAGE_Y_OFFSET__LEN + 1) -static INLINE void +static inline void nouveau_vpe_write(struct nouveau_decoder *dec, unsigned data) { dec->cmds[dec->ofs++] = data; } @@ -54,33 +54,33 @@ nouveau_vpe_write(struct nouveau_decoder *dec, unsigned data) { #define NV31_MPEG(mthd) SUBC_MPEG(NV31_MPEG_##mthd) #define NV84_MPEG(mthd) SUBC_MPEG(NV84_MPEG_##mthd) -static INLINE uint32_t +static inline uint32_t NV04_FIFO_PKHDR(int subc, int mthd, unsigned size) { return 0x00000000 | (size << 18) | (subc << 13) | mthd; } -static INLINE uint32_t +static inline uint32_t NV04_FIFO_PKHDR_NI(int subc, int mthd, unsigned size) { return 0x40000000 | (size << 18) | (subc << 13) | mthd; } -static INLINE void +static inline void BEGIN_NV04(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size) { PUSH_SPACE(push, size + 1); PUSH_DATA (push, NV04_FIFO_PKHDR(subc, mthd, size)); } -static INLINE void +static inline void BEGIN_NI04(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size) { PUSH_SPACE(push, size + 1); PUSH_DATA (push, NV04_FIFO_PKHDR_NI(subc, mthd, size)); } -static INLINE void +static inline void PUSH_MTHDl(struct nouveau_pushbuf *push, int subc, int mthd, struct nouveau_bo *bo, uint32_t offset, struct nouveau_bufctx *ctx, int bin, uint32_t rw) diff --git a/src/gallium/drivers/nouveau/nouveau_vp3_video.h b/src/gallium/drivers/nouveau/nouveau_vp3_video.h index 279a1ce18ef..33e3bef3df3 100644 --- a/src/gallium/drivers/nouveau/nouveau_vp3_video.h +++ b/src/gallium/drivers/nouveau/nouveau_vp3_video.h @@ -135,22 +135,22 @@ struct comm { uint32_t parse_endpos[0x10]; // 1c0 }; -static INLINE uint32_t nouveau_vp3_video_align(uint32_t h) +static inline uint32_t nouveau_vp3_video_align(uint32_t h) { return ((h+0x3f)&~0x3f); }; -static INLINE uint32_t mb(uint32_t coord) +static inline uint32_t mb(uint32_t coord) { return (coord + 0xf)>>4; } -static INLINE uint32_t mb_half(uint32_t coord) +static inline uint32_t mb_half(uint32_t coord) { return (coord + 0x1f)>>5; } -static INLINE uint64_t +static inline uint64_t nouveau_vp3_video_addr(struct nouveau_vp3_decoder *dec, struct nouveau_vp3_video_buffer *target) { uint64_t ret; @@ -161,7 +161,7 @@ nouveau_vp3_video_addr(struct nouveau_vp3_decoder *dec, struct nouveau_vp3_video return dec->ref_bo->offset + ret; } -static INLINE void +static inline void nouveau_vp3_ycbcr_offsets(struct nouveau_vp3_decoder *dec, uint32_t *y2, uint32_t *cbcr, uint32_t *cbcr2) { @@ -182,7 +182,7 @@ nouveau_vp3_ycbcr_offsets(struct nouveau_vp3_decoder *dec, uint32_t *y2, } } -static INLINE void +static inline void nouveau_vp3_inter_sizes(struct nouveau_vp3_decoder *dec, uint32_t slice_count, uint32_t *slice_size, uint32_t *bucket_size, uint32_t *ring_size) diff --git a/src/gallium/drivers/nouveau/nouveau_winsys.h b/src/gallium/drivers/nouveau/nouveau_winsys.h index 51effb1d8d2..389a229eb78 100644 --- a/src/gallium/drivers/nouveau/nouveau_winsys.h +++ b/src/gallium/drivers/nouveau/nouveau_winsys.h @@ -15,34 +15,34 @@ #define NOUVEAU_MIN_BUFFER_MAP_ALIGN 64 #define NOUVEAU_MIN_BUFFER_MAP_ALIGN_MASK (NOUVEAU_MIN_BUFFER_MAP_ALIGN - 1) -static INLINE uint32_t +static inline uint32_t PUSH_AVAIL(struct nouveau_pushbuf *push) { return push->end - push->cur; } -static INLINE boolean +static inline bool PUSH_SPACE(struct nouveau_pushbuf *push, uint32_t size) { if (PUSH_AVAIL(push) < size) return nouveau_pushbuf_space(push, size, 0, 0) == 0; - return TRUE; + return 
true; } -static INLINE void +static inline void PUSH_DATA(struct nouveau_pushbuf *push, uint32_t data) { *push->cur++ = data; } -static INLINE void +static inline void PUSH_DATAp(struct nouveau_pushbuf *push, const void *data, uint32_t size) { memcpy(push->cur, data, size * 4); push->cur += size; } -static INLINE void +static inline void PUSH_DATAf(struct nouveau_pushbuf *push, float f) { union { float f; uint32_t i; } u; @@ -50,7 +50,7 @@ PUSH_DATAf(struct nouveau_pushbuf *push, float f) PUSH_DATA(push, u.i); } -static INLINE void +static inline void PUSH_KICK(struct nouveau_pushbuf *push) { nouveau_pushbuf_kick(push, push->channel); @@ -60,7 +60,7 @@ PUSH_KICK(struct nouveau_pushbuf *push) #define NOUVEAU_RESOURCE_FLAG_LINEAR (PIPE_RESOURCE_FLAG_DRV_PRIV << 0) #define NOUVEAU_RESOURCE_FLAG_DRV_PRIV (PIPE_RESOURCE_FLAG_DRV_PRIV << 1) -static INLINE uint32_t +static inline uint32_t nouveau_screen_transfer_flags(unsigned pipe) { uint32_t flags = 0; diff --git a/src/gallium/drivers/nouveau/nv30/nv30-40_3d.xml.h b/src/gallium/drivers/nouveau/nv30/nv30-40_3d.xml.h index 447f4b3b7ae..95468e580dd 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30-40_3d.xml.h +++ b/src/gallium/drivers/nouveau/nv30/nv30-40_3d.xml.h @@ -1459,6 +1459,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define NV40_3D_VTX_CACHE_INVALIDATE 0x00001714 +#define NV40_3D_VB_ELEMENT_BASE 0x0000173c + #define NV30_3D_VTXFMT(i0) (0x00001740 + 0x4*(i0)) #define NV30_3D_VTXFMT__ESIZE 0x00000004 #define NV30_3D_VTXFMT__LEN 0x00000010 diff --git a/src/gallium/drivers/nouveau/nv30/nv30_clear.c b/src/gallium/drivers/nouveau/nv30/nv30_clear.c index 83fd1fa38dd..118cac77277 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_clear.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_clear.c @@ -32,7 +32,7 @@ #include "nv30/nv30_context.h" #include "nv30/nv30_format.h" -static INLINE uint32_t +static inline uint32_t pack_rgba(enum pipe_format format, const float *rgba) { union util_color uc; @@ -40,7 +40,7 @@ pack_rgba(enum pipe_format format, const float *rgba) return uc.ui[0]; } -static INLINE uint32_t +static inline uint32_t pack_zeta(enum pipe_format format, double depth, unsigned stencil) { uint32_t zuint = (uint32_t)(depth * 4294967295.0); @@ -58,7 +58,7 @@ nv30_clear(struct pipe_context *pipe, unsigned buffers, struct pipe_framebuffer_state *fb = &nv30->framebuffer; uint32_t colr = 0, zeta = 0, mode = 0; - if (!nv30_state_validate(nv30, NV30_NEW_FRAMEBUFFER | NV30_NEW_SCISSOR, TRUE)) + if (!nv30_state_validate(nv30, NV30_NEW_FRAMEBUFFER | NV30_NEW_SCISSOR, true)) return; if (buffers & PIPE_CLEAR_COLOR && fb->nr_cbufs) { diff --git a/src/gallium/drivers/nouveau/nv30/nv30_context.c b/src/gallium/drivers/nouveau/nv30/nv30_context.c index 617b0887810..6e88ed725d6 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_context.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_context.c @@ -45,7 +45,7 @@ nv30_context_kick_notify(struct nouveau_pushbuf *push) screen = &nv30->screen->base; nouveau_fence_next(screen); - nouveau_fence_update(screen, TRUE); + nouveau_fence_update(screen, true); if (push->bufctx) { struct nouveau_bufref *bref; @@ -165,6 +165,12 @@ nv30_context_destroy(struct pipe_context *pipe) if (nv30->draw) draw_destroy(nv30->draw); + if (nv30->blit_vp) + nouveau_heap_free(&nv30->blit_vp); + + if (nv30->blit_fp) + pipe_resource_reference(&nv30->blit_fp, NULL); + if (nv30->screen->base.pushbuf->user_priv == &nv30->bufctx) nv30->screen->base.pushbuf->user_priv = NULL; @@ -233,7 +239,7 @@ nv30_context_create(struct 
pipe_screen *pscreen, void *priv) nv30->config.aniso = NV40_3D_TEX_WRAP_ANISO_MIP_FILTER_OPTIMIZATION_OFF; - if (debug_get_bool_option("NV30_SWTNL", FALSE)) + if (debug_get_bool_option("NV30_SWTNL", false)) nv30->draw_flags |= NV30_NEW_SWTNL; nv30->sample_mask = 0xffff; diff --git a/src/gallium/drivers/nouveau/nv30/nv30_context.h b/src/gallium/drivers/nouveau/nv30/nv30_context.h index 592cdbe24f9..d5c18bb62dc 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_context.h +++ b/src/gallium/drivers/nouveau/nv30/nv30_context.h @@ -51,7 +51,8 @@ struct nv30_context { unsigned rt_enable; unsigned scissor_off; unsigned num_vtxelts; - boolean prim_restart; + int index_bias; + bool prim_restart; struct nv30_fragprog *fragprog; } state; @@ -114,17 +115,17 @@ struct nv30_context { uint32_t vbo_user; unsigned vbo_min_index; unsigned vbo_max_index; - boolean vbo_push_hint; + bool vbo_push_hint; struct nouveau_heap *blit_vp; struct pipe_resource *blit_fp; struct pipe_query *render_cond_query; unsigned render_cond_mode; - boolean render_cond_cond; + bool render_cond_cond; }; -static INLINE struct nv30_context * +static inline struct nv30_context * nv30_context(struct pipe_context *pipe) { return (struct nv30_context *)pipe; @@ -203,8 +204,8 @@ nv30_draw_init(struct pipe_context *pipe); void nv30_render_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info); -boolean -nv30_state_validate(struct nv30_context *nv30, uint32_t mask, boolean hwtnl); +bool +nv30_state_validate(struct nv30_context *nv30, uint32_t mask, bool hwtnl); void nv30_state_release(struct nv30_context *nv30); @@ -213,7 +214,7 @@ nv30_state_release(struct nv30_context *nv30); #define NV30_PRIM_GL_CASE(n) \ case PIPE_PRIM_##n: return NV30_3D_VERTEX_BEGIN_END_##n -static INLINE unsigned +static inline unsigned nv30_prim_gl(unsigned prim) { switch (prim) { diff --git a/src/gallium/drivers/nouveau/nv30/nv30_draw.c b/src/gallium/drivers/nouveau/nv30/nv30_draw.c index c1665b7ad2f..098d6e499fa 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_draw.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_draw.c @@ -52,7 +52,7 @@ struct nv30_render { uint32_t prim; }; -static INLINE struct nv30_render * +static inline struct nv30_render * nv30_render(struct vbuf_render *render) { return (struct nv30_render *)render; @@ -79,12 +79,12 @@ nv30_render_allocate_vertices(struct vbuf_render *render, PIPE_BIND_VERTEX_BUFFER, PIPE_USAGE_STREAM, render->max_vertex_buffer_bytes); if (!r->buffer) - return FALSE; + return false; r->offset = 0; } - return TRUE; + return true; } static void * @@ -134,7 +134,7 @@ nv30_render_draw_elements(struct vbuf_render *render, NOUVEAU_BO_LOW | NOUVEAU_BO_RD, 0, NV30_3D_VTXBUF_DMA1); } - if (!nv30_state_validate(nv30, ~0, FALSE)) + if (!nv30_state_validate(nv30, ~0, false)) return; BEGIN_NV04(push, NV30_3D(VERTEX_BEGIN_END), 1); @@ -179,7 +179,7 @@ nv30_render_draw_arrays(struct vbuf_render *render, unsigned start, uint nr) NOUVEAU_BO_LOW | NOUVEAU_BO_RD, 0, NV30_3D_VTXBUF_DMA1); } - if (!nv30_state_validate(nv30, ~0, FALSE)) + if (!nv30_state_validate(nv30, ~0, false)) return; BEGIN_NV04(push, NV30_3D(VERTEX_BEGIN_END), 1); @@ -221,7 +221,7 @@ static const struct { [TGSI_SEMANTIC_TEXCOORD] = { EMIT_4F, INTERP_PERSPECTIVE, 8, 7, 0x00004000 }, }; -static boolean +static bool vroute_add(struct nv30_render *r, uint attrib, uint sem, uint *idx) { struct nv30_screen *screen = r->nv30->screen; @@ -245,7 +245,7 @@ vroute_add(struct nv30_render *r, uint attrib, uint sem, uint *idx) } if (emit == EMIT_OMIT) - return FALSE; + return 
false; draw_emit_vertex_attr(vinfo, emit, vroute[sem].interp, attrib); format = draw_translate_vinfo_format(emit); @@ -272,10 +272,10 @@ vroute_add(struct nv30_render *r, uint attrib, uint sem, uint *idx) assert(sem == TGSI_SEMANTIC_TEXCOORD); *idx = 0x00001000 << (result - 8); } - return TRUE; + return true; } -static boolean +static bool nv30_render_validate(struct nv30_context *nv30) { struct nv30_render *r = nv30_render(nv30->draw->render); @@ -300,7 +300,7 @@ nv30_render_validate(struct nv30_context *nv30) } if (nouveau_heap_alloc(heap, 16, &r->vertprog, &r->vertprog)) - return FALSE; + return false; } } @@ -370,7 +370,7 @@ nv30_render_validate(struct nv30_context *nv30) } vinfo->size /= 4; - return TRUE; + return true; } void @@ -519,6 +519,6 @@ nv30_draw_init(struct pipe_context *pipe) draw_set_rasterize_stage(draw, stage); draw_wide_line_threshold(draw, 10000000.f); draw_wide_point_threshold(draw, 10000000.f); - draw_wide_point_sprites(draw, TRUE); + draw_wide_point_sprites(draw, true); nv30->draw = draw; } diff --git a/src/gallium/drivers/nouveau/nv30/nv30_format.h b/src/gallium/drivers/nouveau/nv30/nv30_format.h index 8bf4a37299f..fa1e922fb65 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_format.h +++ b/src/gallium/drivers/nouveau/nv30/nv30_format.h @@ -27,28 +27,28 @@ struct nv30_texfmt { }; extern const struct nv30_format_info nv30_format_info_table[]; -static INLINE const struct nv30_format_info * +static inline const struct nv30_format_info * nv30_format_info(struct pipe_screen *pscreen, enum pipe_format format) { return &nv30_format_info_table[format]; } extern const struct nv30_format nv30_format_table[]; -static INLINE const struct nv30_format * +static inline const struct nv30_format * nv30_format(struct pipe_screen *pscreen, enum pipe_format format) { return &nv30_format_table[format]; } extern const struct nv30_vtxfmt nv30_vtxfmt_table[]; -static INLINE const struct nv30_vtxfmt * +static inline const struct nv30_vtxfmt * nv30_vtxfmt(struct pipe_screen *pscreen, enum pipe_format format) { return &nv30_vtxfmt_table[format]; } extern const struct nv30_texfmt nv30_texfmt_table[]; -static INLINE const struct nv30_texfmt * +static inline const struct nv30_texfmt * nv30_texfmt(struct pipe_screen *pscreen, enum pipe_format format) { return &nv30_texfmt_table[format]; diff --git a/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c b/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c index 7f227868f73..6de61bcc1c0 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c @@ -37,22 +37,26 @@ nv30_fragprog_upload(struct nv30_context *nv30) struct nouveau_context *nv = &nv30->base; struct nv30_fragprog *fp = nv30->fragprog.program; struct pipe_context *pipe = &nv30->base.pipe; - struct pipe_transfer *transfer; - uint32_t *map; - int i; (void)i; - if (unlikely(!fp->buffer)) { + if (unlikely(!fp->buffer)) fp->buffer = pipe_buffer_create(pipe->screen, 0, 0, fp->insn_len * 4); - } - map = pipe_buffer_map(pipe, fp->buffer, PIPE_TRANSFER_WRITE, &transfer); #ifndef PIPE_ARCH_BIG_ENDIAN - memcpy(map, fp->insn, fp->insn_len * 4); + pipe_buffer_write(pipe, fp->buffer, 0, fp->insn_len * 4, fp->insn); #else - for (i = 0; i < fp->insn_len; i++) - *map++ = (fp->insn[i] >> 16) | (fp->insn[i] << 16); + { + struct pipe_transfer *transfer; + uint32_t *map; + int i; + + map = pipe_buffer_map(pipe, fp->buffer, + PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE, + &transfer); + for (i = 0; i < fp->insn_len; i++) + *map++ = (fp->insn[i] 
>> 16) | (fp->insn[i] << 16); + pipe_buffer_unmap(pipe, transfer); + } #endif - pipe_buffer_unmap(pipe, transfer); if (nv04_resource(fp->buffer)->domain != NOUVEAU_BO_VRAM) nouveau_buffer_migrate(nv, nv04_resource(fp->buffer), NOUVEAU_BO_VRAM); @@ -64,7 +68,7 @@ nv30_fragprog_validate(struct nv30_context *nv30) struct nouveau_pushbuf *push = nv30->base.pushbuf; struct nouveau_object *eng3d = nv30->screen->eng3d; struct nv30_fragprog *fp = nv30->fragprog.program; - boolean upload = FALSE; + bool upload = false; int i; if (!fp->translated) { @@ -72,7 +76,7 @@ if (!fp->translated) return; - upload = TRUE; + upload = true; } /* update constants, also needs to be done on every fp switch as we @@ -89,7 +93,7 @@ if (!memcmp(&fp->insn[off], &cbuf[idx], 4 * 4)) continue; memcpy(&fp->insn[off], &cbuf[idx], 4 * 4); - upload = TRUE; + upload = true; } } @@ -161,8 +165,15 @@ static void nv30_fp_state_bind(struct pipe_context *pipe, void *hwcso) { struct nv30_context *nv30 = nv30_context(pipe); + struct nv30_fragprog *fp = hwcso; + + /* reset the bufctx so that we don't keep a dangling reference to the fp + * code + */ + if (fp != nv30->state.fragprog) + PUSH_RESET(nv30->base.pushbuf, BUFCTX_FRAGPROG); - nv30->fragprog.program = hwcso; + nv30->fragprog.program = fp; nv30->dirty |= NV30_NEW_FRAGPROG; } diff --git a/src/gallium/drivers/nouveau/nv30/nv30_miptree.c b/src/gallium/drivers/nouveau/nv30/nv30_miptree.c index 1a4b8929c0f..c75b4b95fd8 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_miptree.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_miptree.c @@ -33,7 +33,7 @@ #include "nv30/nv30_resource.h" #include "nv30/nv30_transfer.h" -static INLINE unsigned +static inline unsigned layer_offset(struct pipe_resource *pt, unsigned level, unsigned layer) { struct nv30_miptree *mt = nv30_miptree(pt); @@ -54,7 +54,7 @@ nv30_miptree_get_handle(struct pipe_screen *pscreen, unsigned stride; if (!mt || !mt->base.bo) - return FALSE; + return false; stride = mt->level[0].pitch; @@ -78,13 +78,13 @@ struct nv30_transfer { unsigned nblocksy; }; -static INLINE struct nv30_transfer * +static inline struct nv30_transfer * nv30_transfer(struct pipe_transfer *ptx) { return (struct nv30_transfer *)ptx; } -static INLINE void +static inline void define_rect(struct pipe_resource *pt, unsigned level, unsigned z, unsigned x, unsigned y, unsigned w, unsigned h, struct nv30_rect *rect) @@ -242,8 +242,8 @@ nv30_miptree_transfer_map(struct pipe_context *pipe, struct pipe_resource *pt, tx->base.level = level; tx->base.usage = usage; tx->base.box = *box; - tx->base.stride = util_format_get_nblocksx(pt->format, box->width) * - util_format_get_blocksize(pt->format); + tx->base.stride = align(util_format_get_nblocksx(pt->format, box->width) * + util_format_get_blocksize(pt->format), 64); tx->base.layer_stride = util_format_get_nblocksy(pt->format, box->height) * tx->base.stride; @@ -372,7 +372,7 @@ nv30_miptree_create(struct pipe_screen *pscreen, } if (!mt->uniform_pitch) - mt->swizzled = TRUE; + mt->swizzled = true; size = 0; for (l = 0; l <= pt->last_level; l++) { diff --git a/src/gallium/drivers/nouveau/nv30/nv30_push.c b/src/gallium/drivers/nouveau/nv30/nv30_push.c index e0734fa70d3..67ab0508c17 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_push.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_push.c @@ -47,12 +47,12 @@ struct push_context { struct translate *translate; - boolean primitive_restart; + bool primitive_restart; uint32_t 
prim; uint32_t restart_index; }; -static INLINE unsigned +static inline unsigned prim_restart_search_i08(uint8_t *elts, unsigned push, uint8_t index) { unsigned i; @@ -62,7 +62,7 @@ prim_restart_search_i08(uint8_t *elts, unsigned push, uint8_t index) return i; } -static INLINE unsigned +static inline unsigned prim_restart_search_i16(uint16_t *elts, unsigned push, uint16_t index) { unsigned i; @@ -72,7 +72,7 @@ prim_restart_search_i16(uint16_t *elts, unsigned push, uint16_t index) return i; } -static INLINE unsigned +static inline unsigned prim_restart_search_i32(uint32_t *elts, unsigned push, uint32_t index) { unsigned i; @@ -199,7 +199,7 @@ nv30_push_vbo(struct nv30_context *nv30, const struct pipe_draw_info *info) { struct push_context ctx; unsigned i, index_size; - boolean apply_bias = info->indexed && info->index_bias; + bool apply_bias = info->indexed && info->index_bias; ctx.push = nv30->base.pushbuf; ctx.translate = nv30->vertex->translate; @@ -241,7 +241,7 @@ nv30_push_vbo(struct nv30_context *nv30, const struct pipe_draw_info *info) } else { ctx.idxbuf = NULL; index_size = 0; - ctx.primitive_restart = FALSE; + ctx.primitive_restart = false; ctx.restart_index = 0; } diff --git a/src/gallium/drivers/nouveau/nv30/nv30_query.c b/src/gallium/drivers/nouveau/nv30/nv30_query.c index 516ee83168e..3980be9579a 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_query.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_query.c @@ -98,7 +98,7 @@ struct nv30_query { uint64_t result; }; -static INLINE struct nv30_query * +static inline struct nv30_query * nv30_query(struct pipe_query *pipe) { return (struct nv30_query *)pipe; @@ -208,7 +208,7 @@ nv30_query_result(struct pipe_context *pipe, struct pipe_query *pq, if (ntfy1) { while (ntfy1[3] & 0xff000000) { if (!wait) - return FALSE; + return false; } switch (q->type) { @@ -228,7 +228,7 @@ nv30_query_result(struct pipe_context *pipe, struct pipe_query *pq, } *res64 = q->result; - return TRUE; + return true; } static void diff --git a/src/gallium/drivers/nouveau/nv30/nv30_resource.c b/src/gallium/drivers/nouveau/nv30/nv30_resource.c index 38fac8af898..a98a6464de8 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_resource.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_resource.c @@ -42,12 +42,12 @@ nv30_memory_barrier(struct pipe_context *pipe, unsigned flags) if (!nv30->vtxbuf[i].buffer) continue; if (nv30->vtxbuf[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) - nv30->base.vbo_dirty = TRUE; + nv30->base.vbo_dirty = true; } if (nv30->idxbuf.buffer && nv30->idxbuf.buffer->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) - nv30->base.vbo_dirty = TRUE; + nv30->base.vbo_dirty = true; } } diff --git a/src/gallium/drivers/nouveau/nv30/nv30_resource.h b/src/gallium/drivers/nouveau/nv30/nv30_resource.h index 1981c8d9ab9..8dac7795c9d 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_resource.h +++ b/src/gallium/drivers/nouveau/nv30/nv30_resource.h @@ -15,7 +15,7 @@ struct nv30_surface { uint16_t depth; }; -static INLINE struct nv30_surface * +static inline struct nv30_surface * nv30_surface(struct pipe_surface *ps) { return (struct nv30_surface *)ps; @@ -32,13 +32,13 @@ struct nv30_miptree { struct nv30_miptree_level level[13]; uint32_t uniform_pitch; uint32_t layer_size; - boolean swizzled; + bool swizzled; unsigned ms_mode; unsigned ms_x:1; unsigned ms_y:1; }; -static INLINE struct nv30_miptree * +static inline struct nv30_miptree * nv30_miptree(struct pipe_resource *pt) { return (struct nv30_miptree *)pt; diff --git 
a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c index 2e38a1978ae..7aad26ba18b 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c @@ -69,6 +69,8 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) return PIPE_ENDIAN_LITTLE; case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT: return 16; + case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT: + return NOUVEAU_MIN_BUFFER_MAP_ALIGN; case PIPE_CAP_MAX_VIEWPORTS: return 1; case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE: @@ -96,6 +98,9 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY: case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: return 1; + /* nv35 capabilities */ + case PIPE_CAP_DEPTH_BOUNDS_TEST: + return eng3d->oclass == NV35_3D_CLASS || eng3d->oclass >= NV40_3D_CLASS; /* nv4x capabilities */ case PIPE_CAP_BLEND_EQUATION_SEPARATE: case PIPE_CAP_NPOT_TEXTURES: @@ -135,7 +140,6 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MIXED_COLORBUFFER_FORMATS: case PIPE_CAP_START_INSTANCE: case PIPE_CAP_TEXTURE_MULTISAMPLE: - case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT: case PIPE_CAP_TEXTURE_BUFFER_OBJECTS: case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT: case PIPE_CAP_QUERY_PIPELINE_STATISTICS: @@ -162,6 +166,9 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: + case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: + case PIPE_CAP_TEXTURE_FLOAT_LINEAR: + case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: return 0; case PIPE_CAP_VENDOR_ID: @@ -313,12 +320,12 @@ nv30_screen_is_format_supported(struct pipe_screen *pscreen, unsigned bindings) { if (sample_count > 4) - return FALSE; + return false; if (!(0x00000017 & (1 << sample_count))) - return FALSE; + return false; if (!util_format_is_supported(format, bindings)) { - return FALSE; + return false; } /* transfers & shared are always supported */ @@ -656,6 +663,6 @@ nv30_screen_create(struct nouveau_device *dev) nouveau_pushbuf_kick(push, push->channel); - nouveau_fence_new(&screen->base, &screen->base.fence.current, FALSE); + nouveau_fence_new(&screen->base, &screen->base.fence.current, false); return pscreen; } diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.h b/src/gallium/drivers/nouveau/nv30/nv30_screen.h index 3f2e47fec99..7b17b88097c 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_screen.h +++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.h @@ -40,7 +40,7 @@ struct nv30_screen { struct nouveau_heap *vp_data_heap; }; -static INLINE struct nv30_screen * +static inline struct nv30_screen * nv30_screen(struct pipe_screen *pscreen) { return (struct nv30_screen *)pscreen; diff --git a/src/gallium/drivers/nouveau/nv30/nv30_state.c b/src/gallium/drivers/nouveau/nv30/nv30_state.c index 708ba34c1e5..fd604c2266d 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_state.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_state.c @@ -211,6 +211,7 @@ static void * nv30_zsa_state_create(struct pipe_context *pipe, const struct pipe_depth_stencil_alpha_state *cso) { + struct nouveau_object *eng3d = nv30_context(pipe)->screen->eng3d; struct nv30_zsa_stateobj *so; so = CALLOC_STRUCT(nv30_zsa_stateobj); @@ -223,6 +224,13 @@ nv30_zsa_state_create(struct pipe_context *pipe, SB_DATA (so, cso->depth.writemask); SB_DATA (so, cso->depth.enabled); + if (eng3d->oclass == 
NV35_3D_CLASS || eng3d->oclass >= NV40_3D_CLASS) { + SB_MTHD35(so, DEPTH_BOUNDS_TEST_ENABLE, 3); + SB_DATA (so, cso->depth.bounds_test); + SB_DATA (so, fui(cso->depth.bounds_min)); + SB_DATA (so, fui(cso->depth.bounds_max)); + } + if (cso->stencil[0].enabled) { SB_MTHD30(so, STENCIL_ENABLE(0), 3); SB_DATA (so, 1); diff --git a/src/gallium/drivers/nouveau/nv30/nv30_state.h b/src/gallium/drivers/nouveau/nv30/nv30_state.h index e27e16fae82..ed3b8103a00 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_state.h +++ b/src/gallium/drivers/nouveau/nv30/nv30_state.h @@ -13,6 +13,8 @@ #define SB_DATA(so, u) (so)->data[(so)->size++] = (u) #define SB_MTHD30(so, mthd, size) \ SB_DATA((so), ((size) << 18) | (7 << 13) | NV30_3D_##mthd) +#define SB_MTHD35(so, mthd, size) \ + SB_DATA((so), ((size) << 18) | (7 << 13) | NV35_3D_##mthd) #define SB_MTHD40(so, mthd, size) \ SB_DATA((so), ((size) << 18) | (7 << 13) | NV40_3D_##mthd) @@ -30,7 +32,7 @@ struct nv30_rasterizer_stateobj { struct nv30_zsa_stateobj { struct pipe_depth_stencil_alpha_state pipe; - unsigned data[32]; + unsigned data[36]; unsigned size; }; @@ -80,7 +82,7 @@ struct nv30_vertprog { struct tgsi_shader_info info; struct draw_vertex_shader *draw; - boolean translated; + bool translated; unsigned enabled_ucps; uint16_t texcoord[10]; @@ -109,7 +111,7 @@ struct nv30_fragprog { struct tgsi_shader_info info; struct draw_fragment_shader *draw; - boolean translated; + bool translated; uint32_t *insn; unsigned insn_len; diff --git a/src/gallium/drivers/nouveau/nv30/nv30_state_validate.c b/src/gallium/drivers/nouveau/nv30/nv30_state_validate.c index a954dcce562..8957634f0fa 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_state_validate.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_state_validate.c @@ -453,8 +453,8 @@ nv30_state_context_switch(struct nv30_context *nv30) nv30->base.pushbuf->user_priv = &nv30->bufctx; } -boolean -nv30_state_validate(struct nv30_context *nv30, uint32_t mask, boolean hwtnl) +bool +nv30_state_validate(struct nv30_context *nv30, uint32_t mask, bool hwtnl) { struct nouveau_screen *screen = &nv30->screen->base; struct nouveau_pushbuf *push = nv30->base.pushbuf; @@ -494,7 +494,7 @@ nv30_state_validate(struct nv30_context *nv30, uint32_t mask, boolean hwtnl) nouveau_pushbuf_bufctx(push, bctx); if (nouveau_pushbuf_validate(push)) { nouveau_pushbuf_bufctx(push, NULL); - return FALSE; + return false; } /*XXX*/ @@ -528,7 +528,7 @@ nv30_state_validate(struct nv30_context *nv30, uint32_t mask, boolean hwtnl) } } - return TRUE; + return true; } void diff --git a/src/gallium/drivers/nouveau/nv30/nv30_texture.c b/src/gallium/drivers/nouveau/nv30/nv30_texture.c index c3567217442..bfe21cceaa2 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_texture.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_texture.c @@ -37,7 +37,7 @@ #define NV40_WRAP(n) \ case PIPE_TEX_WRAP_##n: ret = NV40_3D_TEX_WRAP_S_##n; break -static INLINE unsigned +static inline unsigned wrap_mode(unsigned pipe) { unsigned ret = NV30_3D_TEX_WRAP_S_REPEAT; @@ -58,7 +58,7 @@ wrap_mode(unsigned pipe) return ret >> NV30_3D_TEX_WRAP_S__SHIFT; } -static INLINE unsigned +static inline unsigned filter_mode(const struct pipe_sampler_state *cso) { unsigned filter; @@ -104,7 +104,7 @@ filter_mode(const struct pipe_sampler_state *cso) return filter; } -static INLINE unsigned +static inline unsigned compare_mode(const struct pipe_sampler_state *cso) { if (cso->compare_mode != PIPE_TEX_COMPARE_R_TO_TEXTURE) @@ -201,7 +201,7 @@ nv30_bind_sampler_states(struct pipe_context *pipe, } } 
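The SB_MTHD30/35/40 macros above all pack a FIFO method header the same way the driver does elsewhere: (size << 18) | (subc << 13) | mthd, with subchannel 7 hardcoded for the 3D engine. A small worked example; the method offset 0x0a00 is made up purely for illustration.

#include <assert.h>
#include <stdint.h>

/* Same bit layout as sb_header() and NV04_FIFO_PKHDR() in this patch:
 * payload size in bits 18+, subchannel in bits 13..17, method in the low bits. */
uint32_t
toy_pkhdr(unsigned subc, unsigned mthd, unsigned size)
{
   return (size << 18) | (subc << 13) | mthd;
}

int main(void)
{
   /* subchannel 7, hypothetical method 0x0a00, payload of 3 dwords */
   uint32_t hdr = toy_pkhdr(7, 0x0a00, 3);
   assert(hdr == ((3u << 18) | (7u << 13) | 0x0a00u));
   return 0;
}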
-static INLINE uint32_t +static inline uint32_t swizzle(const struct nv30_texfmt *fmt, unsigned cmp, unsigned swz) { uint32_t data = fmt->swz[swz].src << 8; diff --git a/src/gallium/drivers/nouveau/nv30/nv30_transfer.c b/src/gallium/drivers/nouveau/nv30/nv30_transfer.c index 99bc0994ac2..214da6568c3 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_transfer.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_transfer.c @@ -41,33 +41,33 @@ * of different ways. */ -static INLINE boolean +static inline bool nv30_transfer_scaled(struct nv30_rect *src, struct nv30_rect *dst) { if (src->x1 - src->x0 != dst->x1 - dst->x0) - return TRUE; + return true; if (src->y1 - src->y0 != dst->y1 - dst->y0) - return TRUE; - return FALSE; + return true; + return false; } -static INLINE boolean +static inline bool nv30_transfer_blit(XFER_ARGS) { if (nv30->screen->eng3d->oclass < NV40_3D_CLASS) - return FALSE; + return false; if (dst->offset & 63 || dst->pitch & 63 || dst->d > 1) - return FALSE; + return false; if (dst->w < 2 || dst->h < 2) - return FALSE; + return false; if (dst->cpp > 4 || (dst->cpp == 1 && !dst->pitch)) - return FALSE; + return false; if (src->cpp > 4) - return FALSE; - return TRUE; + return false; + return true; } -static INLINE struct nouveau_heap * +static inline struct nouveau_heap * nv30_transfer_rect_vertprog(struct nv30_context *nv30) { struct nouveau_heap *heap = nv30->screen->vp_exec_heap; @@ -108,7 +108,7 @@ nv30_transfer_rect_vertprog(struct nv30_context *nv30) } -static INLINE struct nv04_resource * +static inline struct nv04_resource * nv30_transfer_rect_fragprog(struct nv30_context *nv30) { struct nv04_resource *fp = nv04_resource(nv30->blit_fp); @@ -368,29 +368,29 @@ nv30_transfer_rect_blit(XFER_ARGS) PUSH_DATA (push, NV30_3D_VERTEX_BEGIN_END_STOP); } -static boolean +static bool nv30_transfer_sifm(XFER_ARGS) { if (!src->pitch || (src->w | src->h) > 1024 || src->w < 2 || src->h < 2) - return FALSE; + return false; if (src->d > 1 || dst->d > 1) - return FALSE; + return false; if (dst->offset & 63) - return FALSE; + return false; if (!dst->pitch) { if ((dst->w | dst->h) > 2048 || dst->w < 2 || dst->h < 2) - return FALSE; + return false; } else { if (dst->domain != NOUVEAU_BO_VRAM) - return FALSE; + return false; if (dst->pitch & 63) - return FALSE; + return false; } - return TRUE; + return true; } static void @@ -481,14 +481,14 @@ nv30_transfer_rect_sifm(XFER_ARGS) * that name is still accurate on nv4x) error. 
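nv30_transfer_rect() further down selects among these paths through a table of { name, possible(), execute() } entries, trying each predicate in order and running the first method whose constraints match. A condensed sketch of that dispatch pattern, with simplified stand-in types and a single CPU path; none of the toy_* names exist in the driver.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct toy_rect { unsigned pitch, w, h; };

typedef bool (*toy_possible_fn)(const struct toy_rect *, const struct toy_rect *);
typedef void (*toy_execute_fn)(const struct toy_rect *, const struct toy_rect *);

static bool
toy_cpu_possible(const struct toy_rect *src, const struct toy_rect *dst)
{
   return src->w == dst->w && src->h == dst->h; /* a plain copy cannot scale */
}

static void
toy_cpu_execute(const struct toy_rect *src, const struct toy_rect *dst)
{
   (void)src; (void)dst;
   puts("copied on the CPU");
}

static const struct {
   const char *name;
   toy_possible_fn possible;
   toy_execute_fn execute;
} toy_methods[] = {
   { "cpu", toy_cpu_possible, toy_cpu_execute },
};

int main(void)
{
   struct toy_rect src = { 256, 16, 16 }, dst = { 256, 16, 16 };
   for (size_t i = 0; i < sizeof(toy_methods) / sizeof(toy_methods[0]); i++) {
      if (toy_methods[i].possible(&src, &dst)) {
         toy_methods[i].execute(&src, &dst);
         break;
      }
   }
   return 0;
}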
*/ -static boolean +static bool nv30_transfer_m2mf(XFER_ARGS) { if (!src->pitch || !dst->pitch) - return FALSE; + return false; if (nv30_transfer_scaled(src, dst)) - return FALSE; - return TRUE; + return false; + return true; } static void @@ -540,12 +540,12 @@ nv30_transfer_rect_m2mf(XFER_ARGS) } } -static boolean +static bool nv30_transfer_cpu(XFER_ARGS) { if (nv30_transfer_scaled(src, dst)) - return FALSE; - return TRUE; + return false; + return true; } static char * @@ -554,7 +554,7 @@ linear_ptr(struct nv30_rect *rect, char *base, int x, int y, int z) return base + (y * rect->pitch) + (x * rect->cpp); } -static INLINE unsigned +static inline unsigned swizzle2d(unsigned v, unsigned s) { v = (v | (v << 8)) & 0x00ff00ff; @@ -614,7 +614,7 @@ swizzle3d_ptr(struct nv30_rect *rect, char *base, int x, int y, int z) typedef char *(*get_ptr_t)(struct nv30_rect *, char *, int, int, int); -static INLINE get_ptr_t +static inline get_ptr_t get_ptr(struct nv30_rect *rect) { if (rect->pitch) @@ -653,7 +653,7 @@ nv30_transfer_rect(struct nv30_context *nv30, enum nv30_transfer_filter filter, { static const struct { char *name; - boolean (*possible)(XFER_ARGS); + bool (*possible)(XFER_ARGS); void (*execute)(XFER_ARGS); } *method, methods[] = { { "m2mf", nv30_transfer_m2mf, nv30_transfer_rect_m2mf }, diff --git a/src/gallium/drivers/nouveau/nv30/nv30_vbo.c b/src/gallium/drivers/nouveau/nv30/nv30_vbo.c index d4e384b21d2..8494549e9b1 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_vbo.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_vbo.c @@ -79,7 +79,7 @@ nv30_emit_vtxattr(struct nv30_context *nv30, struct pipe_vertex_buffer *vb, } } -static INLINE void +static inline void nv30_vbuf_range(struct nv30_context *nv30, int vbi, uint32_t *base, uint32_t *size) { @@ -119,7 +119,7 @@ nv30_prevalidate_vbufs(struct nv30_context *nv30) } else { nouveau_buffer_migrate(&nv30->base, buf, NOUVEAU_BO_GART); } - nv30->base.vbo_dirty = TRUE; + nv30->base.vbo_dirty = true; } } } @@ -160,10 +160,10 @@ nv30_update_user_vbufs(struct nv30_context *nv30) NOUVEAU_BO_LOW | NOUVEAU_BO_RD, 0, NV30_3D_VTXBUF_DMA1); } - nv30->base.vbo_dirty = TRUE; + nv30->base.vbo_dirty = true; } -static INLINE void +static inline void nv30_release_user_vbufs(struct nv30_context *nv30) { uint32_t vbo_user = nv30->vbo_user; @@ -202,6 +202,9 @@ nv30_vbo_validate(struct nv30_context *nv30) return; redefine = MAX2(vertex->num_elements, nv30->state.num_vtxelts); + if (redefine == 0) + return; + BEGIN_NV04(push, NV30_3D(VTXFMT(0)), redefine); for (i = 0; i < vertex->num_elements; i++) { @@ -221,7 +224,7 @@ nv30_vbo_validate(struct nv30_context *nv30) for (i = 0; i < vertex->num_elements; i++) { struct nv04_resource *res; unsigned offset; - boolean user; + bool user; ve = &vertex->pipe[i]; vb = &nv30->vtxbuf[ve->vertex_buffer_index]; @@ -254,14 +257,12 @@ nv30_vertex_state_create(struct pipe_context *pipe, unsigned num_elements, struct translate_key transkey; unsigned i; - assert(num_elements); - so = MALLOC(sizeof(*so) + sizeof(*so->element) * num_elements); if (!so) return NULL; memcpy(so->pipe, elements, sizeof(*elements) * num_elements); so->num_elements = num_elements; - so->need_conversion = FALSE; + so->need_conversion = false; transkey.nr_elements = 0; transkey.output_stride = 0; @@ -284,7 +285,7 @@ nv30_vertex_state_create(struct pipe_context *pipe, unsigned num_elements, return NULL; } so->element[i].state = nv30_vtxfmt(pipe->screen, fmt)->hw; - so->need_conversion = TRUE; + so->need_conversion = true; } if (1) { @@ -452,7 +453,7 @@ 
nv30_draw_elements_inline_u32_short(struct nouveau_pushbuf *push, } static void -nv30_draw_elements(struct nv30_context *nv30, boolean shorten, +nv30_draw_elements(struct nv30_context *nv30, bool shorten, unsigned mode, unsigned start, unsigned count, unsigned instance_count, int32_t index_bias) { @@ -461,13 +462,11 @@ nv30_draw_elements(struct nv30_context *nv30, boolean shorten, struct nouveau_object *eng3d = nv30->screen->eng3d; unsigned prim = nv30_prim_gl(mode); -#if 0 /*XXX*/ - if (index_bias != nv30->state.index_bias) { - BEGIN_NV04(push, NV30_3D(VB_ELEMENT_BASE), 1); + if (eng3d->oclass >= NV40_3D_CLASS && index_bias != nv30->state.index_bias) { + BEGIN_NV04(push, NV40_3D(VB_ELEMENT_BASE), 1); PUSH_DATA (push, index_bias); nv30->state.index_bias = index_bias; } -#endif if (eng3d->oclass == NV40_3D_CLASS && index_size > 1 && nv30->idxbuf.buffer) { @@ -564,7 +563,7 @@ nv30_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) if (nv30->vbo_user && !(nv30->dirty & (NV30_NEW_VERTEX | NV30_NEW_ARRAYS))) nv30_update_user_vbufs(nv30); - nv30_state_validate(nv30, ~0, TRUE); + nv30_state_validate(nv30, ~0, true); if (nv30->draw_flags) { nv30_render_vbo(pipe, info); return; @@ -578,17 +577,17 @@ nv30_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) if (!nv30->vtxbuf[i].buffer) continue; if (nv30->vtxbuf[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT) - nv30->base.vbo_dirty = TRUE; + nv30->base.vbo_dirty = true; } if (!nv30->base.vbo_dirty && nv30->idxbuf.buffer && nv30->idxbuf.buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT) - nv30->base.vbo_dirty = TRUE; + nv30->base.vbo_dirty = true; if (nv30->base.vbo_dirty) { BEGIN_NV04(push, NV30_3D(VTX_CACHE_INVALIDATE_1710), 1); PUSH_DATA (push, 0); - nv30->base.vbo_dirty = FALSE; + nv30->base.vbo_dirty = false; } if (!info->indexed) { @@ -596,7 +595,7 @@ nv30_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) info->mode, info->start, info->count, info->instance_count); } else { - boolean shorten = info->max_index <= 65535; + bool shorten = info->max_index <= 65535; if (info->primitive_restart != nv30->state.prim_restart) { if (info->primitive_restart) { @@ -605,7 +604,7 @@ nv30_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) PUSH_DATA (push, info->restart_index); if (info->restart_index > 65535) - shorten = FALSE; + shorten = false; } else { BEGIN_NV04(push, NV40_3D(PRIM_RESTART_ENABLE), 1); PUSH_DATA (push, 0); @@ -617,7 +616,7 @@ nv30_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) PUSH_DATA (push, info->restart_index); if (info->restart_index > 65535) - shorten = FALSE; + shorten = false; } nv30_draw_elements(nv30, shorten, diff --git a/src/gallium/drivers/nouveau/nv30/nv30_vertprog.c b/src/gallium/drivers/nouveau/nv30/nv30_vertprog.c index 4d4145d10b5..ee0a6280d7a 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_vertprog.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_vertprog.c @@ -48,7 +48,7 @@ nv30_vertprog_destroy(struct nv30_vertprog *vp) vp->consts = NULL; vp->nr_consts = 0; - vp->translated = FALSE; + vp->translated = false; } void @@ -58,8 +58,8 @@ nv30_vertprog_validate(struct nv30_context *nv30) struct nouveau_object *eng3d = nv30->screen->eng3d; struct nv30_vertprog *vp = nv30->vertprog.program; struct nv30_fragprog *fp = nv30->fragprog.program; - boolean upload_code = FALSE; - boolean upload_data = FALSE; + bool upload_code = false; + bool upload_data = false; unsigned i; if (nv30->dirty & NV30_NEW_FRAGPROG) { @@ -125,7 +125,7 
@@ nv30_vertprog_validate(struct nv30_context *nv30) } } - upload_code = TRUE; + upload_code = true; } if (vp->nr_consts && !vp->data) { @@ -166,8 +166,8 @@ nv30_vertprog_validate(struct nv30_context *nv30) } } - upload_code = TRUE; - upload_data = TRUE; + upload_code = true; + upload_data = true; } if (vp->nr_consts) { diff --git a/src/gallium/drivers/nouveau/nv30/nv30_winsys.h b/src/gallium/drivers/nouveau/nv30/nv30_winsys.h index 5cee5df60ce..2324b517c44 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_winsys.h +++ b/src/gallium/drivers/nouveau/nv30/nv30_winsys.h @@ -19,34 +19,34 @@ #define NV40_3D_PRIM_RESTART_ENABLE 0x1dac #define NV40_3D_PRIM_RESTART_INDEX 0x1db0 -static INLINE void +static inline void PUSH_RELOC(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t offset, uint32_t flags, uint32_t vor, uint32_t tor) { nouveau_pushbuf_reloc(push, bo, offset, flags, vor, tor); } -static INLINE struct nouveau_bufctx * +static inline struct nouveau_bufctx * bufctx(struct nouveau_pushbuf *push) { struct nouveau_bufctx **pctx = push->user_priv; return *pctx; } -static INLINE void +static inline void PUSH_RESET(struct nouveau_pushbuf *push, int bin) { nouveau_bufctx_reset(bufctx(push), bin); } -static INLINE void +static inline void PUSH_REFN(struct nouveau_pushbuf *push, int bin, struct nouveau_bo *bo, uint32_t access) { nouveau_bufctx_refn(bufctx(push), bin, bo, access); } -static INLINE void +static inline void PUSH_MTHDl(struct nouveau_pushbuf *push, int subc, int mthd, int bin, struct nouveau_bo *bo, uint32_t offset, uint32_t access) { @@ -55,7 +55,7 @@ PUSH_MTHDl(struct nouveau_pushbuf *push, int subc, int mthd, int bin, PUSH_DATA(push, bo->offset + offset); } -static INLINE void +static inline void PUSH_MTHDo(struct nouveau_pushbuf *push, int subc, int mthd, int bin, struct nouveau_bo *bo, uint32_t access, uint32_t vor, uint32_t tor) { @@ -67,7 +67,7 @@ PUSH_MTHDo(struct nouveau_pushbuf *push, int subc, int mthd, int bin, PUSH_DATA(push, tor); } -static INLINE void +static inline void PUSH_MTHDs(struct nouveau_pushbuf *push, int subc, int mthd, int bin, struct nouveau_bo *bo, uint32_t data, uint32_t access, uint32_t vor, uint32_t tor) @@ -80,7 +80,7 @@ PUSH_MTHDs(struct nouveau_pushbuf *push, int subc, int mthd, int bin, PUSH_DATA(push, data | tor); } -static INLINE struct nouveau_bufref * +static inline struct nouveau_bufref * PUSH_MTHD(struct nouveau_pushbuf *push, int subc, int mthd, int bin, struct nouveau_bo *bo, uint32_t data, uint32_t access, uint32_t vor, uint32_t tor) @@ -99,7 +99,7 @@ PUSH_MTHD(struct nouveau_pushbuf *push, int subc, int mthd, int bin, return bref; } -static INLINE void +static inline void PUSH_RESRC(struct nouveau_pushbuf *push, int subc, int mthd, int bin, struct nv04_resource *r, uint32_t data, uint32_t access, uint32_t vor, uint32_t tor) @@ -108,14 +108,14 @@ PUSH_RESRC(struct nouveau_pushbuf *push, int subc, int mthd, int bin, r->domain | access, vor, tor)->priv = r; } -static INLINE void +static inline void BEGIN_NV04(struct nouveau_pushbuf *push, int subc, int mthd, int size) { PUSH_SPACE(push, size + 1); PUSH_DATA (push, 0x00000000 | (size << 18) | (subc << 13) | mthd); } -static INLINE void +static inline void BEGIN_NI04(struct nouveau_pushbuf *push, int subc, int mthd, int size) { PUSH_SPACE(push, size + 1); diff --git a/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c b/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c index 9ef16965f39..e68d23e5587 100644 --- a/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c +++ 
b/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c @@ -44,7 +44,7 @@ struct nvfx_fpc { struct util_dynarray label_relocs; }; -static INLINE struct nvfx_reg +static inline struct nvfx_reg temp(struct nvfx_fpc *fpc) { int idx = __builtin_ctzll(~fpc->r_temps); @@ -60,7 +60,7 @@ temp(struct nvfx_fpc *fpc) return nvfx_reg(NVFXSR_TEMP, idx); } -static INLINE void +static inline void release_temps(struct nvfx_fpc *fpc) { fpc->r_temps &= ~fpc->r_temps_discard; @@ -373,7 +373,7 @@ nv40_fp_brk(struct nvfx_fpc *fpc) hw[3] = 0; } -static INLINE struct nvfx_src +static inline struct nvfx_src tgsi_src(struct nvfx_fpc *fpc, const struct tgsi_full_src_register *fsrc) { struct nvfx_src src; @@ -415,7 +415,7 @@ tgsi_src(struct nvfx_fpc *fpc, const struct tgsi_full_src_register *fsrc) return src; } -static INLINE struct nvfx_reg +static inline struct nvfx_reg tgsi_dst(struct nvfx_fpc *fpc, const struct tgsi_full_dst_register *fdst) { switch (fdst->Register.File) { case TGSI_FILE_OUTPUT: @@ -430,7 +430,7 @@ tgsi_dst(struct nvfx_fpc *fpc, const struct tgsi_full_dst_register *fdst) { } } -static INLINE int +static inline int tgsi_mask(uint tgsi) { int mask = 0; @@ -442,7 +442,7 @@ tgsi_mask(uint tgsi) return mask; } -static boolean +static bool nvfx_fragprog_parse_instruction(struct nvfx_fpc *fpc, const struct tgsi_full_instruction *finst) { @@ -455,7 +455,7 @@ nvfx_fragprog_parse_instruction(struct nvfx_fpc *fpc, int i; if (finst->Instruction.Opcode == TGSI_OPCODE_END) - return TRUE; + return true; for (i = 0; i < finst->Instruction.NumSrcRegs; i++) { const struct tgsi_full_src_register *fsrc; @@ -525,7 +525,7 @@ nvfx_fragprog_parse_instruction(struct nvfx_fpc *fpc, break; default: NOUVEAU_ERR("bad src file\n"); - return FALSE; + return false; } } @@ -868,12 +868,12 @@ nvfx_fragprog_parse_instruction(struct nvfx_fpc *fpc, default: NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode); - return FALSE; + return false; } out: release_temps(fpc); - return TRUE; + return true; nv3x_cflow: { static int warned = 0; @@ -887,7 +887,7 @@ nv3x_cflow: goto out; } -static boolean +static bool nvfx_fragprog_parse_decl_input(struct nvfx_fpc *fpc, const struct tgsi_full_declaration *fdec) { @@ -917,17 +917,17 @@ nvfx_fragprog_parse_decl_input(struct nvfx_fpc *fpc, case TGSI_SEMANTIC_GENERIC: case TGSI_SEMANTIC_PCOORD: /* will be assigned to remaining TC slots later */ - return TRUE; + return true; default: assert(0); - return FALSE; + return false; } fpc->r_input[idx] = nvfx_reg(NVFXSR_INPUT, hw); - return TRUE; + return true; } -static boolean +static bool nvfx_fragprog_assign_generic(struct nvfx_fpc *fpc, const struct tgsi_full_declaration *fdec) { @@ -954,16 +954,16 @@ nvfx_fragprog_assign_generic(struct nvfx_fpc *fpc, } hw = NVFX_FP_OP_INPUT_SRC_TC(hw); fpc->r_input[idx] = nvfx_reg(NVFXSR_INPUT, hw); - return TRUE; + return true; } } - return FALSE; + return false; default: - return TRUE; + return true; } } -static boolean +static bool nvfx_fragprog_parse_decl_output(struct nvfx_fpc *fpc, const struct tgsi_full_declaration *fdec) { @@ -984,20 +984,20 @@ nvfx_fragprog_parse_decl_output(struct nvfx_fpc *fpc, } if(hw > ((fpc->is_nv4x) ? 
4 : 2)) { NOUVEAU_ERR("bad rcol index\n"); - return FALSE; + return false; } break; default: NOUVEAU_ERR("bad output semantic\n"); - return FALSE; + return false; } fpc->r_result[idx] = nvfx_reg(NVFXSR_OUTPUT, hw); fpc->r_temps |= (1ULL << hw); - return TRUE; + return true; } -static boolean +static bool nvfx_fragprog_prepare(struct nvfx_fpc *fpc) { struct tgsi_parse_context p; @@ -1081,17 +1081,17 @@ nvfx_fragprog_prepare(struct nvfx_fpc *fpc) fpc->r_temps_discard = 0ULL; } - return TRUE; + return true; out_err: FREE(fpc->r_temp); fpc->r_temp = NULL; tgsi_parse_free(&p); - return FALSE; + return false; } -DEBUG_GET_ONCE_BOOL_OPTION(nvfx_dump_fp, "NVFX_DUMP_FP", FALSE) +DEBUG_GET_ONCE_BOOL_OPTION(nvfx_dump_fp, "NVFX_DUMP_FP", false) void _nvfx_fragprog_translate(uint16_t oclass, struct nv30_fragprog *fp) @@ -1100,7 +1100,7 @@ _nvfx_fragprog_translate(uint16_t oclass, struct nv30_fragprog *fp) struct nvfx_fpc *fpc = NULL; struct util_dynarray insns; - fp->translated = FALSE; + fp->translated = false; fp->point_sprite_control = 0; fp->vp_or = 0; @@ -1182,7 +1182,7 @@ _nvfx_fragprog_translate(uint16_t oclass, struct nv30_fragprog *fp) debug_printf("\n"); } - fp->translated = TRUE; + fp->translated = true; out: tgsi_parse_free(&parse); diff --git a/src/gallium/drivers/nouveau/nv30/nvfx_shader.h b/src/gallium/drivers/nouveau/nv30/nvfx_shader.h index 9538a793d7e..e66d8af7620 100644 --- a/src/gallium/drivers/nouveau/nv30/nvfx_shader.h +++ b/src/gallium/drivers/nouveau/nv30/nvfx_shader.h @@ -448,8 +448,8 @@ struct nvfx_insn struct nvfx_src src[3]; }; -static INLINE struct nvfx_insn -nvfx_insn(boolean sat, unsigned op, int unit, struct nvfx_reg dst, unsigned mask, struct nvfx_src s0, struct nvfx_src s1, struct nvfx_src s2) +static inline struct nvfx_insn +nvfx_insn(bool sat, unsigned op, int unit, struct nvfx_reg dst, unsigned mask, struct nvfx_src s0, struct nvfx_src s1, struct nvfx_src s2) { struct nvfx_insn insn = { .op = op, @@ -468,7 +468,7 @@ nvfx_insn(boolean sat, unsigned op, int unit, struct nvfx_reg dst, unsigned mask return insn; } -static INLINE struct nvfx_reg +static inline struct nvfx_reg nvfx_reg(int type, int index) { struct nvfx_reg temp = { @@ -478,7 +478,7 @@ nvfx_reg(int type, int index) return temp; } -static INLINE struct nvfx_src +static inline struct nvfx_src nvfx_src(struct nvfx_reg reg) { struct nvfx_src temp = { @@ -491,7 +491,7 @@ nvfx_src(struct nvfx_reg reg) return temp; } -static INLINE struct nvfx_src +static inline struct nvfx_src nvfx_src_swz(struct nvfx_src src, int x, int y, int z, int w) { struct nvfx_src dst = src; @@ -503,14 +503,14 @@ nvfx_src_swz(struct nvfx_src src, int x, int y, int z, int w) return dst; } -static INLINE struct nvfx_src +static inline struct nvfx_src nvfx_src_neg(struct nvfx_src src) { src.negate = !src.negate; return src; } -static INLINE struct nvfx_src +static inline struct nvfx_src nvfx_src_abs(struct nvfx_src src) { src.abs = 1; @@ -529,7 +529,7 @@ struct nv30_vertprog; void _nvfx_fragprog_translate(uint16_t oclass, struct nv30_fragprog *fp); -boolean +bool _nvfx_vertprog_translate(uint16_t oclass, struct nv30_vertprog *vp); #endif diff --git a/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c b/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c index 1ce0589be71..5757eb1fb16 100644 --- a/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c +++ b/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c @@ -416,7 +416,7 @@ tgsi_src(struct nvfx_vpc *vpc, const struct tgsi_full_src_register *fsrc) { return src; } -static INLINE struct nvfx_reg +static 
inline struct nvfx_reg tgsi_dst(struct nvfx_vpc *vpc, const struct tgsi_full_dst_register *fdst) { struct nvfx_reg dst; @@ -455,7 +455,7 @@ tgsi_mask(uint tgsi) return mask; } -static boolean +static bool nvfx_vertprog_parse_instruction(struct nvfx_vpc *vpc, unsigned idx, const struct tgsi_full_instruction *finst) { @@ -466,7 +466,7 @@ nvfx_vertprog_parse_instruction(struct nvfx_vpc *vpc, struct nvfx_insn insn; struct nvfx_relocation reloc; struct nvfx_loop_entry loop; - boolean sat = FALSE; + bool sat = false; int mask; int ai = -1, ci = -1, ii = -1; int i; @@ -524,25 +524,25 @@ nvfx_vertprog_parse_instruction(struct nvfx_vpc *vpc, break; default: NOUVEAU_ERR("bad src file\n"); - return FALSE; + return false; } } for (i = 0; i < finst->Instruction.NumSrcRegs; i++) { if(src[i].reg.type < 0) - return FALSE; + return false; } if(finst->Dst[0].Register.File == TGSI_FILE_ADDRESS && finst->Instruction.Opcode != TGSI_OPCODE_ARL) - return FALSE; + return false; final_dst = dst = tgsi_dst(vpc, &finst->Dst[0]); mask = tgsi_mask(finst->Dst[0].Register.WriteMask); if(finst->Instruction.Saturate) { assert(finst->Instruction.Opcode != TGSI_OPCODE_ARL); if (vpc->is_nv4x) - sat = TRUE; + sat = true; else if(dst.type != NVFXSR_TEMP) dst = temp(vpc); @@ -793,7 +793,7 @@ nvfx_vertprog_parse_instruction(struct nvfx_vpc *vpc, break; default: NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode); - return FALSE; + return false; } if(finst->Instruction.Saturate && !vpc->is_nv4x) { @@ -804,10 +804,10 @@ nvfx_vertprog_parse_instruction(struct nvfx_vpc *vpc, } release_temps(vpc); - return TRUE; + return true; } -static boolean +static bool nvfx_vertprog_parse_decl_output(struct nvfx_vpc *vpc, const struct tgsi_full_declaration *fdec) { @@ -825,7 +825,7 @@ nvfx_vertprog_parse_decl_output(struct nvfx_vpc *vpc, vpc->r_result[idx] = temp(vpc); vpc->r_temps_discard = 0; vpc->cvtx_idx = idx; - return TRUE; + return true; case TGSI_SEMANTIC_COLOR: if (fdec->Semantic.Index == 0) { hw = NVFX_VP(INST_DEST_COL0); @@ -834,7 +834,7 @@ nvfx_vertprog_parse_decl_output(struct nvfx_vpc *vpc, hw = NVFX_VP(INST_DEST_COL1); } else { NOUVEAU_ERR("bad colour semantic index\n"); - return FALSE; + return false; } break; case TGSI_SEMANTIC_BCOLOR: @@ -845,7 +845,7 @@ nvfx_vertprog_parse_decl_output(struct nvfx_vpc *vpc, hw = NVFX_VP(INST_DEST_BFC1); } else { NOUVEAU_ERR("bad bcolour semantic index\n"); - return FALSE; + return false; } break; case TGSI_SEMANTIC_FOG: @@ -868,22 +868,22 @@ nvfx_vertprog_parse_decl_output(struct nvfx_vpc *vpc, if (i == num_texcoords) { vpc->r_result[idx] = nvfx_reg(NVFXSR_NONE, 0); - return TRUE; + return true; } break; case TGSI_SEMANTIC_EDGEFLAG: vpc->r_result[idx] = nvfx_reg(NVFXSR_NONE, 0); - return TRUE; + return true; default: NOUVEAU_ERR("bad output semantic\n"); - return FALSE; + return false; } vpc->r_result[idx] = nvfx_reg(NVFXSR_OUTPUT, hw); - return TRUE; + return true; } -static boolean +static bool nvfx_vertprog_prepare(struct nvfx_vpc *vpc) { struct tgsi_parse_context p; @@ -924,7 +924,7 @@ nvfx_vertprog_prepare(struct nvfx_vpc *vpc) break; case TGSI_FILE_OUTPUT: if (!nvfx_vertprog_parse_decl_output(vpc, fdec)) - return FALSE; + return false; break; default: break; @@ -961,12 +961,12 @@ nvfx_vertprog_prepare(struct nvfx_vpc *vpc) } vpc->r_temps_discard = 0; - return TRUE; + return true; } -DEBUG_GET_ONCE_BOOL_OPTION(nvfx_dump_vp, "NVFX_DUMP_VP", FALSE) +DEBUG_GET_ONCE_BOOL_OPTION(nvfx_dump_vp, "NVFX_DUMP_VP", false) -boolean +bool _nvfx_vertprog_translate(uint16_t oclass, struct 
nv30_vertprog *vp) { struct tgsi_parse_context parse; @@ -975,13 +975,13 @@ _nvfx_vertprog_translate(uint16_t oclass, struct nv30_vertprog *vp) struct util_dynarray insns; int i, ucps; - vp->translated = FALSE; + vp->translated = false; vp->nr_insns = 0; vp->nr_consts = 0; vpc = CALLOC_STRUCT(nvfx_vpc); if (!vpc) - return FALSE; + return false; vpc->is_nv4x = (oclass >= NV40_3D_CLASS) ? ~0 : 0; vpc->vp = vp; vpc->pipe = vp->pipe; @@ -990,7 +990,7 @@ _nvfx_vertprog_translate(uint16_t oclass, struct nv30_vertprog *vp) if (!nvfx_vertprog_prepare(vpc)) { FREE(vpc); - return FALSE; + return false; } /* Redirect post-transform vertex position to a temp if user clip @@ -1108,7 +1108,7 @@ _nvfx_vertprog_translate(uint16_t oclass, struct nv30_vertprog *vp) debug_printf("\n"); } - vp->translated = TRUE; + vp->translated = true; out: tgsi_parse_free(&parse); diff --git a/src/gallium/drivers/nouveau/nv50/nv50_blit.h b/src/gallium/drivers/nouveau/nv50/nv50_blit.h index 756c4c11bf6..0ccec568d3a 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_blit.h +++ b/src/gallium/drivers/nouveau/nv50/nv50_blit.h @@ -37,7 +37,7 @@ nv50_resource_resolve(struct pipe_context *, const struct pipe_resolve_info *); #define NV50_BLIT_TEXTURE_2D_ARRAY 5 #define NV50_BLIT_MAX_TEXTURE_TYPES 6 -static INLINE unsigned +static inline unsigned nv50_blit_texture_type(enum pipe_texture_target target) { switch (target) { @@ -52,7 +52,7 @@ nv50_blit_texture_type(enum pipe_texture_target target) } } -static INLINE unsigned +static inline unsigned nv50_blit_get_tgsi_texture_target(enum pipe_texture_target target) { switch (target) { @@ -67,7 +67,7 @@ nv50_blit_get_tgsi_texture_target(enum pipe_texture_target target) } } -static INLINE enum pipe_texture_target +static inline enum pipe_texture_target nv50_blit_reinterpret_pipe_texture_target(enum pipe_texture_target target) { switch (target) { @@ -81,7 +81,7 @@ nv50_blit_reinterpret_pipe_texture_target(enum pipe_texture_target target) } } -static INLINE unsigned +static inline unsigned nv50_blit_get_filter(const struct pipe_blit_info *info) { if (info->dst.resource->nr_samples < info->src.resource->nr_samples) @@ -102,7 +102,7 @@ nv50_blit_get_filter(const struct pipe_blit_info *info) /* Since shaders cannot export stencil, we cannot copy stencil values when * rendering to ZETA, so we attach the ZS surface to a colour render target. 
*/ -static INLINE enum pipe_format +static inline enum pipe_format nv50_blit_zeta_to_colour_format(enum pipe_format format) { switch (format) { @@ -127,7 +127,7 @@ nv50_blit_zeta_to_colour_format(enum pipe_format format) } -static INLINE uint16_t +static inline uint16_t nv50_blit_derive_color_mask(const struct pipe_blit_info *info) { const unsigned mask = info->mask; @@ -162,7 +162,7 @@ nv50_blit_derive_color_mask(const struct pipe_blit_info *info) return color_mask; } -static INLINE uint32_t +static inline uint32_t nv50_blit_eng2d_get_mask(const struct pipe_blit_info *info) { uint32_t mask = 0; @@ -191,8 +191,8 @@ nv50_blit_eng2d_get_mask(const struct pipe_blit_info *info) # define nv50_format_table nvc0_format_table #endif -/* return TRUE for formats that can be converted among each other by NVC0_2D */ -static INLINE boolean +/* return true for formats that can be converted among each other by NVC0_2D */ +static inline bool nv50_2d_dst_format_faithful(enum pipe_format format) { const uint64_t mask = @@ -201,7 +201,7 @@ nv50_2d_dst_format_faithful(enum pipe_format format) uint8_t id = nv50_format_table[format].rt; return (id >= 0xc0) && (mask & (1ULL << (id - 0xc0))); } -static INLINE boolean +static inline bool nv50_2d_src_format_faithful(enum pipe_format format) { const uint64_t mask = @@ -211,7 +211,7 @@ nv50_2d_src_format_faithful(enum pipe_format format) return (id >= 0xc0) && (mask & (1ULL << (id - 0xc0))); } -static INLINE boolean +static inline bool nv50_2d_format_supported(enum pipe_format format) { uint8_t id = nv50_format_table[format].rt; @@ -219,7 +219,7 @@ nv50_2d_format_supported(enum pipe_format format) (NV50_ENG2D_SUPPORTED_FORMATS & (1ULL << (id - 0xc0))); } -static INLINE boolean +static inline bool nv50_2d_dst_format_ops_supported(enum pipe_format format) { uint8_t id = nv50_format_table[format].rt; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.c b/src/gallium/drivers/nouveau/nv50/nv50_context.c index 5b5d3912c20..f8d46db7c67 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_context.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_context.c @@ -64,12 +64,12 @@ nv50_memory_barrier(struct pipe_context *pipe, unsigned flags) if (!nv50->vtxbuf[i].buffer) continue; if (nv50->vtxbuf[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) - nv50->base.vbo_dirty = TRUE; + nv50->base.vbo_dirty = true; } if (nv50->idxbuf.buffer && nv50->idxbuf.buffer->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) - nv50->base.vbo_dirty = TRUE; + nv50->base.vbo_dirty = true; for (s = 0; s < 3 && !nv50->cb_dirty; ++s) { uint32_t valid = nv50->constbuf_valid[s]; @@ -87,7 +87,7 @@ nv50_memory_barrier(struct pipe_context *pipe, unsigned flags) continue; if (res->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) - nv50->cb_dirty = TRUE; + nv50->cb_dirty = true; } } } @@ -100,9 +100,9 @@ nv50_default_kick_notify(struct nouveau_pushbuf *push) if (screen) { nouveau_fence_next(&screen->base); - nouveau_fence_update(&screen->base, TRUE); + nouveau_fence_update(&screen->base, true); if (screen->cur_ctx) - screen->cur_ctx->state.flushed = TRUE; + screen->cur_ctx->state.flushed = true; } } @@ -310,7 +310,7 @@ nv50_create(struct pipe_screen *pscreen, void *priv) nv50->base.invalidate_resource_storage = nv50_invalidate_resource_storage; if (screen->base.device->chipset < 0x84 || - debug_get_bool_option("NOUVEAU_PMPEG", FALSE)) { + debug_get_bool_option("NOUVEAU_PMPEG", false)) { /* PMPEG */ nouveau_context_init_vdec(&nv50->base); } else if (screen->base.device->chipset < 0x98 || @@ -351,7 +351,7 @@ out_err: } 
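
The nv50_2d_*_format_faithful() and nv50_2d_format_supported() helpers converted above all reduce to one 64-bit membership test: 2D-engine render-target format ids start at 0xc0, so (id - 0xc0) indexes a bit in a capability mask. A self-contained sketch of that check follows; the mask value is invented for illustration and is not NV50_ENG2D_SUPPORTED_FORMATS.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* invented capability mask: pretend ids 0xc0 and 0xc2 are supported */
static const uint64_t supported = (1ULL << 0) | (1ULL << 2);

static bool format_supported(uint8_t id)
{
   /* ids below 0xc0 are not 2D-engine render-target formats at all */
   return (id >= 0xc0) && (supported & (1ULL << (id - 0xc0)));
}

int main(void)
{
   assert(format_supported(0xc0));
   assert(!format_supported(0xc1));
   assert(format_supported(0xc2));
   assert(!format_supported(0x00));
   return 0;
}
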
void -nv50_bufctx_fence(struct nouveau_bufctx *bufctx, boolean on_flush) +nv50_bufctx_fence(struct nouveau_bufctx *bufctx, bool on_flush) { struct nouveau_list *list = on_flush ? &bufctx->current : &bufctx->pending; struct nouveau_list *it; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.h b/src/gallium/drivers/nouveau/nv50/nv50_context.h index 1f123ef7e92..ce12e714774 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_context.h +++ b/src/gallium/drivers/nouveau/nv50/nv50_context.h @@ -91,7 +91,7 @@ struct nv50_blitctx; -boolean nv50_blitctx_create(struct nv50_context *); +bool nv50_blitctx_create(struct nv50_context *); struct nv50_context { struct nouveau_context base; @@ -102,7 +102,7 @@ struct nv50_context { struct nouveau_bufctx *bufctx; uint32_t dirty; - boolean cb_dirty; + bool cb_dirty; struct nv50_graph_state state; @@ -152,26 +152,26 @@ struct nv50_context { unsigned sample_mask; unsigned min_samples; - boolean vbo_push_hint; + bool vbo_push_hint; uint32_t rt_array_mode; struct pipe_query *cond_query; - boolean cond_cond; /* inverted rendering condition */ + bool cond_cond; /* inverted rendering condition */ uint cond_mode; uint32_t cond_condmode; /* the calculated condition */ struct nv50_blitctx *blit; }; -static INLINE struct nv50_context * +static inline struct nv50_context * nv50_context(struct pipe_context *pipe) { return (struct nv50_context *)pipe; } /* return index used in nv50_context arrays for a specific shader type */ -static INLINE unsigned +static inline unsigned nv50_context_shader_stage(unsigned pipe) { switch (pipe) { @@ -188,7 +188,7 @@ nv50_context_shader_stage(unsigned pipe) /* nv50_context.c */ struct pipe_context *nv50_create(struct pipe_screen *, void *); -void nv50_bufctx_fence(struct nouveau_bufctx *, boolean on_flush); +void nv50_bufctx_fence(struct nouveau_bufctx *, bool on_flush); void nv50_default_kick_notify(struct nouveau_pushbuf *); @@ -202,7 +202,7 @@ void nv50_query_pushbuf_submit(struct nouveau_pushbuf *, void nv84_query_fifo_wait(struct nouveau_pushbuf *, struct pipe_query *); void nva0_so_target_save_offset(struct pipe_context *, struct pipe_stream_output_target *, - unsigned index, boolean seralize); + unsigned index, bool seralize); #define NVA0_QUERY_STREAM_OUTPUT_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0) @@ -221,8 +221,8 @@ extern void nv50_init_state_functions(struct nv50_context *); /* nv50_state_validate.c */ /* @words: check for space before emitting relocs */ -extern boolean nv50_state_validate(struct nv50_context *, uint32_t state_mask, - unsigned space_words); +extern bool nv50_state_validate(struct nv50_context *, uint32_t state_mask, + unsigned space_words); /* nv50_surface.c */ extern void nv50_clear(struct pipe_context *, unsigned buffers, diff --git a/src/gallium/drivers/nouveau/nv50/nv50_formats.c b/src/gallium/drivers/nouveau/nv50/nv50_formats.c index 0f86ba1de0d..49a93bf1d91 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_formats.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_formats.c @@ -44,7 +44,7 @@ */ #define U_V PIPE_BIND_VERTEX_BUFFER #define U_T PIPE_BIND_SAMPLER_VIEW -#define U_I PIPE_BIND_SHADER_RESOURCE | PIPE_BIND_COMPUTE_RESOURCE +#define U_I PIPE_BIND_SHADER_BUFFER | PIPE_BIND_SHADER_IMAGE | PIPE_BIND_COMPUTE_RESOURCE #define U_TR PIPE_BIND_RENDER_TARGET | U_T #define U_IR U_TR | U_I #define U_TB PIPE_BIND_BLENDABLE | U_TR diff --git a/src/gallium/drivers/nouveau/nv50/nv50_miptree.c b/src/gallium/drivers/nouveau/nv50/nv50_miptree.c index f15d8f3ecb6..92d49e49ff2 100644 --- 
a/src/gallium/drivers/nouveau/nv50/nv50_miptree.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_miptree.c @@ -30,7 +30,7 @@ uint32_t nv50_tex_choose_tile_dims_helper(unsigned nx, unsigned ny, unsigned nz, - boolean is_3d) + bool is_3d) { uint32_t tile_mode = 0x000; @@ -59,13 +59,13 @@ nv50_tex_choose_tile_dims_helper(unsigned nx, unsigned ny, unsigned nz, } static uint32_t -nv50_tex_choose_tile_dims(unsigned nx, unsigned ny, unsigned nz, boolean is_3d) +nv50_tex_choose_tile_dims(unsigned nx, unsigned ny, unsigned nz, bool is_3d) { return nv50_tex_choose_tile_dims_helper(nx, ny * 2, nz, is_3d); } static uint32_t -nv50_mt_choose_storage_type(struct nv50_miptree *mt, boolean compressed) +nv50_mt_choose_storage_type(struct nv50_miptree *mt, bool compressed) { const unsigned ms = util_logbase2(mt->base.base.nr_samples); uint32_t tile_flags; @@ -184,7 +184,7 @@ nv50_miptree_get_handle(struct pipe_screen *pscreen, unsigned stride; if (!mt || !mt->base.bo) - return FALSE; + return false; stride = mt->level[0].pitch; @@ -204,7 +204,7 @@ const struct u_resource_vtbl nv50_miptree_vtbl = u_default_transfer_inline_write /* transfer_inline_write */ }; -static INLINE boolean +static inline bool nv50_miptree_init_ms_mode(struct nv50_miptree *mt) { switch (mt->base.base.nr_samples) { @@ -228,12 +228,12 @@ nv50_miptree_init_ms_mode(struct nv50_miptree *mt) break; default: NOUVEAU_ERR("invalid nr_samples: %u\n", mt->base.base.nr_samples); - return FALSE; + return false; } - return TRUE; + return true; } -boolean +bool nv50_miptree_init_layout_linear(struct nv50_miptree *mt, unsigned pitch_align) { struct pipe_resource *pt = &mt->base.base; @@ -241,12 +241,12 @@ nv50_miptree_init_layout_linear(struct nv50_miptree *mt, unsigned pitch_align) unsigned h = pt->height0; if (util_format_is_depth_or_stencil(pt->format)) - return FALSE; + return false; if ((pt->last_level > 0) || (pt->depth0 > 1) || (pt->array_size > 1)) - return FALSE; + return false; if (mt->ms_x | mt->ms_y) - return FALSE; + return false; mt->level[0].pitch = align(pt->width0 * blocksize, pitch_align); @@ -256,7 +256,7 @@ nv50_miptree_init_layout_linear(struct nv50_miptree *mt, unsigned pitch_align) mt->total_size = mt->level[0].pitch * h; - return TRUE; + return true; } static void @@ -335,7 +335,7 @@ nv50_miptree_create(struct pipe_screen *pscreen, struct nouveau_device *dev = nouveau_screen(pscreen)->device; struct nv50_miptree *mt = CALLOC_STRUCT(nv50_miptree); struct pipe_resource *pt = &mt->base.base; - boolean compressed = dev->drm_version >= 0x01000101; + bool compressed = dev->drm_version >= 0x01000101; int ret; union nouveau_bo_config bo_config; uint32_t bo_flags; @@ -438,7 +438,7 @@ nv50_miptree_from_handle(struct pipe_screen *pscreen, /* Offset of zslice @z from start of level @l. 
*/ -INLINE unsigned +inline unsigned nv50_mt_zslice_offset(const struct nv50_miptree *mt, unsigned l, unsigned z) { const struct pipe_resource *pt = &mt->base.base; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.c b/src/gallium/drivers/nouveau/nv50/nv50_program.c index aaca4c550d9..02dc3677259 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_program.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_program.c @@ -25,7 +25,7 @@ #include "codegen/nv50_ir_driver.h" -static INLINE unsigned +static inline unsigned bitcount4(const uint32_t val) { static const uint8_t cnt[16] @@ -104,7 +104,7 @@ nv50_vertprog_assign_slots(struct nv50_ir_prog_info *info) prog->vp.bfc[info->out[i].si] = i; break; case TGSI_SEMANTIC_LAYER: - prog->gp.has_layer = TRUE; + prog->gp.has_layer = true; prog->gp.layerid = n; break; case TGSI_SEMANTIC_VIEWPORT_INDEX: @@ -316,7 +316,7 @@ nv50_program_create_strmout_state(const struct nv50_ir_prog_info *info, return so; } -boolean +bool nv50_program_translate(struct nv50_program *prog, uint16_t chipset) { struct nv50_ir_prog_info *info; @@ -325,7 +325,7 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset) info = CALLOC_STRUCT(nv50_ir_prog_info); if (!info) - return FALSE; + return false; info->type = prog->type; info->target = chipset; @@ -410,7 +410,7 @@ out: return !ret; } -boolean +bool nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog) { struct nouveau_heap *heap; @@ -423,7 +423,7 @@ nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog) case PIPE_SHADER_FRAGMENT: heap = nv50->screen->gp_code_heap; break; default: assert(!"invalid program type"); - return FALSE; + return false; } ret = nouveau_heap_alloc(heap, size, prog, &prog->mem); @@ -440,7 +440,7 @@ nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog) ret = nouveau_heap_alloc(heap, size, prog, &prog->mem); if (ret) { NOUVEAU_ERR("shader too large (0x%x) to fit in code space ?\n", size); - return FALSE; + return false; } } prog->code_base = prog->mem->start; @@ -448,10 +448,10 @@ nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog) ret = nv50_tls_realloc(nv50->screen, prog->tls_space); if (ret < 0) { nouveau_heap_free(&prog->mem); - return FALSE; + return false; } if (ret > 0) - nv50->state.new_tls_space = TRUE; + nv50->state.new_tls_space = true; if (prog->fixups) nv50_ir_relocate_code(prog->fixups, prog->code, prog->code_base, 0, 0); @@ -463,7 +463,7 @@ nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog) BEGIN_NV04(nv50->base.pushbuf, NV50_3D(CODE_CB_FLUSH), 1); PUSH_DATA (nv50->base.pushbuf, 0); - return TRUE; + return true; } void diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.h b/src/gallium/drivers/nouveau/nv50/nv50_program.h index fe6bd6025be..5d3ff5644d2 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_program.h +++ b/src/gallium/drivers/nouveau/nv50/nv50_program.h @@ -53,7 +53,7 @@ struct nv50_program { struct pipe_shader_state pipe; ubyte type; - boolean translated; + bool translated; uint32_t *code; unsigned code_size; @@ -104,8 +104,8 @@ struct nv50_program { struct nv50_stream_output_state *so; }; -boolean nv50_program_translate(struct nv50_program *, uint16_t chipset); -boolean nv50_program_upload_code(struct nv50_context *, struct nv50_program *); +bool nv50_program_translate(struct nv50_program *, uint16_t chipset); +bool nv50_program_upload_code(struct nv50_context *, struct nv50_program *); void nv50_program_destroy(struct 
nv50_context *, struct nv50_program *); #endif /* __NV50_PROG_H__ */ diff --git a/src/gallium/drivers/nouveau/nv50/nv50_push.c b/src/gallium/drivers/nouveau/nv50/nv50_push.c index a3a397c52c1..f31eaa0e314 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_push.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_push.c @@ -23,13 +23,13 @@ struct push_context { struct translate *translate; - boolean primitive_restart; + bool primitive_restart; uint32_t prim; uint32_t restart_index; uint32_t instance_id; }; -static INLINE unsigned +static inline unsigned prim_restart_search_i08(uint8_t *elts, unsigned push, uint8_t index) { unsigned i; @@ -39,7 +39,7 @@ prim_restart_search_i08(uint8_t *elts, unsigned push, uint8_t index) return i; } -static INLINE unsigned +static inline unsigned prim_restart_search_i16(uint16_t *elts, unsigned push, uint16_t index) { unsigned i; @@ -49,7 +49,7 @@ prim_restart_search_i16(uint16_t *elts, unsigned push, uint16_t index) return i; } -static INLINE unsigned +static inline unsigned prim_restart_search_i32(uint32_t *elts, unsigned push, uint32_t index) { unsigned i; @@ -179,7 +179,7 @@ emit_vertices_seq(struct push_context *ctx, unsigned start, unsigned count) #define NV50_PRIM_GL_CASE(n) \ case PIPE_PRIM_##n: return NV50_3D_VERTEX_BEGIN_GL_PRIMITIVE_##n -static INLINE unsigned +static inline unsigned nv50_prim_gl(unsigned prim) { switch (prim) { @@ -212,7 +212,7 @@ nv50_push_vbo(struct nv50_context *nv50, const struct pipe_draw_info *info) unsigned i, index_size; unsigned inst_count = info->instance_count; unsigned vert_count = info->count; - boolean apply_bias = info->indexed && info->index_bias; + bool apply_bias = info->indexed && info->index_bias; ctx.push = nv50->base.pushbuf; ctx.translate = nv50->vertex->translate; @@ -258,12 +258,12 @@ nv50_push_vbo(struct nv50_context *nv50, const struct pipe_draw_info *info) NOUVEAU_ERR("draw_stream_output not supported on pre-NVA0 cards\n"); return; } - pipe->get_query_result(pipe, targ->pq, TRUE, (void *)&vert_count); + pipe->get_query_result(pipe, targ->pq, true, (void *)&vert_count); vert_count /= targ->stride; } ctx.idxbuf = NULL; index_size = 0; - ctx.primitive_restart = FALSE; + ctx.primitive_restart = false; ctx.restart_index = 0; } diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query.c b/src/gallium/drivers/nouveau/nv50/nv50_query.c index 81f7474e36b..f4adbf8c653 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_query.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_query.c @@ -48,20 +48,21 @@ struct nv50_query { uint32_t base; uint32_t offset; /* base + i * 32 */ uint8_t state; - boolean is64bit; + bool is64bit; + int nesting; /* only used for occlusion queries */ struct nouveau_mm_allocation *mm; struct nouveau_fence *fence; }; #define NV50_QUERY_ALLOC_SPACE 256 -static INLINE struct nv50_query * +static inline struct nv50_query * nv50_query(struct pipe_query *pipe) { return (struct nv50_query *)pipe; } -static boolean +static bool nv50_query_allocate(struct nv50_context *nv50, struct nv50_query *q, int size) { struct nv50_screen *screen = nv50->screen; @@ -80,17 +81,17 @@ nv50_query_allocate(struct nv50_context *nv50, struct nv50_query *q, int size) if (size) { q->mm = nouveau_mm_allocate(screen->base.mm_GART, size, &q->bo, &q->base); if (!q->bo) - return FALSE; + return false; q->offset = q->base; ret = nouveau_bo_map(q->bo, 0, screen->base.client); if (ret) { nv50_query_allocate(nv50, q, 0); - return FALSE; + return false; } q->data = (uint32_t *)((uint8_t *)q->bo->map + q->base); } - return TRUE; + return true; 
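
The prim_restart_search_i08/i16/i32 helpers in nv50_push.c, re-marked inline above, share one job: count how many indices can be pushed before the primitive-restart marker appears. A sketch of the 16-bit variant with a tiny test; the loop body is reconstructed from the visible pattern, since the hunks show only its head and tail.

#include <assert.h>
#include <stdint.h>

/* count leading indices that precede the restart marker */
static inline unsigned
prim_restart_search_i16(const uint16_t *elts, unsigned push, uint16_t index)
{
   unsigned i;
   for (i = 0; i < push; ++i)
      if (elts[i] == index)
         break;
   return i;
}

int main(void)
{
   const uint16_t elts[] = { 0, 1, 2, 0xffff, 3, 4 };
   assert(prim_restart_search_i16(elts, 6, 0xffff) == 3); /* 3 before marker */
   assert(prim_restart_search_i16(elts, 3, 0xffff) == 3); /* marker absent */
   return 0;
}
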
} static void @@ -153,8 +154,8 @@ nv50_query_begin(struct pipe_context *pipe, struct pipe_query *pq) struct nv50_query *q = nv50_query(pq); /* For occlusion queries we have to change the storage, because a previous - * query might set the initial render conition to FALSE even *after* we re- - * initialized it to TRUE. + * query might set the initial render condition to false even *after* we re- + * initialized it to true. */ if (q->type == PIPE_QUERY_OCCLUSION_COUNTER) { q->offset += 32; @@ -166,7 +167,7 @@ nv50_query_begin(struct pipe_context *pipe, struct pipe_query *pq) * query ? */ q->data[0] = q->sequence; /* initialize sequence */ - q->data[1] = 1; /* initial render condition = TRUE */ + q->data[1] = 1; /* initial render condition = true */ q->data[4] = q->sequence + 1; /* for comparison COND_MODE */ q->data[5] = 0; } @@ -175,11 +176,16 @@ switch (q->type) { case PIPE_QUERY_OCCLUSION_COUNTER: - PUSH_SPACE(push, 4); - BEGIN_NV04(push, NV50_3D(COUNTER_RESET), 1); - PUSH_DATA (push, NV50_3D_COUNTER_RESET_SAMPLECNT); - BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1); - PUSH_DATA (push, 1); + q->nesting = nv50->screen->num_occlusion_queries_active++; + if (q->nesting) { + nv50_query_get(push, q, 0x10, 0x0100f002); + } else { + PUSH_SPACE(push, 4); + BEGIN_NV04(push, NV50_3D(COUNTER_RESET), 1); + PUSH_DATA (push, NV50_3D_COUNTER_RESET_SAMPLECNT); + BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1); + PUSH_DATA (push, 1); + } break; case PIPE_QUERY_PRIMITIVES_GENERATED: nv50_query_get(push, q, 0x10, 0x06805002); @@ -223,9 +229,11 @@ nv50_query_end(struct pipe_context *pipe, struct pipe_query *pq) switch (q->type) { case PIPE_QUERY_OCCLUSION_COUNTER: nv50_query_get(push, q, 0, 0x0100f002); - PUSH_SPACE(push, 2); - BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1); - PUSH_DATA (push, 0); + if (--nv50->screen->num_occlusion_queries_active == 0) { + PUSH_SPACE(push, 2); + BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1); + PUSH_DATA (push, 0); + } break; case PIPE_QUERY_PRIMITIVES_GENERATED: nv50_query_get(push, q, 0, 0x06805002); @@ -261,7 +269,7 @@ nv50_query_end(struct pipe_context *pipe, struct pipe_query *pq) nv50_query_get(push, q, 0, 0x0d005002 | (q->index << 5)); break; case PIPE_QUERY_TIMESTAMP_DISJOINT: - /* This query is not issued on GPU because disjoint is forced to FALSE */ + /* This query is not issued on GPU because disjoint is forced to false */ q->state = NV50_QUERY_STATE_READY; break; default: @@ -273,7 +281,7 @@ nouveau_fence_ref(nv50->screen->base.fence.current, &q->fence); } -static INLINE void +static inline void nv50_query_update(struct nv50_query *q) { if (q->is64bit) { @@ -293,7 +301,7 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq, struct nv50_query *q = nv50_query(pq); uint64_t *res64 = (uint64_t *)result; uint32_t *res32 = (uint32_t *)result; - boolean *res8 = (boolean *)result; + uint8_t *res8 = (uint8_t *)result; uint64_t *data64 = (uint64_t *)q->data; int i; @@ -307,19 +315,19 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq, q->state = NV50_QUERY_STATE_FLUSHED; PUSH_KICK(nv50->base.pushbuf); } - return FALSE; + return false; } if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nv50->screen->base.client)) - return FALSE; + return false; } q->state = NV50_QUERY_STATE_READY; switch (q->type) { case PIPE_QUERY_GPU_FINISHED: - res8[0] = TRUE; + res8[0] = true; break; case PIPE_QUERY_OCCLUSION_COUNTER: /* u32 
sequence, u32 count, u64 time */ - res64[0] = q->data[1]; + res64[0] = q->data[1] - q->data[5]; break; case PIPE_QUERY_PRIMITIVES_GENERATED: /* u64 count, u64 time */ case PIPE_QUERY_PRIMITIVES_EMITTED: /* u64 count, u64 time */ @@ -338,7 +346,7 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq, break; case PIPE_QUERY_TIMESTAMP_DISJOINT: res64[0] = 1000000000; - res8[8] = FALSE; + res8[8] = false; break; case PIPE_QUERY_TIME_ELAPSED: res64[0] = data64[1] - data64[3]; @@ -347,10 +355,10 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq, res32[0] = q->data[1]; break; default: - return FALSE; + return false; } - return TRUE; + return true; } void @@ -377,7 +385,7 @@ nv50_render_condition(struct pipe_context *pipe, struct nouveau_pushbuf *push = nv50->base.pushbuf; struct nv50_query *q; uint32_t cond; - boolean wait = + bool wait = mode != PIPE_RENDER_COND_NO_WAIT && mode != PIPE_RENDER_COND_BY_REGION_NO_WAIT; @@ -391,13 +399,12 @@ nv50_render_condition(struct pipe_context *pipe, case PIPE_QUERY_SO_OVERFLOW_PREDICATE: cond = condition ? NV50_3D_COND_MODE_EQUAL : NV50_3D_COND_MODE_NOT_EQUAL; - wait = TRUE; + wait = true; break; case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: if (likely(!condition)) { - /* XXX: Placeholder, handle nesting here if available */ - if (unlikely(false)) + if (unlikely(q->nesting)) cond = wait ? NV50_3D_COND_MODE_NOT_EQUAL : NV50_3D_COND_MODE_ALWAYS; else @@ -461,7 +468,7 @@ nv50_query_pushbuf_submit(struct nouveau_pushbuf *push, void nva0_so_target_save_offset(struct pipe_context *pipe, struct pipe_stream_output_target *ptarg, - unsigned index, boolean serialize) + unsigned index, bool serialize) { struct nv50_so_target *targ = nv50_so_target(ptarg); diff --git a/src/gallium/drivers/nouveau/nv50/nv50_resource.h b/src/gallium/drivers/nouveau/nv50/nv50_resource.h index f7ee1354a92..a46e622c597 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_resource.h +++ b/src/gallium/drivers/nouveau/nv50/nv50_resource.h @@ -35,7 +35,7 @@ nv50_screen_init_resource_functions(struct pipe_screen *pscreen); uint32_t nv50_tex_choose_tile_dims_helper(unsigned nx, unsigned ny, unsigned nz, - boolean is_3d); + bool is_3d); struct nv50_miptree_level { uint32_t offset; @@ -50,13 +50,13 @@ struct nv50_miptree { struct nv50_miptree_level level[NV50_MAX_TEXTURE_LEVELS]; uint32_t total_size; uint32_t layer_stride; - boolean layout_3d; /* TRUE if layer count varies with mip level */ + bool layout_3d; /* true if layer count varies with mip level */ uint8_t ms_x; /* log2 of number of samples in x/y dimension */ uint8_t ms_y; uint8_t ms_mode; }; -static INLINE struct nv50_miptree * +static inline struct nv50_miptree * nv50_miptree(struct pipe_resource *pt) { return (struct nv50_miptree *)pt; @@ -70,7 +70,7 @@ nv50_miptree(struct pipe_resource *pt) /* Internal functions: */ -boolean +bool nv50_miptree_init_layout_linear(struct nv50_miptree *mt, unsigned pitch_align); struct pipe_resource * @@ -98,13 +98,13 @@ struct nv50_surface { uint16_t depth; }; -static INLINE struct nv50_surface * +static inline struct nv50_surface * nv50_surface(struct pipe_surface *ps) { return (struct nv50_surface *)ps; } -static INLINE enum pipe_format +static inline enum pipe_format nv50_zs_to_s_format(enum pipe_format format) { switch (format) { diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c index 6583a353578..30e6e042fbf 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c +++ 
b/src/gallium/drivers/nouveau/nv50/nv50_screen.c @@ -51,19 +51,19 @@ nv50_screen_is_format_supported(struct pipe_screen *pscreen, unsigned bindings) { if (sample_count > 8) - return FALSE; + return false; if (!(0x117 & (1 << sample_count))) /* 0, 1, 2, 4 or 8 */ - return FALSE; + return false; if (sample_count == 8 && util_format_get_blocksizebits(format) >= 128) - return FALSE; + return false; if (!util_format_is_supported(format, bindings)) - return FALSE; + return false; switch (format) { case PIPE_FORMAT_Z16_UNORM: if (nv50_screen(pscreen)->tesla->oclass < NVA0_3D_CLASS) - return FALSE; + return false; break; default: break; @@ -176,6 +176,9 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_CLIP_HALFZ: case PIPE_CAP_POLYGON_OFFSET_CLAMP: case PIPE_CAP_QUERY_PIPELINE_STATISTICS: + case PIPE_CAP_TEXTURE_FLOAT_LINEAR: + case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: + case PIPE_CAP_DEPTH_BOUNDS_TEST: return 1; case PIPE_CAP_SEAMLESS_CUBE_MAP: return 1; /* class_3d >= NVA0_3D_CLASS; */ @@ -210,6 +213,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: /* potentially supported on some hw */ case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: + case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: return 0; case PIPE_CAP_VENDOR_ID: @@ -286,7 +290,7 @@ nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS: /* The chip could handle more sampler views than samplers */ case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS: - return MIN2(32, PIPE_MAX_SAMPLERS); + return MIN2(16, PIPE_MAX_SAMPLERS); case PIPE_SHADER_CAP_DOUBLES: case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: @@ -454,7 +458,7 @@ nv50_screen_init_hwctx(struct nv50_screen *screen) BEGIN_NV04(push, NV50_3D(UNK1400_LANES), 1); PUSH_DATA (push, 0xf); - if (debug_get_bool_option("NOUVEAU_SHADER_WATCHDOG", TRUE)) { + if (debug_get_bool_option("NOUVEAU_SHADER_WATCHDOG", true)) { BEGIN_NV04(push, NV50_3D(WATCHDOG_TIMER), 1); PUSH_DATA (push, 0x18); } @@ -734,7 +738,7 @@ nv50_screen_create(struct nouveau_device *dev) nv50_screen_init_resource_functions(pscreen); if (screen->base.device->chipset < 0x84 || - debug_get_bool_option("NOUVEAU_PMPEG", FALSE)) { + debug_get_bool_option("NOUVEAU_PMPEG", false)) { /* PMPEG */ nouveau_screen_init_vdec(&screen->base); } else if (screen->base.device->chipset < 0x98 || @@ -890,7 +894,7 @@ nv50_screen_create(struct nouveau_device *dev) nv50_screen_init_hwctx(screen); - nouveau_fence_new(&screen->base, &screen->base.fence.current, FALSE); + nouveau_fence_new(&screen->base, &screen->base.fence.current, false); return pscreen; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.h b/src/gallium/drivers/nouveau/nv50/nv50_screen.h index 881051b1862..ce51f0fc254 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_screen.h +++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.h @@ -32,14 +32,14 @@ struct nv50_graph_state { uint32_t semantic_color; uint32_t semantic_psize; int32_t index_bias; - boolean uniform_buffer_bound[3]; - boolean prim_restart; - boolean point_sprite; - boolean rt_serialize; - boolean flushed; - boolean rasterizer_discard; + bool uniform_buffer_bound[3]; + bool prim_restart; + bool point_sprite; + bool rt_serialize; + bool flushed; + bool rasterizer_discard; uint8_t tls_required; - boolean new_tls_space; + bool new_tls_space; uint8_t num_vtxbufs; uint8_t num_vtxelts; uint8_t 
num_textures[3]; @@ -54,6 +54,8 @@ struct nv50_screen { struct nv50_context *cur_ctx; struct nv50_graph_state save_state; + int num_occlusion_queries_active; + struct nouveau_bo *code; struct nouveau_bo *uniforms; struct nouveau_bo *txc; /* TIC (offset 0) and TSC (65536) */ @@ -95,19 +97,19 @@ struct nv50_screen { struct nouveau_object *m2mf; }; -static INLINE struct nv50_screen * +static inline struct nv50_screen * nv50_screen(struct pipe_screen *screen) { return (struct nv50_screen *)screen; } -boolean nv50_blitter_create(struct nv50_screen *); +bool nv50_blitter_create(struct nv50_screen *); void nv50_blitter_destroy(struct nv50_screen *); int nv50_screen_tic_alloc(struct nv50_screen *, void *); int nv50_screen_tsc_alloc(struct nv50_screen *, void *); -static INLINE void +static inline void nv50_resource_fence(struct nv04_resource *res, uint32_t flags) { struct nv50_screen *screen = nv50_screen(res->base.screen); @@ -119,7 +121,7 @@ nv50_resource_fence(struct nv04_resource *res, uint32_t flags) } } -static INLINE void +static inline void nv50_resource_validate(struct nv04_resource *res, uint32_t flags) { if (likely(res->bo)) { @@ -142,21 +144,21 @@ struct nv50_format { extern const struct nv50_format nv50_format_table[]; -static INLINE void +static inline void nv50_screen_tic_unlock(struct nv50_screen *screen, struct nv50_tic_entry *tic) { if (tic->id >= 0) screen->tic.lock[tic->id / 32] &= ~(1 << (tic->id % 32)); } -static INLINE void +static inline void nv50_screen_tsc_unlock(struct nv50_screen *screen, struct nv50_tsc_entry *tsc) { if (tsc->id >= 0) screen->tsc.lock[tsc->id / 32] &= ~(1 << (tsc->id % 32)); } -static INLINE void +static inline void nv50_screen_tic_free(struct nv50_screen *screen, struct nv50_tic_entry *tic) { if (tic->id >= 0) { @@ -165,7 +167,7 @@ nv50_screen_tic_free(struct nv50_screen *screen, struct nv50_tic_entry *tic) } } -static INLINE void +static inline void nv50_screen_tsc_free(struct nv50_screen *screen, struct nv50_tsc_entry *tsc) { if (tsc->id >= 0) { diff --git a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c index c698782d8bd..b033ce5c6dc 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c @@ -60,7 +60,7 @@ nv50_constbufs_validate(struct nv50_context *nv50) continue; } if (!nv50->state.uniform_buffer_bound[s]) { - nv50->state.uniform_buffer_bound[s] = TRUE; + nv50->state.uniform_buffer_bound[s] = true; BEGIN_NV04(push, NV50_3D(SET_PROGRAM_CB), 1); PUSH_DATA (push, (b << 12) | (i << 8) | p | 1); } @@ -99,33 +99,35 @@ nv50_constbufs_validate(struct nv50_context *nv50) PUSH_DATA (push, (b << 12) | (i << 8) | p | 1); BCTX_REFN(nv50->bufctx_3d, CB(s, i), res, RD); + + nv50->cb_dirty = 1; /* Force cache flush for UBO. 
*/ } else { BEGIN_NV04(push, NV50_3D(SET_PROGRAM_CB), 1); PUSH_DATA (push, (i << 8) | p | 0); } if (i == 0) - nv50->state.uniform_buffer_bound[s] = FALSE; + nv50->state.uniform_buffer_bound[s] = false; } } } } -static boolean +static bool nv50_program_validate(struct nv50_context *nv50, struct nv50_program *prog) { if (!prog->translated) { prog->translated = nv50_program_translate( prog, nv50->screen->base.device->chipset); if (!prog->translated) - return FALSE; + return false; } else if (prog->mem) - return TRUE; + return true; return nv50_program_upload_code(nv50, prog); } -static INLINE void +static inline void nv50_program_update_context_state(struct nv50_context *nv50, struct nv50_program *prog, int stage) { @@ -136,7 +138,7 @@ nv50_program_update_context_state(struct nv50_context *nv50, nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TLS); if (!nv50->state.tls_required || nv50->state.new_tls_space) BCTX_REFN_bo(nv50->bufctx_3d, TLS, flags, nv50->screen->tls_bo); - nv50->state.new_tls_space = FALSE; + nv50->state.new_tls_space = false; nv50->state.tls_required |= 1 << stage; } else { if (nv50->state.tls_required == (1 << stage)) @@ -243,11 +245,11 @@ nv50_sprite_coords_validate(struct nv50_context *nv50) for (i = 0; i < 8; ++i) PUSH_DATA(push, 0); - nv50->state.point_sprite = FALSE; + nv50->state.point_sprite = false; } return; } else { - nv50->state.point_sprite = TRUE; + nv50->state.point_sprite = true; } memset(pntc, 0, sizeof(pntc)); @@ -646,7 +648,7 @@ nv50_stream_output_validate(struct nv50_context *nv50) nv50_query_pushbuf_submit(push, targ->pq, 0x4); } else { PUSH_DATA(push, 0); - targ->clean = FALSE; + targ->clean = false; } } else { const unsigned limit = targ->pipe.buffer_size / diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state.c b/src/gallium/drivers/nouveau/nv50/nv50_state.c index d4d41af3c61..9505a0b4085 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_state.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c @@ -62,7 +62,7 @@ * in advance to maintain elegant separate shader objects.) 
*/ -static INLINE uint32_t +static inline uint32_t nv50_colormask(unsigned mask) { uint32_t ret = 0; @@ -82,7 +82,7 @@ nv50_colormask(unsigned mask) #define NV50_BLEND_FACTOR_CASE(a, b) \ case PIPE_BLENDFACTOR_##a: return NV50_BLEND_FACTOR_##b -static INLINE uint32_t +static inline uint32_t nv50_blend_fac(unsigned factor) { switch (factor) { @@ -116,7 +116,7 @@ nv50_blend_state_create(struct pipe_context *pipe, { struct nv50_blend_stateobj *so = CALLOC_STRUCT(nv50_blend_stateobj); int i; - boolean emit_common_func = cso->rt[0].blend_enable; + bool emit_common_func = cso->rt[0].blend_enable; uint32_t ms; if (nv50_context(pipe)->screen->tesla->oclass >= NVA3_3D_CLASS) { @@ -137,11 +137,11 @@ nv50_blend_state_create(struct pipe_context *pipe, for (i = 0; i < 8; ++i) { SB_DATA(so, cso->rt[i].blend_enable); if (cso->rt[i].blend_enable) - emit_common_func = TRUE; + emit_common_func = true; } if (nv50_context(pipe)->screen->tesla->oclass >= NVA3_3D_CLASS) { - emit_common_func = FALSE; + emit_common_func = false; for (i = 0; i < 8; ++i) { if (!cso->rt[i].blend_enable) @@ -373,6 +373,16 @@ nv50_zsa_state_create(struct pipe_context *pipe, SB_DATA (so, 0); } + SB_BEGIN_3D(so, DEPTH_BOUNDS_EN, 1); + if (cso->depth.bounds_test) { + SB_DATA (so, 1); + SB_BEGIN_3D(so, DEPTH_BOUNDS(0), 2); + SB_DATA (so, fui(cso->depth.bounds_min)); + SB_DATA (so, fui(cso->depth.bounds_max)); + } else { + SB_DATA (so, 0); + } + if (cso->stencil[0].enabled) { SB_BEGIN_3D(so, STENCIL_ENABLE, 5); SB_DATA (so, 1); @@ -439,7 +449,7 @@ nv50_zsa_state_delete(struct pipe_context *pipe, void *hwcso) #define NV50_TSC_WRAP_CASE(n) \ case PIPE_TEX_WRAP_##n: return NV50_TSC_WRAP_##n -static INLINE unsigned +static inline unsigned nv50_tsc_wrap_mode(unsigned wrap) { switch (wrap) { @@ -572,7 +582,7 @@ nv50_sampler_state_delete(struct pipe_context *pipe, void *hwcso) FREE(hwcso); } -static INLINE void +static inline void nv50_stage_sampler_states_bind(struct nv50_context *nv50, int s, unsigned nr, void **hwcso) { @@ -650,7 +660,7 @@ nv50_sampler_view_destroy(struct pipe_context *pipe, FREE(nv50_tic_entry(view)); } -static INLINE void +static inline void nv50_stage_set_sampler_views(struct nv50_context *nv50, int s, unsigned nr, struct pipe_sampler_view **views) @@ -808,7 +818,7 @@ nv50_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index, pipe_resource_reference(&nv50->constbuf[s][i].u.buf, res); - nv50->constbuf[s][i].user = (cb && cb->user_buffer) ? TRUE : FALSE; + nv50->constbuf[s][i].user = (cb && cb->user_buffer) ? 
true : false; if (nv50->constbuf[s][i].user) { nv50->constbuf[s][i].u.data = cb->user_buffer; nv50->constbuf[s][i].size = MIN2(cb->buffer_size, 0x10000); @@ -1041,7 +1051,7 @@ nv50_so_target_create(struct pipe_context *pipe, } else { targ->pq = NULL; } - targ->clean = TRUE; + targ->clean = true; targ->pipe.buffer_size = size; targ->pipe.buffer_offset = offset; @@ -1075,32 +1085,32 @@ nv50_set_stream_output_targets(struct pipe_context *pipe, { struct nv50_context *nv50 = nv50_context(pipe); unsigned i; - boolean serialize = TRUE; - const boolean can_resume = nv50->screen->base.class_3d >= NVA0_3D_CLASS; + bool serialize = true; + const bool can_resume = nv50->screen->base.class_3d >= NVA0_3D_CLASS; assert(num_targets <= 4); for (i = 0; i < num_targets; ++i) { - const boolean changed = nv50->so_target[i] != targets[i]; - const boolean append = (offsets[i] == (unsigned)-1); + const bool changed = nv50->so_target[i] != targets[i]; + const bool append = (offsets[i] == (unsigned)-1); if (!changed && append) continue; nv50->so_targets_dirty |= 1 << i; if (can_resume && changed && nv50->so_target[i]) { nva0_so_target_save_offset(pipe, nv50->so_target[i], i, serialize); - serialize = FALSE; + serialize = false; } if (targets[i] && !append) - nv50_so_target(targets[i])->clean = TRUE; + nv50_so_target(targets[i])->clean = true; pipe_so_target_reference(&nv50->so_target[i], targets[i]); } for (; i < nv50->num_so_targets; ++i) { if (can_resume && nv50->so_target[i]) { nva0_so_target_save_offset(pipe, nv50->so_target[i], i, serialize); - serialize = FALSE; + serialize = false; } pipe_so_target_reference(&nv50->so_target[i], NULL); nv50->so_targets_dirty |= 1 << i; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c index 116bf4bba7c..985603df5fa 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c @@ -2,7 +2,7 @@ #include "nv50/nv50_context.h" #include "nv50/nv50_defs.xml.h" -static INLINE void +static inline void nv50_fb_set_null_rt(struct nouveau_pushbuf *push, unsigned i) { BEGIN_NV04(push, NV50_3D(RT_ADDRESS_HIGH(i)), 4); @@ -82,7 +82,7 @@ nv50_validate_fb(struct nv50_context *nv50) ms_mode = mt->ms_mode; if (mt->base.status & NOUVEAU_BUFFER_STATUS_GPU_READING) - nv50->state.rt_serialize = TRUE; + nv50->state.rt_serialize = true; mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING; mt->base.status &= ~NOUVEAU_BUFFER_STATUS_GPU_READING; @@ -111,7 +111,7 @@ nv50_validate_fb(struct nv50_context *nv50) ms_mode = mt->ms_mode; if (mt->base.status & NOUVEAU_BUFFER_STATUS_GPU_READING) - nv50->state.rt_serialize = TRUE; + nv50->state.rt_serialize = true; mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING; mt->base.status &= ~NOUVEAU_BUFFER_STATUS_GPU_READING; @@ -275,7 +275,7 @@ nv50_validate_viewport(struct nv50_context *nv50) nv50->viewports_dirty = 0; } -static INLINE void +static inline void nv50_check_program_ucps(struct nv50_context *nv50, struct nv50_program *vp, uint8_t mask) { @@ -296,6 +296,23 @@ nv50_check_program_ucps(struct nv50_context *nv50, nv50_fp_linkage_validate(nv50); } +/* alpha test is disabled if there are no color RTs, so make sure we have at + * least one if alpha test is enabled. Note that this must run after + * nv50_validate_fb, otherwise that will override the RT count setting. 
+ */ +static void +nv50_validate_derived_2(struct nv50_context *nv50) +{ + struct nouveau_pushbuf *push = nv50->base.pushbuf; + + if (nv50->zsa && nv50->zsa->pipe.alpha.enabled && + nv50->framebuffer.nr_cbufs == 0) { + nv50_fb_set_null_rt(push, 0); + BEGIN_NV04(push, NV50_3D(RT_CONTROL), 1); + PUSH_DATA (push, (076543210 << 4) | 1); + } +} + static void nv50_validate_clip(struct nv50_context *nv50) { @@ -456,6 +473,7 @@ static struct state_validate { { nv50_gp_linkage_validate, NV50_NEW_GMTYPROG | NV50_NEW_VERTPROG }, { nv50_validate_derived_rs, NV50_NEW_FRAGPROG | NV50_NEW_RASTERIZER | NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG }, + { nv50_validate_derived_2, NV50_NEW_ZSA | NV50_NEW_FRAMEBUFFER }, { nv50_validate_clip, NV50_NEW_CLIP | NV50_NEW_RASTERIZER | NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG }, { nv50_constbufs_validate, NV50_NEW_CONSTBUF }, @@ -468,7 +486,7 @@ static struct state_validate { }; #define validate_list_len (sizeof(validate_list) / sizeof(validate_list[0])) -boolean +bool nv50_state_validate(struct nv50_context *nv50, uint32_t mask, unsigned words) { uint32_t state_mask; @@ -490,19 +508,19 @@ nv50_state_validate(struct nv50_context *nv50, uint32_t mask, unsigned words) nv50->dirty &= ~state_mask; if (nv50->state.rt_serialize) { - nv50->state.rt_serialize = FALSE; + nv50->state.rt_serialize = false; BEGIN_NV04(nv50->base.pushbuf, SUBC_3D(NV50_GRAPH_SERIALIZE), 1); PUSH_DATA (nv50->base.pushbuf, 0); } - nv50_bufctx_fence(nv50->bufctx_3d, FALSE); + nv50_bufctx_fence(nv50->bufctx_3d, false); } nouveau_pushbuf_bufctx(nv50->base.pushbuf, nv50->bufctx_3d); ret = nouveau_pushbuf_validate(nv50->base.pushbuf); if (unlikely(nv50->state.flushed)) { - nv50->state.flushed = FALSE; - nv50_bufctx_fence(nv50->bufctx_3d, TRUE); + nv50->state.flushed = false; + nv50_bufctx_fence(nv50->bufctx_3d, true); } return !ret; } diff --git a/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h b/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h index eea5327b6cb..cf75d1eb11b 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h +++ b/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h @@ -31,7 +31,7 @@ struct nv50_rasterizer_stateobj { struct nv50_zsa_stateobj { struct pipe_depth_stencil_alpha_state pipe; int size; - uint32_t state[29]; + uint32_t state[34]; }; struct nv50_constbuf { @@ -41,7 +41,7 @@ struct nv50_constbuf { } u; uint32_t size; /* max 65536 */ uint32_t offset; - boolean user; /* should only be TRUE if u.data is valid and non-NULL */ + bool user; /* should only be true if u.data is valid and non-NULL */ }; struct nv50_vertex_element { @@ -56,7 +56,7 @@ struct nv50_vertex_stateobj { unsigned num_elements; uint32_t instance_elts; uint32_t instance_bufs; - boolean need_conversion; + bool need_conversion; unsigned vertex_size; unsigned packet_vertex_limit; struct nv50_vertex_element element[0]; @@ -66,10 +66,10 @@ struct nv50_so_target { struct pipe_stream_output_target pipe; struct pipe_query *pq; unsigned stride; - boolean clean; + bool clean; }; -static INLINE struct nv50_so_target * +static inline struct nv50_so_target * nv50_so_target(struct pipe_stream_output_target *ptarg) { return (struct nv50_so_target *)ptarg; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_stateobj_tex.h b/src/gallium/drivers/nouveau/nv50/nv50_stateobj_tex.h index 99548cbdb42..e0793bb6ec4 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_stateobj_tex.h +++ b/src/gallium/drivers/nouveau/nv50/nv50_stateobj_tex.h @@ -9,7 +9,7 @@ struct nv50_tsc_entry { uint32_t tsc[8]; }; -static INLINE struct nv50_tsc_entry * +static 
inline struct nv50_tsc_entry * nv50_tsc_entry(void *hwcso) { return (struct nv50_tsc_entry *)hwcso; @@ -21,7 +21,7 @@ struct nv50_tic_entry { uint32_t tic[8]; }; -static INLINE struct nv50_tic_entry * +static inline struct nv50_tic_entry * nv50_tic_entry(struct pipe_sampler_view *view) { return (struct nv50_tic_entry *)view; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_surface.c b/src/gallium/drivers/nouveau/nv50/nv50_surface.c index dc9852d4e47..b1ae01692cb 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_surface.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_surface.c @@ -49,8 +49,8 @@ #define NOUVEAU_DRIVER 0x50 #include "nv50/nv50_blit.h" -static INLINE uint8_t -nv50_2d_format(enum pipe_format format, boolean dst, boolean dst_src_equal) +static inline uint8_t +nv50_2d_format(enum pipe_format format, bool dst, bool dst_src_equal) { uint8_t id = nv50_format_table[format].rt; @@ -76,7 +76,7 @@ nv50_2d_format(enum pipe_format format, boolean dst, boolean dst_src_equal) static int nv50_2d_texture_set(struct nouveau_pushbuf *push, int dst, struct nv50_miptree *mt, unsigned level, unsigned layer, - enum pipe_format pformat, boolean dst_src_pformat_equal) + enum pipe_format pformat, bool dst_src_pformat_equal) { struct nouveau_bo *bo = mt->base.bo; uint32_t width, height, depth; @@ -153,7 +153,7 @@ nv50_2d_texture_do_copy(struct nouveau_pushbuf *push, const enum pipe_format dfmt = dst->base.base.format; const enum pipe_format sfmt = src->base.base.format; int ret; - boolean eqfmt = dfmt == sfmt; + bool eqfmt = dfmt == sfmt; if (!PUSH_SPACE(push, 2 * 16 + 32)) return PIPE_ERROR; @@ -196,7 +196,7 @@ nv50_resource_copy_region(struct pipe_context *pipe, { struct nv50_context *nv50 = nv50_context(pipe); int ret; - boolean m2mf; + bool m2mf; unsigned dst_layer = dstz, src_layer = src_box->z; if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) { @@ -658,7 +658,7 @@ nv50_blitter_make_vp(struct nv50_blitter *blit) }; blit->vp.type = PIPE_SHADER_VERTEX; - blit->vp.translated = TRUE; + blit->vp.translated = true; blit->vp.code = (uint32_t *)code; /* const_cast */ blit->vp.code_size = sizeof(code); blit->vp.max_gpr = 4; @@ -687,24 +687,24 @@ nv50_blitter_make_fp(struct pipe_context *pipe, const unsigned target = nv50_blit_get_tgsi_texture_target(ptarg); - boolean tex_rgbaz = FALSE; - boolean tex_s = FALSE; - boolean cvt_un8 = FALSE; + bool tex_rgbaz = false; + bool tex_s = false; + bool cvt_un8 = false; if (mode != NV50_BLIT_MODE_PASS && mode != NV50_BLIT_MODE_Z24X8 && mode != NV50_BLIT_MODE_X8Z24) - tex_s = TRUE; + tex_s = true; if (mode != NV50_BLIT_MODE_X24S8 && mode != NV50_BLIT_MODE_S8X24 && mode != NV50_BLIT_MODE_XS) - tex_rgbaz = TRUE; + tex_rgbaz = true; if (mode != NV50_BLIT_MODE_PASS && mode != NV50_BLIT_MODE_ZS && mode != NV50_BLIT_MODE_XS) - cvt_un8 = TRUE; + cvt_un8 = true; ureg = ureg_create(TGSI_PROCESSOR_FRAGMENT); if (!ureg) @@ -1271,7 +1271,7 @@ nv50_blit_eng2d(struct nv50_context *nv50, const struct pipe_blit_info *info) int i; uint32_t mode; uint32_t mask = nv50_blit_eng2d_get_mask(info); - boolean b; + bool b; mode = nv50_blit_get_filter(info) ? 
NV50_2D_BLIT_CONTROL_FILTER_BILINEAR : @@ -1410,7 +1410,7 @@ nv50_blit_eng2d(struct nv50_context *nv50, const struct pipe_blit_info *info) PUSH_DATA (push, srcy >> 32); } } - nv50_bufctx_fence(nv50->bufctx, FALSE); + nv50_bufctx_fence(nv50->bufctx, false); nouveau_bufctx_reset(nv50->bufctx, NV50_BIND_2D); @@ -1432,71 +1432,82 @@ static void nv50_blit(struct pipe_context *pipe, const struct pipe_blit_info *info) { struct nv50_context *nv50 = nv50_context(pipe); - boolean eng3d = FALSE; + struct nouveau_pushbuf *push = nv50->base.pushbuf; + bool eng3d = FALSE; if (util_format_is_depth_or_stencil(info->dst.resource->format)) { if (!(info->mask & PIPE_MASK_ZS)) return; if (info->dst.resource->format == PIPE_FORMAT_Z32_FLOAT || info->dst.resource->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) - eng3d = TRUE; + eng3d = true; if (info->filter != PIPE_TEX_FILTER_NEAREST) - eng3d = TRUE; + eng3d = true; } else { if (!(info->mask & PIPE_MASK_RGBA)) return; if (info->mask != PIPE_MASK_RGBA) - eng3d = TRUE; + eng3d = true; } if (nv50_miptree(info->src.resource)->layout_3d) { - eng3d = TRUE; + eng3d = true; } else if (info->src.box.depth != info->dst.box.depth) { - eng3d = TRUE; + eng3d = true; debug_printf("blit: cannot filter array or cube textures in z direction"); } if (!eng3d && info->dst.format != info->src.format) { if (!nv50_2d_dst_format_faithful(info->dst.format) || !nv50_2d_src_format_faithful(info->src.format)) { - eng3d = TRUE; + eng3d = true; } else if (!nv50_2d_src_format_faithful(info->src.format)) { if (!util_format_is_luminance(info->src.format)) { if (util_format_is_intensity(info->src.format)) - eng3d = TRUE; + eng3d = true; else if (!nv50_2d_dst_format_ops_supported(info->dst.format)) - eng3d = TRUE; + eng3d = true; else eng3d = !nv50_2d_format_supported(info->src.format); } } else if (util_format_is_luminance_alpha(info->src.format)) - eng3d = TRUE; + eng3d = true; } if (info->src.resource->nr_samples == 8 && info->dst.resource->nr_samples <= 1) - eng3d = TRUE; + eng3d = true; /* FIXME: can't make this work with eng2d anymore */ if ((info->src.resource->nr_samples | 1) != (info->dst.resource->nr_samples | 1)) - eng3d = TRUE; + eng3d = true; /* FIXME: find correct src coordinate adjustments */ if ((info->src.box.width != info->dst.box.width && info->src.box.width != -info->dst.box.width) || (info->src.box.height != info->dst.box.height && info->src.box.height != -info->dst.box.height)) - eng3d = TRUE; + eng3d = true; + + if (nv50->screen->num_occlusion_queries_active) { + BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1); + PUSH_DATA (push, 0); + } if (!eng3d) nv50_blit_eng2d(nv50, info); else nv50_blit_3d(nv50, info); + + if (nv50->screen->num_occlusion_queries_active) { + BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1); + PUSH_DATA (push, 1); + } } static void @@ -1505,13 +1516,13 @@ nv50_flush_resource(struct pipe_context *ctx, { } -boolean +bool nv50_blitter_create(struct nv50_screen *screen) { screen->blitter = CALLOC_STRUCT(nv50_blitter); if (!screen->blitter) { NOUVEAU_ERR("failed to allocate blitter struct\n"); - return FALSE; + return false; } pipe_mutex_init(screen->blitter->mutex); @@ -1519,7 +1530,7 @@ nv50_blitter_create(struct nv50_screen *screen) nv50_blitter_make_vp(screen->blitter); nv50_blitter_make_sampler(screen->blitter); - return TRUE; + return true; } void @@ -1542,20 +1553,20 @@ nv50_blitter_destroy(struct nv50_screen *screen) FREE(blitter); } -boolean +bool nv50_blitctx_create(struct nv50_context *nv50) { nv50->blit = CALLOC_STRUCT(nv50_blitctx); if (!nv50->blit) 
{ NOUVEAU_ERR("failed to allocate blit context\n"); - return FALSE; + return false; } nv50->blit->nv50 = nv50; nv50->blit->rast.pipe.half_pixel_center = 1; - return TRUE; + return true; } void diff --git a/src/gallium/drivers/nouveau/nv50/nv50_tex.c b/src/gallium/drivers/nouveau/nv50/nv50_tex.c index d69c8d6ff0d..fc6374d1b1b 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_tex.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_tex.c @@ -31,8 +31,8 @@ (NV50_TIC_0_MAPA__MASK | NV50_TIC_0_MAPB__MASK | \ NV50_TIC_0_MAPG__MASK | NV50_TIC_0_MAPR__MASK) -static INLINE uint32_t -nv50_tic_swizzle(uint32_t tc, unsigned swz, boolean tex_int) +static inline uint32_t +nv50_tic_swizzle(uint32_t tc, unsigned swz, bool tex_int) { switch (swz) { case PIPE_SWIZZLE_RED: @@ -71,6 +71,7 @@ nv50_create_texture_view(struct pipe_context *pipe, uint32_t flags, enum pipe_texture_target target) { + const uint32_t class_3d = nouveau_context(pipe)->screen->class_3d; const struct util_format_description *desc; uint64_t addr; uint32_t *tic; @@ -78,7 +79,7 @@ nv50_create_texture_view(struct pipe_context *pipe, uint32_t depth; struct nv50_tic_entry *view; struct nv50_miptree *mt = nv50_miptree(texture); - boolean tex_int; + bool tex_int; view = MALLOC_STRUCT(nv50_tic_entry); if (!view) @@ -192,7 +193,7 @@ nv50_create_texture_view(struct pipe_context *pipe, break; default: NOUVEAU_ERR("invalid texture target: %d\n", mt->base.base.target); - return FALSE; + return false; } tic[3] = (flags & NV50_TEXVIEW_FILTER_MSAA8) ? 0x20000000 : 0x00300000; @@ -201,11 +202,17 @@ nv50_create_texture_view(struct pipe_context *pipe, tic[5] = (mt->base.base.height0 << mt->ms_y) & 0xffff; tic[5] |= depth << 16; - tic[5] |= mt->base.base.last_level << NV50_TIC_5_LAST_LEVEL__SHIFT; + if (class_3d > NV50_3D_CLASS) + tic[5] |= mt->base.base.last_level << NV50_TIC_5_LAST_LEVEL__SHIFT; + else + tic[5] |= view->pipe.u.tex.last_level << NV50_TIC_5_LAST_LEVEL__SHIFT; tic[6] = (mt->ms_x > 1) ? 
0x88000000 : 0x03000000; /* sampling points */ - tic[7] = (view->pipe.u.tex.last_level << 4) | view->pipe.u.tex.first_level; + if (class_3d > NV50_3D_CLASS) + tic[7] = (view->pipe.u.tex.last_level << 4) | view->pipe.u.tex.first_level; + else + tic[7] = 0; if (unlikely(!(tic[2] & NV50_TIC_2_NORMALIZED_COORDS))) if (mt->base.base.last_level) @@ -214,13 +221,13 @@ nv50_create_texture_view(struct pipe_context *pipe, return &view->pipe; } -static boolean +static bool nv50_validate_tic(struct nv50_context *nv50, int s) { struct nouveau_pushbuf *push = nv50->base.pushbuf; struct nouveau_bo *txc = nv50->screen->txc; unsigned i; - boolean need_flush = FALSE; + bool need_flush = false; assert(nv50->num_textures[s] <= PIPE_MAX_SAMPLERS); for (i = 0; i < nv50->num_textures[s]; ++i) { @@ -263,7 +270,7 @@ nv50_validate_tic(struct nv50_context *nv50, int s) BEGIN_NI04(push, NV50_2D(SIFC_DATA), 8); PUSH_DATAp(push, &tic->tic[0], 8); - need_flush = TRUE; + need_flush = true; } else if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) { BEGIN_NV04(push, NV50_3D(TEX_CACHE_CTL), 1); @@ -309,7 +316,7 @@ nv50_validate_tic(struct nv50_context *nv50, int s) void nv50_validate_textures(struct nv50_context *nv50) { - boolean need_flush; + bool need_flush; need_flush = nv50_validate_tic(nv50, 0); need_flush |= nv50_validate_tic(nv50, 1); @@ -321,12 +328,12 @@ void nv50_validate_textures(struct nv50_context *nv50) } } -static boolean +static bool nv50_validate_tsc(struct nv50_context *nv50, int s) { struct nouveau_pushbuf *push = nv50->base.pushbuf; unsigned i; - boolean need_flush = FALSE; + bool need_flush = false; assert(nv50->num_samplers[s] <= PIPE_MAX_SAMPLERS); for (i = 0; i < nv50->num_samplers[s]; ++i) { @@ -343,7 +350,7 @@ nv50_validate_tsc(struct nv50_context *nv50, int s) nv50_sifc_linear_u8(&nv50->base, nv50->screen->txc, 65536 + tsc->id * 32, NOUVEAU_BO_VRAM, 32, tsc->tsc); - need_flush = TRUE; + need_flush = true; } nv50->screen->tsc.lock[tsc->id / 32] |= 1 << (tsc->id % 32); @@ -361,7 +368,7 @@ nv50_validate_tsc(struct nv50_context *nv50, int s) void nv50_validate_samplers(struct nv50_context *nv50) { - boolean need_flush; + bool need_flush; need_flush = nv50_validate_tsc(nv50, 0); need_flush |= nv50_validate_tsc(nv50, 1); diff --git a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c index 1fd33b8aa59..6324726acec 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c @@ -58,7 +58,7 @@ nv50_vertex_state_create(struct pipe_context *pipe, so->num_elements = num_elements; so->instance_elts = 0; so->instance_bufs = 0; - so->need_conversion = FALSE; + so->need_conversion = false; memset(so->vb_access_size, 0, sizeof(so->vb_access_size)); @@ -89,7 +89,7 @@ nv50_vertex_state_create(struct pipe_context *pipe, return NULL; } so->element[i].state = nv50_format_table[fmt].vtx; - so->need_conversion = TRUE; + so->need_conversion = true; } so->element[i].state |= i; @@ -188,7 +188,7 @@ nv50_emit_vtxattr(struct nv50_context *nv50, struct pipe_vertex_buffer *vb, } } -static INLINE void +static inline void nv50_user_vbuf_range(struct nv50_context *nv50, unsigned vbi, uint32_t *base, uint32_t *size) { @@ -229,7 +229,7 @@ nv50_upload_user_buffers(struct nv50_context *nv50, BCTX_REFN_bo(nv50->bufctx_3d, VERTEX_TMP, NOUVEAU_BO_GART | NOUVEAU_BO_RD, bo); } - nv50->base.vbo_dirty = TRUE; + nv50->base.vbo_dirty = true; } static void @@ -275,10 +275,10 @@ nv50_update_user_vbufs(struct nv50_context *nv50) PUSH_DATAh(push, address[b] + 
ve->src_offset); PUSH_DATA (push, address[b] + ve->src_offset); } - nv50->base.vbo_dirty = TRUE; + nv50->base.vbo_dirty = true; } -static INLINE void +static inline void nv50_release_user_vbufs(struct nv50_context *nv50) { if (nv50->vbo_user) { @@ -316,7 +316,7 @@ nv50_vertex_arrays_validate(struct nv50_context *nv50) struct nv04_resource *buf = nv04_resource(nv50->vtxbuf[i].buffer); if (buf && buf->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) { buf->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING; - nv50->base.vbo_dirty = TRUE; + nv50->base.vbo_dirty = true; break; } } @@ -382,6 +382,11 @@ nv50_vertex_arrays_validate(struct nv50_context *nv50) if (nv50->vbo_user & (1 << b)) { address = addrs[b] + ve->pipe.src_offset; limit = addrs[b] + limits[b]; + } else + if (!vb->buffer) { + BEGIN_NV04(push, NV50_3D(VERTEX_ARRAY_FETCH(i)), 1); + PUSH_DATA (push, 0); + continue; } else { struct nv04_resource *buf = nv04_resource(vb->buffer); if (!(refd & (1 << b))) { @@ -418,7 +423,7 @@ nv50_vertex_arrays_validate(struct nv50_context *nv50) #define NV50_PRIM_GL_CASE(n) \ case PIPE_PRIM_##n: return NV50_3D_VERTEX_BEGIN_GL_PRIMITIVE_##n -static INLINE unsigned +static inline unsigned nv50_prim_gl(unsigned prim) { switch (prim) { @@ -585,7 +590,7 @@ nv50_draw_elements_inline_u32_short(struct nouveau_pushbuf *push, } static void -nv50_draw_elements(struct nv50_context *nv50, boolean shorten, +nv50_draw_elements(struct nv50_context *nv50, bool shorten, unsigned mode, unsigned start, unsigned count, unsigned instance_count, int32_t index_bias) { @@ -746,9 +751,9 @@ nv50_draw_vbo_kick_notify(struct nouveau_pushbuf *chan) { struct nv50_screen *screen = chan->user_priv; - nouveau_fence_update(&screen->base, TRUE); + nouveau_fence_update(&screen->base, true); - nv50_bufctx_fence(screen->cur_ctx->bufctx_3d, TRUE); + nv50_bufctx_fence(screen->cur_ctx->bufctx_3d, true); } void @@ -801,7 +806,7 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) continue; if (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT) - nv50->cb_dirty = TRUE; + nv50->cb_dirty = true; } } @@ -809,7 +814,7 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) if (nv50->cb_dirty) { BEGIN_NV04(push, NV50_3D(CODE_CB_FLUSH), 1); PUSH_DATA (push, 0); - nv50->cb_dirty = FALSE; + nv50->cb_dirty = false; } if (nv50->vbo_fifo) { @@ -830,21 +835,21 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) if (!nv50->vtxbuf[i].buffer) continue; if (nv50->vtxbuf[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT) - nv50->base.vbo_dirty = TRUE; + nv50->base.vbo_dirty = true; } if (!nv50->base.vbo_dirty && nv50->idxbuf.buffer && nv50->idxbuf.buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT) - nv50->base.vbo_dirty = TRUE; + nv50->base.vbo_dirty = true; if (nv50->base.vbo_dirty) { BEGIN_NV04(push, NV50_3D(VERTEX_ARRAY_FLUSH), 1); PUSH_DATA (push, 0); - nv50->base.vbo_dirty = FALSE; + nv50->base.vbo_dirty = false; } if (info->indexed) { - boolean shorten = info->max_index <= 65535; + bool shorten = info->max_index <= 65535; if (info->primitive_restart != nv50->state.prim_restart) { if (info->primitive_restart) { @@ -853,7 +858,7 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) PUSH_DATA (push, info->restart_index); if (info->restart_index > 65535) - shorten = FALSE; + shorten = false; } else { BEGIN_NV04(push, NV50_3D(PRIM_RESTART_ENABLE), 1); PUSH_DATA (push, 0); @@ -865,7 +870,7 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) PUSH_DATA 
(push, info->restart_index); if (info->restart_index > 65535) - shorten = FALSE; + shorten = false; } nv50_draw_elements(nv50, shorten, diff --git a/src/gallium/drivers/nouveau/nv50/nv50_winsys.h b/src/gallium/drivers/nouveau/nv50/nv50_winsys.h index e8578c8be6f..76f1b41ea70 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_winsys.h +++ b/src/gallium/drivers/nouveau/nv50/nv50_winsys.h @@ -16,14 +16,14 @@ #endif -static INLINE void +static inline void nv50_add_bufctx_resident_bo(struct nouveau_bufctx *bufctx, int bin, unsigned flags, struct nouveau_bo *bo) { nouveau_bufctx_refn(bufctx, bin, bo, flags)->priv = NULL; } -static INLINE void +static inline void nv50_add_bufctx_resident(struct nouveau_bufctx *bufctx, int bin, struct nv04_resource *res, unsigned flags) { @@ -39,7 +39,7 @@ nv50_add_bufctx_resident(struct nouveau_bufctx *bufctx, int bin, #define BCTX_REFN(bctx, bin, res, acc) \ nv50_add_bufctx_resident(bctx, NV50_BIND_##bin, res, NOUVEAU_BO_##acc) -static INLINE void +static inline void PUSH_REFN(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t flags) { struct nouveau_pushbuf_refn ref = { bo, flags }; @@ -61,39 +61,39 @@ PUSH_REFN(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t flags) #define NV50_COMPUTE(n) SUBC_COMPUTE(NV50_COMPUTE_##n) -static INLINE uint32_t +static inline uint32_t NV50_FIFO_PKHDR(int subc, int mthd, unsigned size) { return 0x00000000 | (size << 18) | (subc << 13) | mthd; } -static INLINE uint32_t +static inline uint32_t NV50_FIFO_PKHDR_NI(int subc, int mthd, unsigned size) { return 0x40000000 | (size << 18) | (subc << 13) | mthd; } -static INLINE uint32_t +static inline uint32_t NV50_FIFO_PKHDR_L(int subc, int mthd) { return 0x00030000 | (subc << 13) | mthd; } -static INLINE uint32_t +static inline uint32_t nouveau_bo_memtype(const struct nouveau_bo *bo) { return bo->config.nv50.memtype; } -static INLINE void +static inline void PUSH_DATAh(struct nouveau_pushbuf *push, uint64_t data) { *push->cur++ = (uint32_t)(data >> 32); } -static INLINE void +static inline void BEGIN_NV04(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size) { #ifndef NV50_PUSH_EXPLICIT_SPACE_CHECKING @@ -102,7 +102,7 @@ BEGIN_NV04(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size) PUSH_DATA (push, NV50_FIFO_PKHDR(subc, mthd, size)); } -static INLINE void +static inline void BEGIN_NI04(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size) { #ifndef NV50_PUSH_EXPLICIT_SPACE_CHECKING @@ -112,7 +112,7 @@ BEGIN_NI04(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size) } /* long, non-incremental, nv50-only */ -static INLINE void +static inline void BEGIN_NL50(struct nouveau_pushbuf *push, int subc, int mthd, uint32_t size) { #ifndef NV50_PUSH_EXPLICIT_SPACE_CHECKING diff --git a/src/gallium/drivers/nouveau/nv50/nv84_video.h b/src/gallium/drivers/nouveau/nv50/nv84_video.h index 2edba389dbf..09773c12974 100644 --- a/src/gallium/drivers/nouveau/nv50/nv84_video.h +++ b/src/gallium/drivers/nouveau/nv50/nv84_video.h @@ -102,12 +102,12 @@ struct nv84_decoder { uint8_t mpeg12_non_intra_matrix[64]; }; -static INLINE uint32_t mb(uint32_t coord) +static inline uint32_t mb(uint32_t coord) { return (coord + 0xf)>>4; } -static INLINE uint32_t mb_half(uint32_t coord) +static inline uint32_t mb_half(uint32_t coord) { return (coord + 0x1f)>>5; } diff --git a/src/gallium/drivers/nouveau/nv50/nv84_video_vp.c b/src/gallium/drivers/nouveau/nv50/nv84_video_vp.c index f3480b2e00e..8b121477a37 100644 --- 
a/src/gallium/drivers/nouveau/nv50/nv84_video_vp.c +++ b/src/gallium/drivers/nouveau/nv50/nv84_video_vp.c @@ -221,7 +221,7 @@ nv84_decoder_vp_h264(struct nv84_decoder *dec, PUSH_KICK (push); } -static INLINE int16_t inverse_quantize(int16_t val, uint8_t quant, int mpeg1) { +static inline int16_t inverse_quantize(int16_t val, uint8_t quant, int mpeg1) { int16_t ret = val * quant / 16; if (mpeg1 && ret) { if (ret > 0) diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c index 56fc83d3679..47bd123621b 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c @@ -121,51 +121,51 @@ nvc0_screen_compute_setup(struct nvc0_screen *screen, return 0; } -boolean +bool nvc0_compute_validate_program(struct nvc0_context *nvc0) { struct nvc0_program *prog = nvc0->compprog; if (prog->mem) - return TRUE; + return true; if (!prog->translated) { prog->translated = nvc0_program_translate( prog, nvc0->screen->base.device->chipset); if (!prog->translated) - return FALSE; + return false; } if (unlikely(!prog->code_size)) - return FALSE; + return false; if (likely(prog->code_size)) { if (nvc0_program_upload_code(nvc0, prog)) { struct nouveau_pushbuf *push = nvc0->base.pushbuf; BEGIN_NVC0(push, NVC0_COMPUTE(FLUSH), 1); PUSH_DATA (push, NVC0_COMPUTE_FLUSH_CODE); - return TRUE; + return true; } } - return FALSE; + return false; } -static boolean +static bool nvc0_compute_state_validate(struct nvc0_context *nvc0) { if (!nvc0_compute_validate_program(nvc0)) - return FALSE; + return false; /* TODO: textures, samplers, surfaces, global memory buffers */ - nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, FALSE); + nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, false); nouveau_pushbuf_bufctx(nvc0->base.pushbuf, nvc0->bufctx_cp); if (unlikely(nouveau_pushbuf_validate(nvc0->base.pushbuf))) - return FALSE; + return false; if (unlikely(nvc0->state.flushed)) - nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, TRUE); + nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, true); - return TRUE; + return true; } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.h b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.h index 9a1a71760d7..168a6d1bee2 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.h @@ -4,7 +4,7 @@ #include "nv50/nv50_defs.xml.h" #include "nvc0/nvc0_compute.xml.h" -boolean +bool nvc0_compute_validate_program(struct nvc0_context *nvc0); #endif /* NVC0_COMPUTE_H */ diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c index a35c3f66142..84f8db6a8ac 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c @@ -63,12 +63,12 @@ nvc0_memory_barrier(struct pipe_context *pipe, unsigned flags) if (!nvc0->vtxbuf[i].buffer) continue; if (nvc0->vtxbuf[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) - nvc0->base.vbo_dirty = TRUE; + nvc0->base.vbo_dirty = true; } if (nvc0->idxbuf.buffer && nvc0->idxbuf.buffer->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) - nvc0->base.vbo_dirty = TRUE; + nvc0->base.vbo_dirty = true; for (s = 0; s < 5 && !nvc0->cb_dirty; ++s) { uint32_t valid = nvc0->constbuf_valid[s]; @@ -86,7 +86,7 @@ nvc0_memory_barrier(struct pipe_context *pipe, unsigned flags) continue; if (res->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) - nvc0->cb_dirty = TRUE; + nvc0->cb_dirty = true; } } } @@ -164,9 +164,9 @@ nvc0_default_kick_notify(struct nouveau_pushbuf *push) 
if (screen) { nouveau_fence_next(&screen->base); - nouveau_fence_update(&screen->base, TRUE); + nouveau_fence_update(&screen->base, true); if (screen->cur_ctx) - screen->cur_ctx->state.flushed = TRUE; + screen->cur_ctx->state.flushed = true; NOUVEAU_DRV_STAT(&screen->base, pushbuf_count, 1); } } @@ -378,7 +378,7 @@ out_err: void nvc0_bufctx_fence(struct nvc0_context *nvc0, struct nouveau_bufctx *bufctx, - boolean on_flush) + bool on_flush) { struct nouveau_list *list = on_flush ? &bufctx->current : &bufctx->pending; struct nouveau_list *it; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h index a8d7593b398..f4499423a10 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h @@ -54,6 +54,7 @@ #define NVC0_NEW_IDXBUF (1 << 22) #define NVC0_NEW_SURFACES (1 << 23) #define NVC0_NEW_MIN_SAMPLES (1 << 24) +#define NVC0_NEW_TESSFACTOR (1 << 25) #define NVC0_NEW_CP_PROGRAM (1 << 0) #define NVC0_NEW_CP_SURFACES (1 << 1) @@ -93,7 +94,7 @@ struct nvc0_blitctx; -boolean nvc0_blitctx_create(struct nvc0_context *); +bool nvc0_blitctx_create(struct nvc0_context *); void nvc0_blitctx_destroy(struct nvc0_context *); struct nvc0_context { @@ -130,7 +131,7 @@ struct nvc0_context { struct nvc0_constbuf constbuf[6][NVC0_MAX_PIPE_CONSTBUFS]; uint16_t constbuf_dirty[6]; uint16_t constbuf_valid[6]; - boolean cb_dirty; + bool cb_dirty; struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS]; unsigned num_vtxbufs; @@ -164,14 +165,17 @@ struct nvc0_context { unsigned sample_mask; unsigned min_samples; - boolean vbo_push_hint; + float default_tess_outer[4]; + float default_tess_inner[2]; + + bool vbo_push_hint; uint8_t tfbbuf_dirty; struct pipe_stream_output_target *tfbbuf[4]; unsigned num_tfbbufs; struct pipe_query *cond_query; - boolean cond_cond; /* inverted rendering condition */ + bool cond_cond; /* inverted rendering condition */ uint cond_mode; uint32_t cond_condmode; /* the calculated condition */ @@ -184,19 +188,19 @@ struct nvc0_context { struct util_dynarray global_residents; }; -static INLINE struct nvc0_context * +static inline struct nvc0_context * nvc0_context(struct pipe_context *pipe) { return (struct nvc0_context *)pipe; } -static INLINE unsigned +static inline unsigned nvc0_shader_stage(unsigned pipe) { switch (pipe) { case PIPE_SHADER_VERTEX: return 0; -/* case PIPE_SHADER_TESSELLATION_CONTROL: return 1; */ -/* case PIPE_SHADER_TESSELLATION_EVALUATION: return 2; */ + case PIPE_SHADER_TESS_CTRL: return 1; + case PIPE_SHADER_TESS_EVAL: return 2; case PIPE_SHADER_GEOMETRY: return 3; case PIPE_SHADER_FRAGMENT: return 4; case PIPE_SHADER_COMPUTE: return 5; @@ -210,15 +214,15 @@ nvc0_shader_stage(unsigned pipe) /* nvc0_context.c */ struct pipe_context *nvc0_create(struct pipe_screen *, void *); void nvc0_bufctx_fence(struct nvc0_context *, struct nouveau_bufctx *, - boolean on_flush); + bool on_flush); void nvc0_default_kick_notify(struct nouveau_pushbuf *); /* nvc0_draw.c */ extern struct draw_stage *nvc0_draw_render_stage(struct nvc0_context *); /* nvc0_program.c */ -boolean nvc0_program_translate(struct nvc0_program *, uint16_t chipset); -boolean nvc0_program_upload_code(struct nvc0_context *, struct nvc0_program *); +bool nvc0_program_translate(struct nvc0_program *, uint16_t chipset); +bool nvc0_program_upload_code(struct nvc0_context *, struct nvc0_program *); void nvc0_program_destroy(struct nvc0_context *, struct nvc0_program *); void nvc0_program_library_upload(struct nvc0_context 
*); uint32_t nvc0_program_symbol_offset(const struct nvc0_program *, @@ -231,7 +235,7 @@ void nvc0_query_pushbuf_submit(struct nouveau_pushbuf *, void nvc0_query_fifo_wait(struct nouveau_pushbuf *, struct pipe_query *); void nvc0_so_target_save_offset(struct pipe_context *, struct pipe_stream_output_target *, unsigned i, - boolean *serialize); + bool *serialize); #define NVC0_QUERY_TFB_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0) @@ -250,8 +254,8 @@ extern void nvc0_init_state_functions(struct nvc0_context *); /* nvc0_state_validate.c */ void nvc0_validate_global_residents(struct nvc0_context *, struct nouveau_bufctx *, int bin); -extern boolean nvc0_state_validate(struct nvc0_context *, uint32_t state_mask, - unsigned space_words); +extern bool nvc0_state_validate(struct nvc0_context *, uint32_t state_mask, + unsigned space_words); /* nvc0_surface.c */ extern void nvc0_clear(struct pipe_context *, unsigned buffers, @@ -260,7 +264,7 @@ extern void nvc0_clear(struct pipe_context *, unsigned buffers, extern void nvc0_init_surface_functions(struct nvc0_context *); /* nvc0_tex.c */ -boolean nve4_validate_tsc(struct nvc0_context *nvc0, int s); +bool nve4_validate_tsc(struct nvc0_context *nvc0, int s); void nvc0_validate_textures(struct nvc0_context *); void nvc0_validate_samplers(struct nvc0_context *); void nve4_set_tex_handles(struct nvc0_context *); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c b/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c index 3875bbf4ca4..15991c3d2bd 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c @@ -29,13 +29,13 @@ #include "nvc0/nvc0_resource.h" static uint32_t -nvc0_tex_choose_tile_dims(unsigned nx, unsigned ny, unsigned nz, boolean is_3d) +nvc0_tex_choose_tile_dims(unsigned nx, unsigned ny, unsigned nz, bool is_3d) { return nv50_tex_choose_tile_dims_helper(nx, ny, nz, is_3d); } static uint32_t -nvc0_mt_choose_storage_type(struct nv50_miptree *mt, boolean compressed) +nvc0_mt_choose_storage_type(struct nv50_miptree *mt, bool compressed) { const unsigned ms = util_logbase2(mt->base.base.nr_samples); @@ -133,7 +133,7 @@ nvc0_mt_choose_storage_type(struct nv50_miptree *mt, boolean compressed) return tile_flags; } -static INLINE boolean +static inline bool nvc0_miptree_init_ms_mode(struct nv50_miptree *mt) { switch (mt->base.base.nr_samples) { @@ -157,9 +157,9 @@ nvc0_miptree_init_ms_mode(struct nv50_miptree *mt) break; default: NOUVEAU_ERR("invalid nr_samples: %u\n", mt->base.base.nr_samples); - return FALSE; + return false; } - return TRUE; + return true; } static void @@ -250,7 +250,7 @@ nvc0_miptree_create(struct pipe_screen *pscreen, struct nouveau_device *dev = nouveau_screen(pscreen)->device; struct nv50_miptree *mt = CALLOC_STRUCT(nv50_miptree); struct pipe_resource *pt = &mt->base.base; - boolean compressed = dev->drm_version >= 0x01000101; + bool compressed = dev->drm_version >= 0x01000101; int ret; union nouveau_bo_config bo_config; uint32_t bo_flags; @@ -325,7 +325,7 @@ nvc0_miptree_create(struct pipe_screen *pscreen, } /* Offset of zslice @z from start of level @l. 
*/ -INLINE unsigned +inline unsigned nvc0_mt_zslice_offset(const struct nv50_miptree *mt, unsigned l, unsigned z) { const struct pipe_resource *pt = &mt->base.base; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c index e1f5a8c4416..507a2507fe3 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c @@ -31,24 +31,25 @@ * 124 scalar varying values. */ static uint32_t -nvc0_shader_input_address(unsigned sn, unsigned si, unsigned ubase) +nvc0_shader_input_address(unsigned sn, unsigned si) { switch (sn) { - case NV50_SEMANTIC_TESSFACTOR: return 0x000 + si * 0x4; + case TGSI_SEMANTIC_TESSOUTER: return 0x000 + si * 0x4; + case TGSI_SEMANTIC_TESSINNER: return 0x010 + si * 0x4; + case TGSI_SEMANTIC_PATCH: return 0x020 + si * 0x10; case TGSI_SEMANTIC_PRIMID: return 0x060; case TGSI_SEMANTIC_LAYER: return 0x064; case TGSI_SEMANTIC_VIEWPORT_INDEX:return 0x068; case TGSI_SEMANTIC_PSIZE: return 0x06c; case TGSI_SEMANTIC_POSITION: return 0x070; - case TGSI_SEMANTIC_GENERIC: return ubase + si * 0x10; + case TGSI_SEMANTIC_GENERIC: return 0x080 + si * 0x10; case TGSI_SEMANTIC_FOG: return 0x2e8; case TGSI_SEMANTIC_COLOR: return 0x280 + si * 0x10; case TGSI_SEMANTIC_BCOLOR: return 0x2a0 + si * 0x10; - case NV50_SEMANTIC_CLIPDISTANCE: return 0x2c0 + si * 0x4; case TGSI_SEMANTIC_CLIPDIST: return 0x2c0 + si * 0x10; case TGSI_SEMANTIC_CLIPVERTEX: return 0x270; case TGSI_SEMANTIC_PCOORD: return 0x2e0; - case NV50_SEMANTIC_TESSCOORD: return 0x2f0; + case TGSI_SEMANTIC_TESSCOORD: return 0x2f0; case TGSI_SEMANTIC_INSTANCEID: return 0x2f8; case TGSI_SEMANTIC_VERTEXID: return 0x2fc; case TGSI_SEMANTIC_TEXCOORD: return 0x300 + si * 0x10; @@ -60,20 +61,21 @@ nvc0_shader_input_address(unsigned sn, unsigned si, unsigned ubase) } static uint32_t -nvc0_shader_output_address(unsigned sn, unsigned si, unsigned ubase) +nvc0_shader_output_address(unsigned sn, unsigned si) { switch (sn) { - case NV50_SEMANTIC_TESSFACTOR: return 0x000 + si * 0x4; + case TGSI_SEMANTIC_TESSOUTER: return 0x000 + si * 0x4; + case TGSI_SEMANTIC_TESSINNER: return 0x010 + si * 0x4; + case TGSI_SEMANTIC_PATCH: return 0x020 + si * 0x10; case TGSI_SEMANTIC_PRIMID: return 0x060; case TGSI_SEMANTIC_LAYER: return 0x064; case TGSI_SEMANTIC_VIEWPORT_INDEX:return 0x068; case TGSI_SEMANTIC_PSIZE: return 0x06c; case TGSI_SEMANTIC_POSITION: return 0x070; - case TGSI_SEMANTIC_GENERIC: return ubase + si * 0x10; + case TGSI_SEMANTIC_GENERIC: return 0x080 + si * 0x10; case TGSI_SEMANTIC_FOG: return 0x2e8; case TGSI_SEMANTIC_COLOR: return 0x280 + si * 0x10; case TGSI_SEMANTIC_BCOLOR: return 0x2a0 + si * 0x10; - case NV50_SEMANTIC_CLIPDISTANCE: return 0x2c0 + si * 0x4; case TGSI_SEMANTIC_CLIPDIST: return 0x2c0 + si * 0x10; case TGSI_SEMANTIC_CLIPVERTEX: return 0x270; case TGSI_SEMANTIC_TEXCOORD: return 0x300 + si * 0x10; @@ -95,7 +97,7 @@ nvc0_vp_assign_input_slots(struct nv50_ir_prog_info *info) case TGSI_SEMANTIC_VERTEXID: info->in[i].mask = 0x1; info->in[i].slot[0] = - nvc0_shader_input_address(info->in[i].sn, 0, 0) / 4; + nvc0_shader_input_address(info->in[i].sn, 0) / 4; continue; default: break; @@ -111,18 +113,11 @@ nvc0_vp_assign_input_slots(struct nv50_ir_prog_info *info) static int nvc0_sp_assign_input_slots(struct nv50_ir_prog_info *info) { - unsigned ubase = MAX2(0x80, 0x20 + info->numPatchConstants * 0x10); unsigned offset; unsigned i, c; for (i = 0; i < info->numInputs; ++i) { - offset = nvc0_shader_input_address(info->in[i].sn, - 
info->in[i].si, ubase); - if (info->in[i].patch && offset >= 0x20) - offset = 0x20 + info->in[i].si * 0x10; - - if (info->in[i].sn == NV50_SEMANTIC_TESSCOORD) - info->in[i].mask &= 3; + offset = nvc0_shader_input_address(info->in[i].sn, info->in[i].si); for (c = 0; c < 4; ++c) info->in[i].slot[c] = (offset + c * 0x4) / 4; @@ -157,15 +152,11 @@ nvc0_fp_assign_output_slots(struct nv50_ir_prog_info *info) static int nvc0_sp_assign_output_slots(struct nv50_ir_prog_info *info) { - unsigned ubase = MAX2(0x80, 0x20 + info->numPatchConstants * 0x10); unsigned offset; unsigned i, c; for (i = 0; i < info->numOutputs; ++i) { - offset = nvc0_shader_output_address(info->out[i].sn, - info->out[i].si, ubase); - if (info->out[i].patch && offset >= 0x20) - offset = 0x20 + info->out[i].si * 0x10; + offset = nvc0_shader_output_address(info->out[i].sn, info->out[i].si); for (c = 0; c < 4; ++c) info->out[i].slot[c] = (offset + c * 0x4) / 4; @@ -193,7 +184,7 @@ nvc0_program_assign_varying_slots(struct nv50_ir_prog_info *info) return ret; } -static INLINE void +static inline void nvc0_vtgp_hdr_update_oread(struct nvc0_program *vp, uint8_t slot) { uint8_t min = (vp->hdr[4] >> 12) & 0xff; @@ -216,12 +207,8 @@ nvc0_vtgp_gen_header(struct nvc0_program *vp, struct nv50_ir_prog_info *info) continue; for (c = 0; c < 4; ++c) { a = info->in[i].slot[c]; - if (info->in[i].mask & (1 << c)) { - if (info->in[i].sn != NV50_SEMANTIC_TESSCOORD) - vp->hdr[5 + a / 32] |= 1 << (a % 32); - else - nvc0_vtgp_hdr_update_oread(vp, info->in[i].slot[c]); - } + if (info->in[i].mask & (1 << c)) + vp->hdr[5 + a / 32] |= 1 << (a % 32); } } @@ -250,6 +237,14 @@ nvc0_vtgp_gen_header(struct nvc0_program *vp, struct nv50_ir_prog_info *info) case TGSI_SEMANTIC_VERTEXID: vp->hdr[10] |= 1 << 31; break; + case TGSI_SEMANTIC_TESSCOORD: + /* We don't have the mask, nor the slots populated. While this could + * be achieved, the vast majority of the time if either of the coords + * are read, then both will be read. 
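+ * Both 32-bit halves of the TESSCOORD slot (0x2f0 and 0x2f4) are
+ * therefore marked as read below.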
+ */ + nvc0_vtgp_hdr_update_oread(vp, 0x2f0 / 4); + nvc0_vtgp_hdr_update_oread(vp, 0x2f4 / 4); + break; default: break; } @@ -277,7 +272,6 @@ nvc0_vp_gen_header(struct nvc0_program *vp, struct nv50_ir_prog_info *info) return nvc0_vtgp_gen_header(vp, info); } -#if defined(PIPE_SHADER_HULL) || defined(PIPE_SHADER_DOMAIN) static void nvc0_tp_get_tess_mode(struct nvc0_program *tp, struct nv50_ir_prog_info *info) { @@ -305,14 +299,13 @@ nvc0_tp_get_tess_mode(struct nvc0_program *tp, struct nv50_ir_prog_info *info) tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CONNECTED; switch (info->prop.tp.partitioning) { - case PIPE_TESS_PART_INTEGER: - case PIPE_TESS_PART_POW2: + case PIPE_TESS_SPACING_EQUAL: tp->tp.tess_mode |= NVC0_3D_TESS_MODE_SPACING_EQUAL; break; - case PIPE_TESS_PART_FRACT_ODD: + case PIPE_TESS_SPACING_FRACTIONAL_ODD: tp->tp.tess_mode |= NVC0_3D_TESS_MODE_SPACING_FRACTIONAL_ODD; break; - case PIPE_TESS_PART_FRACT_EVEN: + case PIPE_TESS_SPACING_FRACTIONAL_EVEN: tp->tp.tess_mode |= NVC0_3D_TESS_MODE_SPACING_FRACTIONAL_EVEN; break; default: @@ -320,9 +313,7 @@ nvc0_tp_get_tess_mode(struct nvc0_program *tp, struct nv50_ir_prog_info *info) break; } } -#endif -#ifdef PIPE_SHADER_HULL static int nvc0_tcp_gen_header(struct nvc0_program *tcp, struct nv50_ir_prog_info *info) { @@ -346,9 +337,7 @@ nvc0_tcp_gen_header(struct nvc0_program *tcp, struct nv50_ir_prog_info *info) return 0; } -#endif -#ifdef PIPE_SHADER_DOMAIN static int nvc0_tep_gen_header(struct nvc0_program *tep, struct nv50_ir_prog_info *info) { @@ -365,7 +354,6 @@ nvc0_tep_gen_header(struct nvc0_program *tep, struct nv50_ir_prog_info *info) return 0; } -#endif static int nvc0_gp_gen_header(struct nvc0_program *gp, struct nv50_ir_prog_info *info) @@ -523,7 +511,7 @@ nvc0_program_dump(struct nvc0_program *prog) } #endif -boolean +bool nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset) { struct nv50_ir_prog_info *info; @@ -531,7 +519,7 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset) info = CALLOC_STRUCT(nv50_ir_prog_info); if (!info) - return FALSE; + return false; info->type = prog->type; info->target = chipset; @@ -598,16 +586,12 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset) case PIPE_SHADER_VERTEX: ret = nvc0_vp_gen_header(prog, info); break; -#ifdef PIPE_SHADER_HULL - case PIPE_SHADER_HULL: + case PIPE_SHADER_TESS_CTRL: ret = nvc0_tcp_gen_header(prog, info); break; -#endif -#ifdef PIPE_SHADER_DOMAIN - case PIPE_SHADER_DOMAIN: + case PIPE_SHADER_TESS_EVAL: ret = nvc0_tep_gen_header(prog, info); break; -#endif case PIPE_SHADER_GEOMETRY: ret = nvc0_gp_gen_header(prog, info); break; @@ -630,7 +614,7 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset) assert(info->bin.tlsSpace < (1 << 24)); prog->hdr[0] |= 1 << 26; prog->hdr[1] |= align(info->bin.tlsSpace, 0x10); /* l[] size */ - prog->need_tls = TRUE; + prog->need_tls = true; } /* TODO: factor 2 only needed where joinat/precont is used, * and we only have to count non-uniform branches @@ -638,7 +622,7 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset) /* if ((info->maxCFDepth * 2) > 16) { prog->hdr[2] |= (((info->maxCFDepth * 2) + 47) / 48) * 0x200; - prog->need_tls = TRUE; + prog->need_tls = true; } */ if (info->io.globalAccess) @@ -655,11 +639,11 @@ out: return !ret; } -boolean +bool nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog) { struct nvc0_screen *screen = nvc0->screen; - const boolean is_cp = prog->type == PIPE_SHADER_COMPUTE; + const bool is_cp = 
prog->type == PIPE_SHADER_COMPUTE; int ret; uint32_t size = prog->code_size + (is_cp ? 0 : NVC0_SHADER_HEADER_SIZE); uint32_t lib_pos = screen->lib_code->start; @@ -694,7 +678,7 @@ nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog) ret = nouveau_heap_alloc(heap, size, prog, &prog->mem); if (ret) { NOUVEAU_ERR("shader too large (0x%x) to fit in code space ?\n", size); - return FALSE; + return false; } IMMED_NVC0(nvc0->base.pushbuf, NVC0_3D(SERIALIZE), 0); } @@ -729,7 +713,7 @@ nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog) nv50_ir_relocate_code(prog->relocs, prog->code, code_pos, lib_pos, 0); #ifdef DEBUG - if (debug_get_bool_option("NV50_PROG_DEBUG", FALSE)) + if (debug_get_bool_option("NV50_PROG_DEBUG", false)) nvc0_program_dump(prog); #endif @@ -746,7 +730,7 @@ nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog) BEGIN_NVC0(nvc0->base.pushbuf, NVC0_3D(MEM_BARRIER), 1); PUSH_DATA (nvc0->base.pushbuf, 0x1011); - return TRUE; + return true; } /* Upload code for builtin functions like integer division emulation. */ diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.h b/src/gallium/drivers/nouveau/nvc0/nvc0_program.h index 3fd9d21b4c4..390e0c7a4f0 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.h @@ -21,8 +21,8 @@ struct nvc0_program { struct pipe_shader_state pipe; ubyte type; - boolean translated; - boolean need_tls; + bool translated; + bool need_tls; uint8_t num_gprs; uint32_t *code; @@ -41,7 +41,7 @@ struct nvc0_program { uint8_t clip_enable; /* mask of defined clip planes */ uint8_t num_ucps; /* also set to max if ClipDistance is used */ uint8_t edgeflag; /* attribute index of edgeflag input */ - boolean need_vertex_id; + bool need_vertex_id; } vp; struct { uint8_t early_z; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c index aea6cbda02d..f7b85a8e931 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c @@ -44,7 +44,7 @@ struct nvc0_query { uint32_t base; uint32_t offset; /* base + i * rotate */ uint8_t state; - boolean is64bit; + bool is64bit; uint8_t rotate; int nesting; /* only used for occlusion queries */ union { @@ -62,13 +62,13 @@ static void nvc0_mp_pm_query_end(struct nvc0_context *, struct nvc0_query *); static boolean nvc0_mp_pm_query_result(struct nvc0_context *, struct nvc0_query *, void *, boolean); -static INLINE struct nvc0_query * +static inline struct nvc0_query * nvc0_query(struct pipe_query *pipe) { return (struct nvc0_query *)pipe; } -static boolean +static bool nvc0_query_allocate(struct nvc0_context *nvc0, struct nvc0_query *q, int size) { struct nvc0_screen *screen = nvc0->screen; @@ -87,17 +87,17 @@ nvc0_query_allocate(struct nvc0_context *nvc0, struct nvc0_query *q, int size) if (size) { q->u.mm = nouveau_mm_allocate(screen->base.mm_GART, size, &q->bo, &q->base); if (!q->bo) - return FALSE; + return false; q->offset = q->base; ret = nouveau_bo_map(q->bo, 0, screen->base.client); if (ret) { nvc0_query_allocate(nvc0, q, 0); - return FALSE; + return false; } q->data = (uint32_t *)((uint8_t *)q->bo->map + q->base); } - return TRUE; + return true; } static void @@ -126,17 +126,17 @@ nvc0_query_create(struct pipe_context *pipe, unsigned type, unsigned index) space = NVC0_QUERY_ALLOC_SPACE; break; case PIPE_QUERY_PIPELINE_STATISTICS: - q->is64bit = TRUE; + q->is64bit = true; space = 512; break; 
case PIPE_QUERY_SO_STATISTICS: case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - q->is64bit = TRUE; + q->is64bit = true; space = 64; break; case PIPE_QUERY_PRIMITIVES_GENERATED: case PIPE_QUERY_PRIMITIVES_EMITTED: - q->is64bit = TRUE; + q->is64bit = true; q->index = index; space = 32; break; @@ -257,11 +257,11 @@ nvc0_query_begin(struct pipe_context *pipe, struct pipe_query *pq) struct nvc0_context *nvc0 = nvc0_context(pipe); struct nouveau_pushbuf *push = nvc0->base.pushbuf; struct nvc0_query *q = nvc0_query(pq); - boolean ret = true; + bool ret = true; /* For occlusion queries we have to change the storage, because a previous - * query might set the initial render conition to FALSE even *after* we re- - * initialized it to TRUE. + * query might set the initial render conition to false even *after* we re- + * initialized it to true. */ if (q->rotate) { nvc0_query_rotate(nvc0, q); @@ -270,7 +270,7 @@ nvc0_query_begin(struct pipe_context *pipe, struct pipe_query *pq) * query ? */ q->data[0] = q->sequence; /* initialize sequence */ - q->data[1] = 1; /* initial render condition = TRUE */ + q->data[1] = 1; /* initial render condition = true */ q->data[4] = q->sequence + 1; /* for comparison COND_MODE */ q->data[5] = 0; } @@ -401,7 +401,7 @@ nvc0_query_end(struct pipe_context *pipe, struct pipe_query *pq) nvc0_query_get(push, q, 0x00, 0x0d005002 | (q->index << 5)); break; case PIPE_QUERY_TIMESTAMP_DISJOINT: - /* This query is not issued on GPU because disjoint is forced to FALSE */ + /* This query is not issued on GPU because disjoint is forced to false */ q->state = NVC0_QUERY_STATE_READY; break; default: @@ -422,7 +422,7 @@ nvc0_query_end(struct pipe_context *pipe, struct pipe_query *pq) nouveau_fence_ref(nvc0->screen->base.fence.current, &q->fence); } -static INLINE void +static inline void nvc0_query_update(struct nouveau_client *cli, struct nvc0_query *q) { if (q->is64bit) { @@ -442,7 +442,7 @@ nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq, struct nvc0_query *q = nvc0_query(pq); uint64_t *res64 = (uint64_t*)result; uint32_t *res32 = (uint32_t*)result; - boolean *res8 = (boolean*)result; + uint8_t *res8 = (uint8_t*)result; uint64_t *data64 = (uint64_t *)q->data; unsigned i; @@ -450,7 +450,7 @@ nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq, if (q->type >= NVC0_QUERY_DRV_STAT(0) && q->type <= NVC0_QUERY_DRV_STAT_LAST) { res64[0] = q->u.value; - return TRUE; + return true; } else #endif if ((q->type >= NVE4_PM_QUERY(0) && q->type <= NVE4_PM_QUERY_LAST) || @@ -468,17 +468,17 @@ nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq, /* flush for silly apps that spin on GL_QUERY_RESULT_AVAILABLE */ PUSH_KICK(nvc0->base.pushbuf); } - return FALSE; + return false; } if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nvc0->screen->base.client)) - return FALSE; + return false; NOUVEAU_DRV_STAT(&nvc0->screen->base, query_sync_count, 1); } q->state = NVC0_QUERY_STATE_READY; switch (q->type) { case PIPE_QUERY_GPU_FINISHED: - res8[0] = TRUE; + res8[0] = true; break; case PIPE_QUERY_OCCLUSION_COUNTER: /* u32 sequence, u32 count, u64 time */ res64[0] = q->data[1] - q->data[5]; @@ -502,7 +502,7 @@ nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq, break; case PIPE_QUERY_TIMESTAMP_DISJOINT: res64[0] = 1000000000; - res8[8] = FALSE; + res8[8] = false; break; case PIPE_QUERY_TIME_ELAPSED: res64[0] = data64[1] - data64[3]; @@ -516,10 +516,10 @@ nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq, break; default: assert(0); /* can't happen, we 
don't create queries with invalid type */ - return FALSE; + return false; } - return TRUE; + return true; } void @@ -549,7 +549,7 @@ nvc0_render_condition(struct pipe_context *pipe, struct nouveau_pushbuf *push = nvc0->base.pushbuf; struct nvc0_query *q; uint32_t cond; - boolean wait = + bool wait = mode != PIPE_RENDER_COND_NO_WAIT && mode != PIPE_RENDER_COND_BY_REGION_NO_WAIT; @@ -563,7 +563,7 @@ nvc0_render_condition(struct pipe_context *pipe, case PIPE_QUERY_SO_OVERFLOW_PREDICATE: cond = condition ? NVC0_3D_COND_MODE_EQUAL : NVC0_3D_COND_MODE_NOT_EQUAL; - wait = TRUE; + wait = true; break; case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: @@ -626,12 +626,12 @@ nvc0_query_pushbuf_submit(struct nouveau_pushbuf *push, void nvc0_so_target_save_offset(struct pipe_context *pipe, struct pipe_stream_output_target *ptarg, - unsigned index, boolean *serialize) + unsigned index, bool *serialize) { struct nvc0_so_target *targ = nvc0_so_target(ptarg); if (*serialize) { - *serialize = FALSE; + *serialize = false; PUSH_SPACE(nvc0_context(pipe)->base.pushbuf, 1); IMMED_NVC0(nvc0_context(pipe)->base.pushbuf, NVC0_3D(SERIALIZE), 0); @@ -1080,7 +1080,7 @@ nvc0_mp_pm_query_begin(struct nvc0_context *nvc0, struct nvc0_query *q) { struct nvc0_screen *screen = nvc0->screen; struct nouveau_pushbuf *push = nvc0->base.pushbuf; - const boolean is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS; + const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS; const struct nvc0_mp_pm_query_cfg *cfg; unsigned i, c; unsigned num_ab[2] = { 0, 0 }; @@ -1101,7 +1101,7 @@ nvc0_mp_pm_query_begin(struct nvc0_context *nvc0, struct nvc0_query *q) PUSH_SPACE(push, 4 * 8 * (is_nve4 ? 1 : 6) + 6); if (!screen->pm.mp_counters_enabled) { - screen->pm.mp_counters_enabled = TRUE; + screen->pm.mp_counters_enabled = true; BEGIN_NVC0(push, SUBC_SW(0x06ac), 1); PUSH_DATA (push, 0x1fcb); } @@ -1168,7 +1168,7 @@ nvc0_mp_pm_query_end(struct nvc0_context *nvc0, struct nvc0_query *q) struct nvc0_screen *screen = nvc0->screen; struct pipe_context *pipe = &nvc0->base.pipe; struct nouveau_pushbuf *push = nvc0->base.pushbuf; - const boolean is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS; + const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS; uint32_t mask; uint32_t input[3]; const uint block[3] = { 32, is_nve4 ? 
4 : 1, 1 }; @@ -1181,7 +1181,7 @@ nvc0_mp_pm_query_end(struct nvc0_context *nvc0, struct nvc0_query *q) if (unlikely(!screen->pm.prog)) { struct nvc0_program *prog = CALLOC_STRUCT(nvc0_program); prog->type = PIPE_SHADER_COMPUTE; - prog->translated = TRUE; + prog->translated = true; prog->num_gprs = 14; prog->parm_size = 12; if (is_nve4) { @@ -1249,9 +1249,9 @@ nvc0_mp_pm_query_end(struct nvc0_context *nvc0, struct nvc0_query *q) } } -static INLINE boolean +static inline bool nvc0_mp_pm_query_read_data(uint32_t count[32][4], - struct nvc0_context *nvc0, boolean wait, + struct nvc0_context *nvc0, bool wait, struct nvc0_query *q, const struct nvc0_mp_pm_query_cfg *cfg, unsigned mp_count) @@ -1264,19 +1264,19 @@ nvc0_mp_pm_query_read_data(uint32_t count[32][4], for (c = 0; c < cfg->num_counters; ++c) { if (q->data[b + 8] != q->sequence) { if (!wait) - return FALSE; + return false; if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nvc0->base.client)) - return FALSE; + return false; } count[p][c] = q->data[b + q->ctr[c]]; } } - return TRUE; + return true; } -static INLINE boolean +static inline bool nve4_mp_pm_query_read_data(uint32_t count[32][4], - struct nvc0_context *nvc0, boolean wait, + struct nvc0_context *nvc0, bool wait, struct nvc0_query *q, const struct nvc0_mp_pm_query_cfg *cfg, unsigned mp_count) @@ -1291,9 +1291,9 @@ nve4_mp_pm_query_read_data(uint32_t count[32][4], for (d = 0; d < ((q->ctr[c] & ~3) ? 1 : 4); ++d) { if (q->data[b + 20 + d] != q->sequence) { if (!wait) - return FALSE; + return false; if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nvc0->base.client)) - return FALSE; + return false; } if (q->ctr[c] & ~0x3) count[p][c] = q->data[b + 16 + (q->ctr[c] & 3)]; @@ -1302,7 +1302,7 @@ nve4_mp_pm_query_read_data(uint32_t count[32][4], } } } - return TRUE; + return true; } /* Metric calculations: @@ -1325,7 +1325,7 @@ nvc0_mp_pm_query_result(struct nvc0_context *nvc0, struct nvc0_query *q, unsigned mp_count = MIN2(nvc0->screen->mp_count_compute, 32); unsigned p, c; const struct nvc0_mp_pm_query_cfg *cfg; - boolean ret; + bool ret; cfg = nvc0_mp_pm_query_get_cfg(nvc0, q); @@ -1334,7 +1334,7 @@ nvc0_mp_pm_query_result(struct nvc0_context *nvc0, struct nvc0_query *q, else ret = nvc0_mp_pm_query_read_data(count, nvc0, wait, q, cfg, mp_count); if (!ret) - return FALSE; + return false; if (cfg->op == NVC0_COUNTER_OPn_SUM) { for (c = 0; c < cfg->num_counters; ++c) @@ -1394,7 +1394,7 @@ nvc0_mp_pm_query_result(struct nvc0_context *nvc0, struct nvc0_query *q, } *(uint64_t *)result = value; - return TRUE; + return true; } int diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c index 56c230e42fc..ab19b26f156 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c @@ -44,16 +44,16 @@ nvc0_screen_is_format_supported(struct pipe_screen *pscreen, unsigned bindings) { if (sample_count > 8) - return FALSE; + return false; if (!(0x117 & (1 << sample_count))) /* 0, 1, 2, 4 or 8 */ - return FALSE; + return false; if (!util_format_is_supported(format, bindings)) - return FALSE; + return false; if ((bindings & PIPE_BIND_SAMPLER_VIEW) && (target != PIPE_BUFFER)) if (util_format_get_blocksizebits(format) == 3 * 32) - return FALSE; + return false; /* transfers & shared are always supported */ bindings &= ~(PIPE_BIND_TRANSFER_READ | @@ -120,6 +120,8 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) return PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_NV50; case PIPE_CAP_ENDIANNESS: 
return PIPE_ENDIAN_LITTLE; + case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: + return 30; /* supported caps */ case PIPE_CAP_TEXTURE_MIRROR_CLAMP: @@ -163,7 +165,6 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_USER_CONSTANT_BUFFERS: case PIPE_CAP_USER_INDEX_BUFFERS: case PIPE_CAP_USER_VERTEX_BUFFERS: - case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: case PIPE_CAP_TEXTURE_QUERY_LOD: case PIPE_CAP_SAMPLE_SHADING: case PIPE_CAP_TEXTURE_GATHER_OFFSETS: @@ -174,11 +175,16 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_CLIP_HALFZ: case PIPE_CAP_POLYGON_OFFSET_CLAMP: case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: + case PIPE_CAP_TEXTURE_FLOAT_LINEAR: + case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: + case PIPE_CAP_DEPTH_BOUNDS_TEST: return 1; case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: return (class_3d >= NVE4_3D_CLASS) ? 1 : 0; case PIPE_CAP_COMPUTE: return (class_3d == NVE4_3D_CLASS) ? 1 : 0; + case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: + return nouveau_screen(pscreen)->vram_domain & NOUVEAU_BO_VRAM ? 1 : 0; /* unsupported caps */ case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT: @@ -226,13 +232,14 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, switch (shader) { case PIPE_SHADER_VERTEX: - /* - case PIPE_SHADER_TESSELLATION_CONTROL: - case PIPE_SHADER_TESSELLATION_EVALUATION: - */ case PIPE_SHADER_GEOMETRY: case PIPE_SHADER_FRAGMENT: break; + case PIPE_SHADER_TESS_CTRL: + case PIPE_SHADER_TESS_EVAL: + if (class_3d >= GM107_3D_CLASS) + return 0; + break; case PIPE_SHADER_COMPUTE: if (class_3d != NVE4_3D_CLASS) return 0; @@ -341,6 +348,7 @@ nvc0_screen_get_compute_param(struct pipe_screen *pscreen, enum pipe_compute_cap param, void *data) { uint64_t *data64 = (uint64_t *)data; + uint32_t *data32 = (uint32_t *)data; const uint16_t obj_class = nvc0_screen(pscreen)->compute->oclass; switch (param) { @@ -372,6 +380,9 @@ nvc0_screen_get_compute_param(struct pipe_screen *pscreen, case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE: /* c[], arbitrary limit */ data64[0] = 4096; return 8; + case PIPE_COMPUTE_CAP_SUBGROUP_SIZE: + data32[0] = 32; + return 4; default: return 0; } @@ -550,7 +561,7 @@ nvc0_screen_init_compute(struct nvc0_screen *screen) /* Using COMPUTE has weird effects on 3D state, we need to * investigate this further before enabling it by default. */ - if (debug_get_bool_option("NVC0_COMPUTE", FALSE)) + if (debug_get_bool_option("NVC0_COMPUTE", false)) return nvc0_screen_compute_setup(screen, screen->base.pushbuf); return 0; case 0xe0: @@ -564,7 +575,7 @@ nvc0_screen_init_compute(struct nvc0_screen *screen) } } -boolean +bool nvc0_screen_resize_tls_area(struct nvc0_screen *screen, uint32_t lpos, uint32_t lneg, uint32_t cstack) { @@ -574,7 +585,7 @@ nvc0_screen_resize_tls_area(struct nvc0_screen *screen, if (size >= (1 << 20)) { NOUVEAU_ERR("requested TLS size too large: 0x%"PRIx64"\n", size); - return FALSE; + return false; } size *= (screen->base.device->chipset >= 0xe0) ? 
64 : 48; /* max warps */ @@ -587,11 +598,11 @@ nvc0_screen_resize_tls_area(struct nvc0_screen *screen, NULL, &bo); if (ret) { NOUVEAU_ERR("failed to allocate TLS area, size: 0x%"PRIx64"\n", size); - return FALSE; + return false; } nouveau_bo_ref(NULL, &screen->tls); screen->tls = bo; - return TRUE; + return true; } #define FAIL_SCREEN_INIT(str, err) \ @@ -610,6 +621,7 @@ nvc0_screen_create(struct nouveau_device *dev) struct nouveau_pushbuf *push; uint64_t value; uint32_t obj_class; + uint32_t flags; int ret; unsigned i; @@ -665,8 +677,11 @@ nvc0_screen_create(struct nouveau_device *dev) screen->base.base.get_video_param = nouveau_vp3_screen_get_video_param; screen->base.base.is_video_format_supported = nouveau_vp3_screen_video_supported; - ret = nouveau_bo_new(dev, NOUVEAU_BO_GART | NOUVEAU_BO_MAP, 0, 4096, NULL, - &screen->fence.bo); + flags = NOUVEAU_BO_GART | NOUVEAU_BO_MAP; + if (dev->drm_version >= 0x01000202) + flags |= NOUVEAU_BO_COHERENT; + + ret = nouveau_bo_new(dev, flags, 0, 4096, NULL, &screen->fence.bo); if (ret) goto fail; nouveau_bo_map(screen->fence.bo, 0, NULL); @@ -781,7 +796,7 @@ nvc0_screen_create(struct nouveau_device *dev) BEGIN_NVC0(push, NVC0_3D(COND_MODE), 1); PUSH_DATA (push, NVC0_3D_COND_MODE_ALWAYS); - if (debug_get_bool_option("NOUVEAU_SHADER_WATCHDOG", TRUE)) { + if (debug_get_bool_option("NOUVEAU_SHADER_WATCHDOG", true)) { /* kill shaders after about 1 second (at 100 MHz) */ BEGIN_NVC0(push, NVC0_3D(WATCHDOG_TIMER), 1); PUSH_DATA (push, 0x17); @@ -1012,6 +1027,7 @@ nvc0_screen_create(struct nouveau_device *dev) PUSH_DATA (push, 0x20); BEGIN_NVC0(push, NVC0_3D(SP_SELECT(0)), 1); PUSH_DATA (push, 0x00); + screen->save_state.patch_vertices = 3; BEGIN_NVC0(push, NVC0_3D(POINT_COORD_REPLACE), 1); PUSH_DATA (push, 0); @@ -1031,7 +1047,7 @@ nvc0_screen_create(struct nouveau_device *dev) if (!nvc0_blitter_create(screen)) goto fail; - nouveau_fence_new(&screen->base, &screen->base.fence.current, FALSE); + nouveau_fence_new(&screen->base, &screen->base.fence.current, false); return pscreen; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h index ef2bd43f006..d8826ae0c0d 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h @@ -28,16 +28,17 @@ struct nvc0_context; struct nvc0_blitter; struct nvc0_graph_state { - boolean flushed; - boolean rasterizer_discard; - boolean early_z_forced; - boolean prim_restart; + bool flushed; + bool rasterizer_discard; + bool early_z_forced; + bool prim_restart; uint32_t instance_elts; /* bitmask of per-instance elements */ uint32_t instance_base; uint32_t constant_vbos; uint32_t constant_elts; int32_t index_bias; uint16_t scissor; + uint8_t patch_vertices; uint8_t vbo_mode; /* 0 = normal, 1 = translate, 3 = translate, forced */ uint8_t num_vtxbufs; uint8_t num_vtxelts; @@ -95,7 +96,7 @@ struct nvc0_screen { struct nvc0_program *prog; /* compute state object to read MP counters */ struct pipe_query *mp_counter[8]; /* counter to query allocation */ uint8_t num_mp_pm_active[2]; - boolean mp_counters_enabled; + bool mp_counters_enabled; } pm; struct nouveau_object *eng3d; /* sqrt(1/2)|kepler> + sqrt(1/2)|fermi> */ @@ -105,7 +106,7 @@ struct nvc0_screen { struct nouveau_object *nvsw; }; -static INLINE struct nvc0_screen * +static inline struct nvc0_screen * nvc0_screen(struct pipe_screen *screen) { return (struct nvc0_screen *)screen; @@ -276,7 +277,7 @@ int nvc0_screen_get_driver_query_info(struct pipe_screen *, unsigned, int 
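The fence-buffer hunk above enables NOUVEAU_BO_COHERENT only on new enough kernels. The 0x01000202 threshold reads naturally if nouveau's drm_version packs major << 24 | minor << 8 | patchlevel; that packing is an assumption here, not something the diff states:

    #include <stdint.h>

    /* Assumed libdrm_nouveau version packing (unverified assumption). */
    static inline uint32_t
    nouveau_drm_version_pack(unsigned major, unsigned minor, unsigned patch)
    {
       return ((uint32_t)major << 24) | ((uint32_t)minor << 8) | patch;
    }

    /* Under that packing, 0x01000202 == version 1.2.2, i.e. coherent
     * fence buffers would require nouveau DRM 1.2.2 or newer. */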
nvc0_screen_get_driver_query_group_info(struct pipe_screen *, unsigned, struct pipe_driver_query_group_info *); -boolean nvc0_blitter_create(struct nvc0_screen *); +bool nvc0_blitter_create(struct nvc0_screen *); void nvc0_blitter_destroy(struct nvc0_screen *); void nvc0_screen_make_buffers_resident(struct nvc0_screen *); @@ -287,10 +288,10 @@ int nvc0_screen_tsc_alloc(struct nvc0_screen *, void *); int nve4_screen_compute_setup(struct nvc0_screen *, struct nouveau_pushbuf *); int nvc0_screen_compute_setup(struct nvc0_screen *, struct nouveau_pushbuf *); -boolean nvc0_screen_resize_tls_area(struct nvc0_screen *, uint32_t lpos, - uint32_t lneg, uint32_t cstack); +bool nvc0_screen_resize_tls_area(struct nvc0_screen *, uint32_t lpos, + uint32_t lneg, uint32_t cstack); -static INLINE void +static inline void nvc0_resource_fence(struct nv04_resource *res, uint32_t flags) { struct nvc0_screen *screen = nvc0_screen(res->base.screen); @@ -302,7 +303,7 @@ nvc0_resource_fence(struct nv04_resource *res, uint32_t flags) } } -static INLINE void +static inline void nvc0_resource_validate(struct nv04_resource *res, uint32_t flags) { if (likely(res->bo)) { @@ -325,21 +326,21 @@ struct nvc0_format { extern const struct nvc0_format nvc0_format_table[]; -static INLINE void +static inline void nvc0_screen_tic_unlock(struct nvc0_screen *screen, struct nv50_tic_entry *tic) { if (tic->id >= 0) screen->tic.lock[tic->id / 32] &= ~(1 << (tic->id % 32)); } -static INLINE void +static inline void nvc0_screen_tsc_unlock(struct nvc0_screen *screen, struct nv50_tsc_entry *tsc) { if (tsc->id >= 0) screen->tsc.lock[tsc->id / 32] &= ~(1 << (tsc->id % 32)); } -static INLINE void +static inline void nvc0_screen_tic_free(struct nvc0_screen *screen, struct nv50_tic_entry *tic) { if (tic->id >= 0) { @@ -348,7 +349,7 @@ nvc0_screen_tic_free(struct nvc0_screen *screen, struct nv50_tic_entry *tic) } } -static INLINE void +static inline void nvc0_screen_tsc_free(struct nvc0_screen *screen, struct nv50_tsc_entry *tsc) { if (tsc->id >= 0) { diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c index e0842784a88..8aa127adc0a 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c @@ -27,7 +27,7 @@ #include "nvc0/nvc0_context.h" -static INLINE void +static inline void nvc0_program_update_context_state(struct nvc0_context *nvc0, struct nvc0_program *prog, int stage) { @@ -63,22 +63,22 @@ nvc0_program_update_context_state(struct nvc0_context *nvc0, } } -static INLINE boolean +static inline bool nvc0_program_validate(struct nvc0_context *nvc0, struct nvc0_program *prog) { if (prog->mem) - return TRUE; + return true; if (!prog->translated) { prog->translated = nvc0_program_translate( prog, nvc0->screen->base.device->chipset); if (!prog->translated) - return FALSE; + return false; } if (likely(prog->code_size)) return nvc0_program_upload_code(nvc0, prog); - return TRUE; /* stream output info only */ + return true; /* stream output info only */ } void @@ -147,9 +147,6 @@ nvc0_tctlprog_validate(struct nvc0_context *nvc0) PUSH_DATA (push, tp->code_base); BEGIN_NVC0(push, NVC0_3D(SP_GPR_ALLOC(2)), 1); PUSH_DATA (push, tp->num_gprs); - - if (tp->tp.input_patch_size <= 32) - IMMED_NVC0(push, NVC0_3D(PATCH_VERTICES), tp->tp.input_patch_size); } else { BEGIN_NVC0(push, NVC0_3D(SP_SELECT(2)), 1); PUSH_DATA (push, 0x20); @@ -192,7 +189,7 @@ nvc0_gmtyprog_validate(struct nvc0_context *nvc0) /* we allow GPs with no 
code for specifying stream output state only */ if (gp && gp->code_size) { - const boolean gp_selects_layer = !!(gp->hdr[13] & (1 << 9)); + const bool gp_selects_layer = !!(gp->hdr[13] & (1 << 9)); BEGIN_NVC0(push, NVC0_3D(MACRO_GP_SELECT), 1); PUSH_DATA (push, 0x41); @@ -280,7 +277,7 @@ nvc0_tfb_validate(struct nvc0_context *nvc0) nvc0_query_pushbuf_submit(push, targ->pq, 0x4); } else { PUSH_DATA(push, 0); /* TFB_BUFFER_OFFSET */ - targ->clean = FALSE; + targ->clean = false; } } for (; b < 4; ++b) diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c index 6b7a211e71b..2a33857d9df 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c @@ -35,7 +35,7 @@ #include "nouveau_gldefs.h" -static INLINE uint32_t +static inline uint32_t nvc0_colormask(unsigned mask) { uint32_t ret = 0; @@ -55,7 +55,7 @@ nvc0_colormask(unsigned mask) #define NVC0_BLEND_FACTOR_CASE(a, b) \ case PIPE_BLENDFACTOR_##a: return NV50_BLEND_FACTOR_##b -static INLINE uint32_t +static inline uint32_t nvc0_blend_fac(unsigned factor) { switch (factor) { @@ -92,8 +92,8 @@ nvc0_blend_state_create(struct pipe_context *pipe, int r; /* reference */ uint32_t ms; uint8_t blend_en = 0; - boolean indep_masks = FALSE; - boolean indep_funcs = FALSE; + bool indep_masks = false; + bool indep_funcs = false; so->pipe = *cso; @@ -111,7 +111,7 @@ nvc0_blend_state_create(struct pipe_context *pipe, cso->rt[i].alpha_func != cso->rt[r].alpha_func || cso->rt[i].alpha_src_factor != cso->rt[r].alpha_src_factor || cso->rt[i].alpha_dst_factor != cso->rt[r].alpha_dst_factor) { - indep_funcs = TRUE; + indep_funcs = true; break; } } @@ -120,7 +120,7 @@ nvc0_blend_state_create(struct pipe_context *pipe, for (i = 1; i < 8; ++i) { if (cso->rt[i].colormask != cso->rt[0].colormask) { - indep_masks = TRUE; + indep_masks = true; break; } } @@ -351,6 +351,13 @@ nvc0_zsa_state_create(struct pipe_context *pipe, SB_DATA (so, nvgl_comparison_op(cso->depth.func)); } + SB_IMMED_3D(so, DEPTH_BOUNDS_EN, cso->depth.bounds_test); + if (cso->depth.bounds_test) { + SB_BEGIN_3D(so, DEPTH_BOUNDS(0), 2); + SB_DATA (so, fui(cso->depth.bounds_min)); + SB_DATA (so, fui(cso->depth.bounds_max)); + } + if (cso->stencil[0].enabled) { SB_BEGIN_3D(so, STENCIL_ENABLE, 5); SB_DATA (so, 1); @@ -428,7 +435,7 @@ nvc0_sampler_state_delete(struct pipe_context *pipe, void *hwcso) FREE(hwcso); } -static INLINE void +static inline void nvc0_stage_sampler_states_bind(struct nvc0_context *nvc0, int s, unsigned nr, void **hwcso) { @@ -508,6 +515,14 @@ nvc0_bind_sampler_states(struct pipe_context *pipe, unsigned shader, assert(start == 0); nvc0_stage_sampler_states_bind(nvc0_context(pipe), 0, nr, s); break; + case PIPE_SHADER_TESS_CTRL: + assert(start == 0); + nvc0_stage_sampler_states_bind(nvc0_context(pipe), 1, nr, s); + break; + case PIPE_SHADER_TESS_EVAL: + assert(start == 0); + nvc0_stage_sampler_states_bind(nvc0_context(pipe), 2, nr, s); + break; case PIPE_SHADER_GEOMETRY: assert(start == 0); nvc0_stage_sampler_states_bind(nvc0_context(pipe), 3, nr, s); @@ -537,7 +552,7 @@ nvc0_sampler_view_destroy(struct pipe_context *pipe, FREE(nv50_tic_entry(view)); } -static INLINE void +static inline void nvc0_stage_set_sampler_views(struct nvc0_context *nvc0, int s, unsigned nr, struct pipe_sampler_view **views) @@ -633,6 +648,12 @@ nvc0_set_sampler_views(struct pipe_context *pipe, unsigned shader, case PIPE_SHADER_VERTEX: nvc0_stage_set_sampler_views(nvc0_context(pipe), 0, nr, views); break; + case 
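A sizing note for the depth-bounds hunk above: it appends one SB_IMMED_3D() and one two-word SB_BEGIN_3D() to the zsa state buffer, which is why the stateobj array grows from 26 to 30 uint32_t in the nvc0_stateobj.h hunk further down. The accounting, assuming one header word per macro (inferred from the 26-to-30 delta, not spelled out in the diff):

    /* Word budget added to struct nvc0_zsa_stateobj::state by depth bounds. */
    enum {
       ZSA_WORDS_OLD         = 26,
       DEPTH_BOUNDS_EN_WORDS = 1,     /* SB_IMMED_3D: header-only method */
       DEPTH_BOUNDS_WORDS    = 1 + 2, /* SB_BEGIN_3D header + min + max  */
       ZSA_WORDS_NEW         = ZSA_WORDS_OLD + DEPTH_BOUNDS_EN_WORDS +
                               DEPTH_BOUNDS_WORDS /* == 30 */
    };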
PIPE_SHADER_TESS_CTRL: + nvc0_stage_set_sampler_views(nvc0_context(pipe), 1, nr, views); + break; + case PIPE_SHADER_TESS_EVAL: + nvc0_stage_set_sampler_views(nvc0_context(pipe), 2, nr, views); + break; case PIPE_SHADER_GEOMETRY: nvc0_stage_set_sampler_views(nvc0_context(pipe), 3, nr, views); break; @@ -734,6 +755,38 @@ nvc0_gp_state_bind(struct pipe_context *pipe, void *hwcso) } static void * +nvc0_tcp_state_create(struct pipe_context *pipe, + const struct pipe_shader_state *cso) +{ + return nvc0_sp_state_create(pipe, cso, PIPE_SHADER_TESS_CTRL); +} + +static void +nvc0_tcp_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nvc0_context *nvc0 = nvc0_context(pipe); + + nvc0->tctlprog = hwcso; + nvc0->dirty |= NVC0_NEW_TCTLPROG; +} + +static void * +nvc0_tep_state_create(struct pipe_context *pipe, + const struct pipe_shader_state *cso) +{ + return nvc0_sp_state_create(pipe, cso, PIPE_SHADER_TESS_EVAL); +} + +static void +nvc0_tep_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nvc0_context *nvc0 = nvc0_context(pipe); + + nvc0->tevlprog = hwcso; + nvc0->dirty |= NVC0_NEW_TEVLPROG; +} + +static void * nvc0_cp_state_create(struct pipe_context *pipe, const struct pipe_compute_state *cso) { @@ -790,7 +843,7 @@ nvc0_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index, pipe_resource_reference(&nvc0->constbuf[s][i].u.buf, res); - nvc0->constbuf[s][i].user = (cb && cb->user_buffer) ? TRUE : FALSE; + nvc0->constbuf[s][i].user = (cb && cb->user_buffer) ? true : false; if (nvc0->constbuf[s][i].user) { nvc0->constbuf[s][i].u.data = cb->user_buffer; nvc0->constbuf[s][i].size = MIN2(cb->buffer_size, 0x10000); @@ -934,6 +987,18 @@ nvc0_set_viewport_states(struct pipe_context *pipe, } static void +nvc0_set_tess_state(struct pipe_context *pipe, + const float default_tess_outer[4], + const float default_tess_inner[2]) +{ + struct nvc0_context *nvc0 = nvc0_context(pipe); + + memcpy(nvc0->default_tess_outer, default_tess_outer, 4 * sizeof(float)); + memcpy(nvc0->default_tess_inner, default_tess_inner, 2 * sizeof(float)); + nvc0->dirty |= NVC0_NEW_TESSFACTOR; +} + +static void nvc0_set_vertex_buffers(struct pipe_context *pipe, unsigned start_slot, unsigned count, const struct pipe_vertex_buffer *vb) @@ -1018,7 +1083,7 @@ nvc0_so_target_create(struct pipe_context *pipe, FREE(targ); return NULL; } - targ->clean = TRUE; + targ->clean = true; targ->pipe.buffer_size = size; targ->pipe.buffer_offset = offset; @@ -1051,13 +1116,13 @@ nvc0_set_transform_feedback_targets(struct pipe_context *pipe, { struct nvc0_context *nvc0 = nvc0_context(pipe); unsigned i; - boolean serialize = TRUE; + bool serialize = true; assert(num_targets <= 4); for (i = 0; i < num_targets; ++i) { - const boolean changed = nvc0->tfbbuf[i] != targets[i]; - const boolean append = (offsets[i] == ((unsigned)-1)); + const bool changed = nvc0->tfbbuf[i] != targets[i]; + const bool append = (offsets[i] == ((unsigned)-1)); if (!changed && append) continue; nvc0->tfbbuf_dirty |= 1 << i; @@ -1066,7 +1131,7 @@ nvc0_set_transform_feedback_targets(struct pipe_context *pipe, nvc0_so_target_save_offset(pipe, nvc0->tfbbuf[i], i, &serialize); if (targets[i] && !append) - nvc0_so_target(targets[i])->clean = TRUE; + nvc0_so_target(targets[i])->clean = true; pipe_so_target_reference(&nvc0->tfbbuf[i], targets[i]); } @@ -1125,16 +1190,18 @@ nvc0_set_compute_resources(struct pipe_context *pipe, } static void -nvc0_set_shader_resources(struct pipe_context *pipe, - unsigned start, unsigned nr, - struct pipe_surface 
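nvc0_set_tess_state() above caches the default tessellation levels used when no control shader supplies them; nvc0_validate_tess_state() in the state-validate hunks below then pushes all six floats in a single TESS_LEVEL_OUTER(0) burst. At the API end these defaults correspond to the GL 4.0 patch parameters (illustrative sketch; assumes a core context with the tessellation entry points loaded):

    #include <GL/gl.h>
    #include <GL/glext.h>

    static void
    set_default_tess_levels(void)
    {
       const GLfloat outer[4] = { 1.0f, 1.0f, 1.0f, 1.0f };
       const GLfloat inner[2] = { 1.0f, 1.0f };
       /* Only consulted when drawing GL_PATCHES without a TCS bound. */
       glPatchParameterfv(GL_PATCH_DEFAULT_OUTER_LEVEL, outer);
       glPatchParameterfv(GL_PATCH_DEFAULT_INNER_LEVEL, inner);
    }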
**resources) +nvc0_set_shader_images(struct pipe_context *pipe, unsigned shader, + unsigned start_slot, unsigned count, + struct pipe_image_view **views) { - nvc0_bind_surfaces_range(nvc0_context(pipe), 0, start, nr, resources); +#if 0 + nvc0_bind_surfaces_range(nvc0_context(pipe), 0, start, nr, views); nvc0_context(pipe)->dirty |= NVC0_NEW_SURFACES; +#endif } -static INLINE void +static inline void nvc0_set_global_handle(uint32_t *phandle, struct pipe_resource *res) { struct nv04_resource *buf = nv04_resource(res); @@ -1218,12 +1285,18 @@ nvc0_init_state_functions(struct nvc0_context *nvc0) pipe->create_vs_state = nvc0_vp_state_create; pipe->create_fs_state = nvc0_fp_state_create; pipe->create_gs_state = nvc0_gp_state_create; + pipe->create_tcs_state = nvc0_tcp_state_create; + pipe->create_tes_state = nvc0_tep_state_create; pipe->bind_vs_state = nvc0_vp_state_bind; pipe->bind_fs_state = nvc0_fp_state_bind; pipe->bind_gs_state = nvc0_gp_state_bind; + pipe->bind_tcs_state = nvc0_tcp_state_bind; + pipe->bind_tes_state = nvc0_tep_state_bind; pipe->delete_vs_state = nvc0_sp_state_delete; pipe->delete_fs_state = nvc0_sp_state_delete; pipe->delete_gs_state = nvc0_sp_state_delete; + pipe->delete_tcs_state = nvc0_sp_state_delete; + pipe->delete_tes_state = nvc0_sp_state_delete; pipe->create_compute_state = nvc0_cp_state_create; pipe->bind_compute_state = nvc0_cp_state_bind; @@ -1239,6 +1312,7 @@ nvc0_init_state_functions(struct nvc0_context *nvc0) pipe->set_polygon_stipple = nvc0_set_polygon_stipple; pipe->set_scissor_states = nvc0_set_scissor_states; pipe->set_viewport_states = nvc0_set_viewport_states; + pipe->set_tess_state = nvc0_set_tess_state; pipe->create_vertex_elements_state = nvc0_vertex_state_create; pipe->delete_vertex_elements_state = nvc0_vertex_state_delete; @@ -1253,8 +1327,14 @@ nvc0_init_state_functions(struct nvc0_context *nvc0) pipe->set_global_binding = nvc0_set_global_bindings; pipe->set_compute_resources = nvc0_set_compute_resources; - pipe->set_shader_resources = nvc0_set_shader_resources; + pipe->set_shader_images = nvc0_set_shader_images; nvc0->sample_mask = ~0; nvc0->min_samples = 1; + nvc0->default_tess_outer[0] = + nvc0->default_tess_outer[1] = + nvc0->default_tess_outer[2] = + nvc0->default_tess_outer[3] = 1.0; + nvc0->default_tess_inner[0] = + nvc0->default_tess_inner[1] = 1.0; } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c index c52399ab312..ce1119c284d 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c @@ -55,7 +55,7 @@ nvc0_validate_zcull(struct nvc0_context *nvc0) } #endif -static INLINE void +static inline void nvc0_fb_set_null_rt(struct nouveau_pushbuf *push, unsigned i) { BEGIN_NVC0(push, NVC0_3D(RT_ADDRESS_HIGH(i)), 6); @@ -74,7 +74,7 @@ nvc0_validate_fb(struct nvc0_context *nvc0) struct pipe_framebuffer_state *fb = &nvc0->framebuffer; unsigned i, ms; unsigned ms_mode = NVC0_3D_MULTISAMPLE_MODE_MS1; - boolean serialize = FALSE; + bool serialize = false; nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_FB); @@ -136,7 +136,7 @@ nvc0_validate_fb(struct nvc0_context *nvc0) } if (res->status & NOUVEAU_BUFFER_STATUS_GPU_READING) - serialize = TRUE; + serialize = true; res->status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING; res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_READING; @@ -168,7 +168,7 @@ nvc0_validate_fb(struct nvc0_context *nvc0) ms_mode = mt->ms_mode; if (mt->base.status & NOUVEAU_BUFFER_STATUS_GPU_READING) - 
serialize = TRUE; + serialize = true; mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING; mt->base.status &= ~NOUVEAU_BUFFER_STATUS_GPU_READING; @@ -309,7 +309,7 @@ nvc0_validate_viewport(struct nvc0_context *nvc0) nvc0->viewports_dirty = 0; } -static INLINE void +static inline void nvc0_upload_uclip_planes(struct nvc0_context *nvc0, unsigned s) { struct nouveau_pushbuf *push = nvc0->base.pushbuf; @@ -324,7 +324,7 @@ nvc0_upload_uclip_planes(struct nvc0_context *nvc0, unsigned s) PUSH_DATAp(push, &nvc0->clip.ucp[0][0], PIPE_MAX_CLIP_PLANES * 4); } -static INLINE void +static inline void nvc0_check_program_ucps(struct nvc0_context *nvc0, struct nvc0_program *vp, uint8_t mask) { @@ -339,7 +339,7 @@ nvc0_check_program_ucps(struct nvc0_context *nvc0, nvc0_vertprog_validate(nvc0); else if (likely(vp == nvc0->gmtyprog)) - nvc0_vertprog_validate(nvc0); + nvc0_gmtyprog_validate(nvc0); else nvc0_tevlprog_validate(nvc0); } @@ -455,6 +455,8 @@ nvc0_constbufs_validate(struct nvc0_context *nvc0) PUSH_DATA (push, (i << 4) | 1); BCTX_REFN(nvc0->bufctx_3d, CB(s, i), res, RD); + + nvc0->cb_dirty = 1; /* Force cache flush for UBO. */ } else { BEGIN_NVC0(push, NVC0_3D(CB_BIND(s)), 1); PUSH_DATA (push, (i << 4) | 0); @@ -518,12 +520,12 @@ static void nvc0_validate_derived_1(struct nvc0_context *nvc0) { struct nouveau_pushbuf *push = nvc0->base.pushbuf; - boolean rasterizer_discard; + bool rasterizer_discard; if (nvc0->rast && nvc0->rast->pipe.rasterizer_discard) { - rasterizer_discard = TRUE; + rasterizer_discard = true; } else { - boolean zs = nvc0->zsa && + bool zs = nvc0->zsa && (nvc0->zsa->pipe.depth.enabled || nvc0->zsa->pipe.stencil[0].enabled); rasterizer_discard = !zs && (!nvc0->fragprog || !nvc0->fragprog->hdr[18]); @@ -535,6 +537,33 @@ nvc0_validate_derived_1(struct nvc0_context *nvc0) } } +/* alpha test is disabled if there are no color RTs, so make sure we have at + * least one if alpha test is enabled. Note that this must run after + * nvc0_validate_fb, otherwise that will override the RT count setting. 
+ */ +static void +nvc0_validate_derived_2(struct nvc0_context *nvc0) +{ + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + + if (nvc0->zsa && nvc0->zsa->pipe.alpha.enabled && + nvc0->framebuffer.nr_cbufs == 0) { + nvc0_fb_set_null_rt(push, 0); + BEGIN_NVC0(push, NVC0_3D(RT_CONTROL), 1); + PUSH_DATA (push, (076543210 << 4) | 1); + } +} + +static void +nvc0_validate_tess_state(struct nvc0_context *nvc0) +{ + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + + BEGIN_NVC0(push, NVC0_3D(TESS_LEVEL_OUTER(0)), 6); + PUSH_DATAp(push, nvc0->default_tess_outer, 4); + PUSH_DATAp(push, nvc0->default_tess_inner, 2); +} + static void nvc0_switch_pipe_context(struct nvc0_context *ctx_to) { @@ -593,10 +622,12 @@ static struct state_validate { { nvc0_vertprog_validate, NVC0_NEW_VERTPROG }, { nvc0_tctlprog_validate, NVC0_NEW_TCTLPROG }, { nvc0_tevlprog_validate, NVC0_NEW_TEVLPROG }, + { nvc0_validate_tess_state, NVC0_NEW_TESSFACTOR }, { nvc0_gmtyprog_validate, NVC0_NEW_GMTYPROG }, { nvc0_fragprog_validate, NVC0_NEW_FRAGPROG }, { nvc0_validate_derived_1, NVC0_NEW_FRAGPROG | NVC0_NEW_ZSA | NVC0_NEW_RASTERIZER }, + { nvc0_validate_derived_2, NVC0_NEW_ZSA | NVC0_NEW_FRAMEBUFFER }, { nvc0_validate_clip, NVC0_NEW_CLIP | NVC0_NEW_RASTERIZER | NVC0_NEW_VERTPROG | NVC0_NEW_TEVLPROG | @@ -613,7 +644,7 @@ static struct state_validate { }; #define validate_list_len (sizeof(validate_list) / sizeof(validate_list[0])) -boolean +bool nvc0_state_validate(struct nvc0_context *nvc0, uint32_t mask, unsigned words) { uint32_t state_mask; @@ -634,15 +665,15 @@ nvc0_state_validate(struct nvc0_context *nvc0, uint32_t mask, unsigned words) } nvc0->dirty &= ~state_mask; - nvc0_bufctx_fence(nvc0, nvc0->bufctx_3d, FALSE); + nvc0_bufctx_fence(nvc0, nvc0->bufctx_3d, false); } nouveau_pushbuf_bufctx(nvc0->base.pushbuf, nvc0->bufctx_3d); ret = nouveau_pushbuf_validate(nvc0->base.pushbuf); if (unlikely(nvc0->state.flushed)) { - nvc0->state.flushed = FALSE; - nvc0_bufctx_fence(nvc0, nvc0->bufctx_3d, TRUE); + nvc0->state.flushed = false; + nvc0_bufctx_fence(nvc0, nvc0->bufctx_3d, true); } return !ret; } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h b/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h index 1d70b7c7b23..18fcc12dea3 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h @@ -29,7 +29,7 @@ struct nvc0_rasterizer_stateobj { struct nvc0_zsa_stateobj { struct pipe_depth_stencil_alpha_state pipe; int size; - uint32_t state[26]; + uint32_t state[30]; }; struct nvc0_constbuf { @@ -39,7 +39,7 @@ struct nvc0_constbuf { } u; uint32_t size; uint32_t offset; - boolean user; /* should only be TRUE if u.data is valid and non-NULL */ + bool user; /* should only be true if u.data is valid and non-NULL */ }; struct nvc0_vertex_element { @@ -55,8 +55,8 @@ struct nvc0_vertex_stateobj { unsigned num_elements; uint32_t instance_elts; uint32_t instance_bufs; - boolean shared_slots; - boolean need_conversion; /* e.g. VFETCH cannot convert f64 to f32 */ + bool shared_slots; + bool need_conversion; /* e.g. 
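The constant in nvc0_validate_derived_2() above is an octal literal. Assuming RT_CONTROL keeps the render-target count in its low four bits with an eight-slot, three-bits-per-entry RT index map above them (a layout inferred from the code, not stated in the diff), 076543210 is simply the identity map 7,6,5,4,3,2,1,0. Sketch of the packing:

    #include <stdint.h>

    /* Build an RT_CONTROL word with an identity RT map and `count` RTs. */
    static uint32_t
    rt_control_identity(unsigned count)
    {
       uint32_t map = 0;
       unsigned i;
       for (i = 0; i < 8; i++)
          map |= (uint32_t)i << (3 * i); /* one octal digit per slot */
       /* map == 076543210, so rt_control_identity(1) matches the push. */
       return (map << 4) | count;
    }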
VFETCH cannot convert f64 to f32 */ unsigned size; /* size of vertex in bytes (when packed) */ struct nvc0_vertex_element element[0]; }; @@ -65,10 +65,10 @@ struct nvc0_so_target { struct pipe_stream_output_target pipe; struct pipe_query *pq; unsigned stride; - boolean clean; + bool clean; }; -static INLINE struct nvc0_so_target * +static inline struct nvc0_so_target * nvc0_so_target(struct pipe_stream_output_target *ptarg) { return (struct nvc0_so_target *)ptarg; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c index a820de7259a..51a6f93f891 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c @@ -47,8 +47,8 @@ #define NOUVEAU_DRIVER 0xc0 #include "nv50/nv50_blit.h" -static INLINE uint8_t -nvc0_2d_format(enum pipe_format format, boolean dst, boolean dst_src_equal) +static inline uint8_t +nvc0_2d_format(enum pipe_format format, bool dst, bool dst_src_equal) { uint8_t id = nvc0_format_table[format].rt; @@ -81,9 +81,9 @@ nvc0_2d_format(enum pipe_format format, boolean dst, boolean dst_src_equal) } static int -nvc0_2d_texture_set(struct nouveau_pushbuf *push, boolean dst, +nvc0_2d_texture_set(struct nouveau_pushbuf *push, bool dst, struct nv50_miptree *mt, unsigned level, unsigned layer, - enum pipe_format pformat, boolean dst_src_pformat_equal) + enum pipe_format pformat, bool dst_src_pformat_equal) { struct nouveau_bo *bo = mt->base.bo; uint32_t width, height, depth; @@ -161,16 +161,16 @@ nvc0_2d_texture_do_copy(struct nouveau_pushbuf *push, const enum pipe_format dfmt = dst->base.base.format; const enum pipe_format sfmt = src->base.base.format; int ret; - boolean eqfmt = dfmt == sfmt; + bool eqfmt = dfmt == sfmt; if (!PUSH_SPACE(push, 2 * 16 + 32)) return PIPE_ERROR; - ret = nvc0_2d_texture_set(push, TRUE, dst, dst_level, dz, dfmt, eqfmt); + ret = nvc0_2d_texture_set(push, true, dst, dst_level, dz, dfmt, eqfmt); if (ret) return ret; - ret = nvc0_2d_texture_set(push, FALSE, src, src_level, sz, sfmt, eqfmt); + ret = nvc0_2d_texture_set(push, false, src, src_level, sz, sfmt, eqfmt); if (ret) return ret; @@ -189,7 +189,7 @@ nvc0_2d_texture_do_copy(struct nouveau_pushbuf *push, PUSH_DATA (push, 0); PUSH_DATA (push, sx << src->ms_x); PUSH_DATA (push, 0); - PUSH_DATA (push, sy << src->ms_x); + PUSH_DATA (push, sy << src->ms_y); return 0; } @@ -203,7 +203,7 @@ nvc0_resource_copy_region(struct pipe_context *pipe, { struct nvc0_context *nvc0 = nvc0_context(pipe); int ret; - boolean m2mf; + bool m2mf; unsigned dst_layer = dstz, src_layer = src_box->z; if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) { @@ -704,7 +704,7 @@ nvc0_blitter_make_vp(struct nvc0_blitter *blit) }; blit->vp.type = PIPE_SHADER_VERTEX; - blit->vp.translated = TRUE; + blit->vp.translated = true; if (blit->screen->base.class_3d >= GM107_3D_CLASS) { blit->vp.code = (uint32_t *)code_gm107; /* const_cast */ blit->vp.code_size = sizeof(code_gm107); @@ -1217,7 +1217,7 @@ nvc0_blit_eng2d(struct nvc0_context *nvc0, const struct pipe_blit_info *info) int i; uint32_t mode; uint32_t mask = nv50_blit_eng2d_get_mask(info); - boolean b; + bool b; mode = nv50_blit_get_filter(info) ? 
NV50_2D_BLIT_CONTROL_FILTER_BILINEAR : @@ -1376,39 +1376,40 @@ static void nvc0_blit(struct pipe_context *pipe, const struct pipe_blit_info *info) { struct nvc0_context *nvc0 = nvc0_context(pipe); - boolean eng3d = FALSE; + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + bool eng3d = false; if (util_format_is_depth_or_stencil(info->dst.resource->format)) { if (!(info->mask & PIPE_MASK_ZS)) return; if (info->dst.resource->format == PIPE_FORMAT_Z32_FLOAT || info->dst.resource->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) - eng3d = TRUE; + eng3d = true; if (info->filter != PIPE_TEX_FILTER_NEAREST) - eng3d = TRUE; + eng3d = true; } else { if (!(info->mask & PIPE_MASK_RGBA)) return; if (info->mask != PIPE_MASK_RGBA) - eng3d = TRUE; + eng3d = true; } if (nv50_miptree(info->src.resource)->layout_3d) { - eng3d = TRUE; + eng3d = true; } else if (info->src.box.depth != info->dst.box.depth) { - eng3d = TRUE; + eng3d = true; debug_printf("blit: cannot filter array or cube textures in z direction"); } if (!eng3d && info->dst.format != info->src.format) { if (!nv50_2d_dst_format_faithful(info->dst.format)) { - eng3d = TRUE; + eng3d = true; } else if (!nv50_2d_src_format_faithful(info->src.format)) { if (!util_format_is_luminance(info->src.format)) { if (!nv50_2d_dst_format_ops_supported(info->dst.format)) - eng3d = TRUE; + eng3d = true; else if (util_format_is_intensity(info->src.format)) eng3d = info->src.format != PIPE_FORMAT_I8_UNORM; @@ -1420,30 +1421,36 @@ nvc0_blit(struct pipe_context *pipe, const struct pipe_blit_info *info) } } else if (util_format_is_luminance_alpha(info->src.format)) - eng3d = TRUE; + eng3d = true; } if (info->src.resource->nr_samples == 8 && info->dst.resource->nr_samples <= 1) - eng3d = TRUE; + eng3d = true; #if 0 /* FIXME: can't make this work with eng2d anymore, at least not on nv50 */ if (info->src.resource->nr_samples > 1 || info->dst.resource->nr_samples > 1) - eng3d = TRUE; + eng3d = true; #endif /* FIXME: find correct src coordinates adjustments */ if ((info->src.box.width != info->dst.box.width && info->src.box.width != -info->dst.box.width) || (info->src.box.height != info->dst.box.height && info->src.box.height != -info->dst.box.height)) - eng3d = TRUE; + eng3d = true; + + if (nvc0->screen->num_occlusion_queries_active) + IMMED_NVC0(push, NVC0_3D(SAMPLECNT_ENABLE), 0); if (!eng3d) nvc0_blit_eng2d(nvc0, info); else nvc0_blit_3d(nvc0, info); + if (nvc0->screen->num_occlusion_queries_active) + IMMED_NVC0(push, NVC0_3D(SAMPLECNT_ENABLE), 1); + NOUVEAU_DRV_STAT(&nvc0->screen->base, tex_blit_count, 1); } @@ -1453,13 +1460,13 @@ nvc0_flush_resource(struct pipe_context *ctx, { } -boolean +bool nvc0_blitter_create(struct nvc0_screen *screen) { screen->blitter = CALLOC_STRUCT(nvc0_blitter); if (!screen->blitter) { NOUVEAU_ERR("failed to allocate blitter struct\n"); - return FALSE; + return false; } screen->blitter->screen = screen; @@ -1468,7 +1475,7 @@ nvc0_blitter_create(struct nvc0_screen *screen) nvc0_blitter_make_vp(screen->blitter); nvc0_blitter_make_sampler(screen->blitter); - return TRUE; + return true; } void @@ -1491,20 +1498,20 @@ nvc0_blitter_destroy(struct nvc0_screen *screen) FREE(blitter); } -boolean +bool nvc0_blitctx_create(struct nvc0_context *nvc0) { nvc0->blit = CALLOC_STRUCT(nvc0_blitctx); if (!nvc0->blit) { NOUVEAU_ERR("failed to allocate blit context\n"); - return FALSE; + return false; } nvc0->blit->nvc0 = nvc0; nvc0->blit->rast.pipe.half_pixel_center = 1; - return TRUE; + return true; } void diff --git 
a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c index ddc0409ca86..d19082e0e15 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c @@ -34,8 +34,8 @@ (NV50_TIC_0_MAPA__MASK | NV50_TIC_0_MAPB__MASK | \ NV50_TIC_0_MAPG__MASK | NV50_TIC_0_MAPR__MASK) -static INLINE uint32_t -nv50_tic_swizzle(uint32_t tc, unsigned swz, boolean tex_int) +static inline uint32_t +nv50_tic_swizzle(uint32_t tc, unsigned swz, bool tex_int) { switch (swz) { case PIPE_SWIZZLE_RED: @@ -82,7 +82,7 @@ nvc0_create_texture_view(struct pipe_context *pipe, uint32_t depth; struct nv50_tic_entry *view; struct nv50_miptree *mt; - boolean tex_int; + bool tex_int; view = MALLOC_STRUCT(nv50_tic_entry); if (!view) @@ -195,7 +195,7 @@ nvc0_create_texture_view(struct pipe_context *pipe, default: NOUVEAU_ERR("unexpected/invalid texture target: %d\n", mt->base.base.target); - return FALSE; + return false; } tic[3] = (flags & NV50_TEXVIEW_FILTER_MSAA8) ? 0x20000000 : 0x00300000; @@ -226,7 +226,7 @@ nvc0_create_texture_view(struct pipe_context *pipe, return &view->pipe; } -static boolean +static bool nvc0_validate_tic(struct nvc0_context *nvc0, int s) { uint32_t commands[32]; @@ -234,12 +234,12 @@ nvc0_validate_tic(struct nvc0_context *nvc0, int s) struct nouveau_bo *txc = nvc0->screen->txc; unsigned i; unsigned n = 0; - boolean need_flush = FALSE; + bool need_flush = false; for (i = 0; i < nvc0->num_textures[s]; ++i) { struct nv50_tic_entry *tic = nv50_tic_entry(nvc0->textures[s][i]); struct nv04_resource *res; - const boolean dirty = !!(nvc0->textures_dirty[s] & (1 << i)); + const bool dirty = !!(nvc0->textures_dirty[s] & (1 << i)); if (!tic) { if (dirty) @@ -263,7 +263,7 @@ nvc0_validate_tic(struct nvc0_context *nvc0, int s) BEGIN_NIC0(push, NVC0_M2MF(DATA), 8); PUSH_DATAp(push, &tic->tic[0], 8); - need_flush = TRUE; + need_flush = true; } else if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) { BEGIN_NVC0(push, NVC0_3D(TEX_CACHE_CTL), 1); @@ -295,18 +295,18 @@ nvc0_validate_tic(struct nvc0_context *nvc0, int s) return need_flush; } -static boolean +static bool nve4_validate_tic(struct nvc0_context *nvc0, unsigned s) { struct nouveau_bo *txc = nvc0->screen->txc; struct nouveau_pushbuf *push = nvc0->base.pushbuf; unsigned i; - boolean need_flush = FALSE; + bool need_flush = false; for (i = 0; i < nvc0->num_textures[s]; ++i) { struct nv50_tic_entry *tic = nv50_tic_entry(nvc0->textures[s][i]); struct nv04_resource *res; - const boolean dirty = !!(nvc0->textures_dirty[s] & (1 << i)); + const bool dirty = !!(nvc0->textures_dirty[s] & (1 << i)); if (!tic) { nvc0->tex_handles[s][i] |= NVE4_TIC_ENTRY_INVALID; @@ -328,7 +328,7 @@ nve4_validate_tic(struct nvc0_context *nvc0, unsigned s) PUSH_DATA (push, 0x1001); PUSH_DATAp(push, &tic->tic[0], 8); - need_flush = TRUE; + need_flush = true; } else if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) { BEGIN_NVC0(push, NVC0_3D(TEX_CACHE_CTL), 1); @@ -356,16 +356,14 @@ nve4_validate_tic(struct nvc0_context *nvc0, unsigned s) void nvc0_validate_textures(struct nvc0_context *nvc0) { - boolean need_flush; - - if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS) { - need_flush = nve4_validate_tic(nvc0, 0); - need_flush |= nve4_validate_tic(nvc0, 3); - need_flush |= nve4_validate_tic(nvc0, 4); - } else { - need_flush = nvc0_validate_tic(nvc0, 0); - need_flush |= nvc0_validate_tic(nvc0, 3); - need_flush |= nvc0_validate_tic(nvc0, 4); + bool need_flush = false; + int i; + + for (i = 0; i < 5; i++) { + if 
(nvc0->screen->base.class_3d >= NVE4_3D_CLASS) + need_flush |= nve4_validate_tic(nvc0, i); + else + need_flush |= nvc0_validate_tic(nvc0, i); } if (need_flush) { @@ -374,14 +372,14 @@ void nvc0_validate_textures(struct nvc0_context *nvc0) } } -static boolean +static bool nvc0_validate_tsc(struct nvc0_context *nvc0, int s) { uint32_t commands[16]; struct nouveau_pushbuf *push = nvc0->base.pushbuf; unsigned i; unsigned n = 0; - boolean need_flush = FALSE; + bool need_flush = false; for (i = 0; i < nvc0->num_samplers[s]; ++i) { struct nv50_tsc_entry *tsc = nv50_tsc_entry(nvc0->samplers[s][i]); @@ -398,7 +396,7 @@ nvc0_validate_tsc(struct nvc0_context *nvc0, int s) nvc0_m2mf_push_linear(&nvc0->base, nvc0->screen->txc, 65536 + tsc->id * 32, NV_VRAM_DOMAIN(&nvc0->screen->base), 32, tsc->tsc); - need_flush = TRUE; + need_flush = true; } nvc0->screen->tsc.lock[tsc->id / 32] |= 1 << (tsc->id % 32); @@ -418,13 +416,13 @@ nvc0_validate_tsc(struct nvc0_context *nvc0, int s) return need_flush; } -boolean +bool nve4_validate_tsc(struct nvc0_context *nvc0, int s) { struct nouveau_bo *txc = nvc0->screen->txc; struct nouveau_pushbuf *push = nvc0->base.pushbuf; unsigned i; - boolean need_flush = FALSE; + bool need_flush = false; for (i = 0; i < nvc0->num_samplers[s]; ++i) { struct nv50_tsc_entry *tsc = nv50_tsc_entry(nvc0->samplers[s][i]); @@ -447,7 +445,7 @@ nve4_validate_tsc(struct nvc0_context *nvc0, int s) PUSH_DATA (push, 0x1001); PUSH_DATAp(push, &tsc->tsc[0], 8); - need_flush = TRUE; + need_flush = true; } nvc0->screen->tsc.lock[tsc->id / 32] |= 1 << (tsc->id % 32); @@ -466,16 +464,14 @@ nve4_validate_tsc(struct nvc0_context *nvc0, int s) void nvc0_validate_samplers(struct nvc0_context *nvc0) { - boolean need_flush; - - if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS) { - need_flush = nve4_validate_tsc(nvc0, 0); - need_flush |= nve4_validate_tsc(nvc0, 3); - need_flush |= nve4_validate_tsc(nvc0, 4); - } else { - need_flush = nvc0_validate_tsc(nvc0, 0); - need_flush |= nvc0_validate_tsc(nvc0, 3); - need_flush |= nvc0_validate_tsc(nvc0, 4); + bool need_flush = false; + int i; + + for (i = 0; i < 5; i++) { + if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS) + need_flush |= nve4_validate_tsc(nvc0, i); + else + need_flush |= nvc0_validate_tsc(nvc0, i); } if (need_flush) { @@ -645,13 +641,13 @@ nve4_set_surface_info(struct nouveau_pushbuf *push, } } -static INLINE void +static inline void nvc0_update_surface_bindings(struct nvc0_context *nvc0) { /* TODO */ } -static INLINE void +static inline void nve4_update_surface_bindings(struct nvc0_context *nvc0) { /* TODO */ diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c b/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c index 45c6f7cc3ca..7cc5b4b1f48 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c @@ -329,17 +329,17 @@ nve4_m2mf_copy_linear(struct nouveau_context *nv, } -static INLINE boolean +static inline bool nvc0_mt_transfer_can_map_directly(struct nv50_miptree *mt) { if (mt->base.domain == NOUVEAU_BO_VRAM) - return FALSE; + return false; if (mt->base.base.usage != PIPE_USAGE_STAGING) - return FALSE; + return false; return !nouveau_bo_memtype(mt->base.bo); } -static INLINE boolean +static inline bool nvc0_mt_sync(struct nvc0_context *nvc0, struct nv50_miptree *mt, unsigned usage) { if (!mt->base.mm) { diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c index 8cf2584b0ce..6f9e7906713 100644 --- 
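The texture/sampler validation rewrites above replace the explicit calls for stages 0, 3 and 4 with a loop over all five graphics stages, since tessellation now occupies the slots in between. The numbering is implied by the sampler/view bind hunks earlier in this diff and by nve4_validate_tsc(nvc0, 5) in the compute code below; written out as a sketch:

    /* nvc0 shader-stage indices as implied by the hunks in this diff. */
    enum nvc0_stage {
       NVC0_STAGE_VP  = 0, /* PIPE_SHADER_VERTEX    */
       NVC0_STAGE_TCP = 1, /* PIPE_SHADER_TESS_CTRL */
       NVC0_STAGE_TEP = 2, /* PIPE_SHADER_TESS_EVAL */
       NVC0_STAGE_GP  = 3, /* PIPE_SHADER_GEOMETRY  */
       NVC0_STAGE_FP  = 4, /* PIPE_SHADER_FRAGMENT  */
       NVC0_STAGE_CP  = 5  /* compute, validated separately */
    };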
a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c @@ -61,8 +61,8 @@ nvc0_vertex_state_create(struct pipe_context *pipe, so->num_elements = num_elements; so->instance_elts = 0; so->instance_bufs = 0; - so->shared_slots = FALSE; - so->need_conversion = FALSE; + so->shared_slots = false; + so->need_conversion = false; memset(so->vb_access_size, 0, sizeof(so->vb_access_size)); @@ -93,7 +93,7 @@ nvc0_vertex_state_create(struct pipe_context *pipe, return NULL; } so->element[i].state = nvc0_format_table[fmt].vtx; - so->need_conversion = TRUE; + so->need_conversion = true; } size = util_format_get_blocksize(fmt); @@ -141,7 +141,7 @@ nvc0_vertex_state_create(struct pipe_context *pipe, if (so->instance_elts || src_offset_max >= (1 << 14)) return so; - so->shared_slots = TRUE; + so->shared_slots = true; for (i = 0; i < num_elements; ++i) { const unsigned b = elements[i].vertex_buffer_index; @@ -196,7 +196,7 @@ nvc0_set_constant_vertex_attrib(struct nvc0_context *nvc0, const unsigned a) push->cur += 5; } -static INLINE void +static inline void nvc0_user_vbuf_range(struct nvc0_context *nvc0, int vbi, uint32_t *base, uint32_t *size) { @@ -214,7 +214,7 @@ nvc0_user_vbuf_range(struct nvc0_context *nvc0, int vbi, } } -static INLINE void +static inline void nvc0_release_user_vbufs(struct nvc0_context *nvc0) { if (nvc0->vbo_user) { @@ -265,7 +265,7 @@ nvc0_update_user_vbufs(struct nvc0_context *nvc0) PUSH_DATAh(push, address[b] + ve->src_offset); PUSH_DATA (push, address[b] + ve->src_offset); } - nvc0->base.vbo_dirty = TRUE; + nvc0->base.vbo_dirty = true; } static void @@ -419,7 +419,7 @@ nvc0_vertex_arrays_validate(struct nvc0_context *nvc0) uint32_t const_vbos; unsigned i; uint8_t vbo_mode; - boolean update_vertex; + bool update_vertex; nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_VTX); @@ -529,7 +529,7 @@ nvc0_idxbuf_validate(struct nvc0_context *nvc0) #define NVC0_PRIM_GL_CASE(n) \ case PIPE_PRIM_##n: return NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_##n -static INLINE unsigned +static inline unsigned nvc0_prim_gl(unsigned prim) { switch (prim) { @@ -547,8 +547,7 @@ nvc0_prim_gl(unsigned prim) NVC0_PRIM_GL_CASE(LINE_STRIP_ADJACENCY); NVC0_PRIM_GL_CASE(TRIANGLES_ADJACENCY); NVC0_PRIM_GL_CASE(TRIANGLE_STRIP_ADJACENCY); - /* - NVC0_PRIM_GL_CASE(PATCHES); */ + NVC0_PRIM_GL_CASE(PATCHES); default: return NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_POINTS; } @@ -559,7 +558,7 @@ nvc0_draw_vbo_kick_notify(struct nouveau_pushbuf *push) { struct nvc0_screen *screen = push->user_priv; - nouveau_fence_update(&screen->base, TRUE); + nouveau_fence_update(&screen->base, true); NOUVEAU_DRV_STAT(&screen->base, pushbuf_count, 1); } @@ -695,7 +694,7 @@ nvc0_draw_elements_inline_u32_short(struct nouveau_pushbuf *push, } static void -nvc0_draw_elements(struct nvc0_context *nvc0, boolean shorten, +nvc0_draw_elements(struct nvc0_context *nvc0, bool shorten, unsigned mode, unsigned start, unsigned count, unsigned instance_count, int32_t index_bias) { @@ -835,8 +834,8 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info) buf->bo, offset, NVC0_IB_ENTRY_1_NO_PREFETCH | size); } -static INLINE void -nvc0_update_prim_restart(struct nvc0_context *nvc0, boolean en, uint32_t index) +static inline void +nvc0_update_prim_restart(struct nvc0_context *nvc0, bool en, uint32_t index) { struct nouveau_pushbuf *push = nvc0->base.pushbuf; @@ -889,6 +888,12 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) } } + if (info->mode == PIPE_PRIM_PATCHES && + 
nvc0->state.patch_vertices != info->vertices_per_patch) { + nvc0->state.patch_vertices = info->vertices_per_patch; + IMMED_NVC0(push, NVC0_3D(PATCH_VERTICES), nvc0->state.patch_vertices); + } + /* 8 as minimum to avoid immediate double validation of new buffers */ nvc0_state_validate(nvc0, ~0, 8); @@ -910,13 +915,13 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) continue; if (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT) - nvc0->cb_dirty = TRUE; + nvc0->cb_dirty = true; } } if (nvc0->cb_dirty) { IMMED_NVC0(push, NVC0_3D(MEM_BARRIER), 0x1011); - nvc0->cb_dirty = FALSE; + nvc0->cb_dirty = false; } if (nvc0->state.vbo_mode) { @@ -940,19 +945,19 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) if (!nvc0->vtxbuf[i].buffer) continue; if (nvc0->vtxbuf[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT) - nvc0->base.vbo_dirty = TRUE; + nvc0->base.vbo_dirty = true; } if (!nvc0->base.vbo_dirty && nvc0->idxbuf.buffer && nvc0->idxbuf.buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT) - nvc0->base.vbo_dirty = TRUE; + nvc0->base.vbo_dirty = true; nvc0_update_prim_restart(nvc0, info->primitive_restart, info->restart_index); if (nvc0->base.vbo_dirty) { if (nvc0->screen->eng3d->oclass < GM107_3D_CLASS) IMMED_NVC0(push, NVC0_3D(VERTEX_ARRAY_FLUSH), 0); - nvc0->base.vbo_dirty = FALSE; + nvc0->base.vbo_dirty = false; } if (unlikely(info->indirect)) { @@ -962,10 +967,10 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) nvc0_draw_stream_output(nvc0, info); } else if (info->indexed) { - boolean shorten = info->max_index <= 65535; + bool shorten = info->max_index <= 65535; if (info->primitive_restart && info->restart_index > 65535) - shorten = FALSE; + shorten = false; nvc0_draw_elements(nvc0, shorten, info->mode, info->start, info->count, diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c index f180087161d..8b23a4887da 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c @@ -21,12 +21,12 @@ struct push_context { uint32_t restart_index; uint32_t instance_id; - boolean prim_restart; - boolean need_vertex_id; + bool prim_restart; + bool need_vertex_id; struct { - boolean enabled; - boolean value; + bool enabled; + bool value; unsigned stride; const uint8_t *data; } edgeflag; @@ -47,7 +47,7 @@ nvc0_push_context_init(struct nvc0_context *nvc0, struct push_context *ctx) ctx->need_vertex_id = nvc0->vertprog->vp.need_vertex_id && (nvc0->vertex->num_elements < 32); - ctx->edgeflag.value = TRUE; + ctx->edgeflag.value = true; ctx->edgeflag.enabled = nvc0->vertprog->vp.edgeflag < PIPE_MAX_ATTRIBS; /* silence warnings */ @@ -55,7 +55,7 @@ nvc0_push_context_init(struct nvc0_context *nvc0, struct push_context *ctx) ctx->edgeflag.stride = 0; } -static INLINE void +static inline void nvc0_vertex_configure_translate(struct nvc0_context *nvc0, int32_t index_bias) { struct translate *translate = nvc0->vertex->translate; @@ -78,7 +78,7 @@ nvc0_vertex_configure_translate(struct nvc0_context *nvc0, int32_t index_bias) } } -static INLINE void +static inline void nvc0_push_map_idxbuf(struct push_context *ctx, struct nvc0_context *nvc0) { if (nvc0->idxbuf.buffer) { @@ -90,7 +90,7 @@ nvc0_push_map_idxbuf(struct push_context *ctx, struct nvc0_context *nvc0) } } -static INLINE void +static inline void nvc0_push_map_edgeflag(struct push_context *ctx, struct nvc0_context *nvc0, int32_t index_bias) { @@ -112,7 
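With NVC0_PRIM_GL_CASE(PATCHES) enabled and the draw-time check above, PATCH_VERTICES is now emitted lazily from nvc0_draw_vbo() rather than from nvc0_tctlprog_validate() (that removal appears earlier in this diff), so back-to-back patch draws with an unchanged size emit no redundant method. The GL calls that drive this path look roughly like the following (sketch; assumes a GL 4.0 context):

    #include <GL/gl.h>
    #include <GL/glext.h>

    static void
    draw_patches_twice(GLint n_control_points)
    {
       /* st/mesa forwards this as pipe_draw_info::vertices_per_patch. */
       glPatchParameteri(GL_PATCH_VERTICES, 3);
       glDrawArrays(GL_PATCHES, 0, n_control_points);
       /* Same patch size: the driver skips the PATCH_VERTICES method. */
       glDrawArrays(GL_PATCHES, 0, n_control_points);
    }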
+112,7 @@ nvc0_push_map_edgeflag(struct push_context *ctx, struct nvc0_context *nvc0, ctx->edgeflag.data += (intptr_t)index_bias * vb->stride; } -static INLINE unsigned +static inline unsigned prim_restart_search_i08(const uint8_t *elts, unsigned push, uint8_t index) { unsigned i; @@ -120,7 +120,7 @@ prim_restart_search_i08(const uint8_t *elts, unsigned push, uint8_t index) return i; } -static INLINE unsigned +static inline unsigned prim_restart_search_i16(const uint16_t *elts, unsigned push, uint16_t index) { unsigned i; @@ -128,7 +128,7 @@ prim_restart_search_i16(const uint16_t *elts, unsigned push, uint16_t index) return i; } -static INLINE unsigned +static inline unsigned prim_restart_search_i32(const uint32_t *elts, unsigned push, uint32_t index) { unsigned i; @@ -136,21 +136,21 @@ prim_restart_search_i32(const uint32_t *elts, unsigned push, uint32_t index) return i; } -static INLINE boolean +static inline bool ef_value(const struct push_context *ctx, uint32_t index) { float *pf = (float *)&ctx->edgeflag.data[index * ctx->edgeflag.stride]; - return *pf ? TRUE : FALSE; + return *pf ? true : false; } -static INLINE boolean +static inline bool ef_toggle(struct push_context *ctx) { ctx->edgeflag.value = !ctx->edgeflag.value; return ctx->edgeflag.value; } -static INLINE unsigned +static inline unsigned ef_toggle_search_i08(struct push_context *ctx, const uint8_t *elts, unsigned n) { unsigned i; @@ -158,7 +158,7 @@ ef_toggle_search_i08(struct push_context *ctx, const uint8_t *elts, unsigned n) return i; } -static INLINE unsigned +static inline unsigned ef_toggle_search_i16(struct push_context *ctx, const uint16_t *elts, unsigned n) { unsigned i; @@ -166,7 +166,7 @@ ef_toggle_search_i16(struct push_context *ctx, const uint16_t *elts, unsigned n) return i; } -static INLINE unsigned +static inline unsigned ef_toggle_search_i32(struct push_context *ctx, const uint32_t *elts, unsigned n) { unsigned i; @@ -174,7 +174,7 @@ ef_toggle_search_i32(struct push_context *ctx, const uint32_t *elts, unsigned n) return i; } -static INLINE unsigned +static inline unsigned ef_toggle_search_seq(struct push_context *ctx, unsigned start, unsigned n) { unsigned i; @@ -182,7 +182,7 @@ ef_toggle_search_seq(struct push_context *ctx, unsigned start, unsigned n) return i; } -static INLINE void * +static inline void * nvc0_push_setup_vertex_array(struct nvc0_context *nvc0, const unsigned count) { struct nouveau_pushbuf *push = nvc0->base.pushbuf; @@ -409,7 +409,7 @@ disp_vertices_seq(struct push_context *ctx, unsigned start, unsigned count) #define NVC0_PRIM_GL_CASE(n) \ case PIPE_PRIM_##n: return NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_##n -static INLINE unsigned +static inline unsigned nvc0_prim_gl(unsigned prim) { switch (prim) { @@ -427,8 +427,7 @@ nvc0_prim_gl(unsigned prim) NVC0_PRIM_GL_CASE(LINE_STRIP_ADJACENCY); NVC0_PRIM_GL_CASE(TRIANGLES_ADJACENCY); NVC0_PRIM_GL_CASE(TRIANGLE_STRIP_ADJACENCY); - /* - NVC0_PRIM_GL_CASE(PATCHES); */ + NVC0_PRIM_GL_CASE(PATCHES); default: return NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_POINTS; } @@ -483,7 +482,7 @@ nvc0_push_vbo(struct nvc0_context *nvc0, const struct pipe_draw_info *info) struct pipe_context *pipe = &nvc0->base.pipe; struct nvc0_so_target *targ; targ = nvc0_so_target(info->count_from_stream_output); - pipe->get_query_result(pipe, targ->pq, TRUE, (void *)&vert_count); + pipe->get_query_result(pipe, targ->pq, true, (void *)&vert_count); vert_count /= targ->stride; } ctx.idxbuf = NULL; /* shut up warnings */ @@ -560,7 +559,7 @@ nvc0_push_vbo(struct nvc0_context *nvc0, const 
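The loop bodies of the prim_restart_search_i08/i16/i32 helpers above are elided by the diff context; each scans forward for the restart index and returns how many indices can be pushed before hitting it. A plausible reconstruction of the 8-bit variant (sketch; the committed body may differ in style):

    #include <stdint.h>

    /* Count leading indices that are not the primitive-restart index. */
    static inline unsigned
    prim_restart_search_i08(const uint8_t *elts, unsigned push, uint8_t index)
    {
       unsigned i;
       for (i = 0; i < push && elts[i] != index; ++i)
          ;
       return i;
    }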
struct pipe_draw_info *info) NOUVEAU_DRV_STAT(&nvc0->screen->base, draw_calls_fallback_count, 1); } -static INLINE void +static inline void copy_indices_u8(uint32_t *dst, const uint8_t *elts, uint32_t bias, unsigned n) { unsigned i; @@ -568,7 +567,7 @@ copy_indices_u8(uint32_t *dst, const uint8_t *elts, uint32_t bias, unsigned n) dst[i] = elts[i] + bias; } -static INLINE void +static inline void copy_indices_u16(uint32_t *dst, const uint16_t *elts, uint32_t bias, unsigned n) { unsigned i; @@ -576,7 +575,7 @@ copy_indices_u16(uint32_t *dst, const uint16_t *elts, uint32_t bias, unsigned n) dst[i] = elts[i] + bias; } -static INLINE void +static inline void copy_indices_u32(uint32_t *dst, const uint32_t *elts, uint32_t bias, unsigned n) { unsigned i; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h b/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h index 725e889683f..4ea8ca3cfa2 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h @@ -15,14 +15,14 @@ #endif -static INLINE void +static inline void nv50_add_bufctx_resident_bo(struct nouveau_bufctx *bufctx, int bin, unsigned flags, struct nouveau_bo *bo) { nouveau_bufctx_refn(bufctx, bin, bo, flags)->priv = NULL; } -static INLINE void +static inline void nvc0_add_resident(struct nouveau_bufctx *bufctx, int bin, struct nv04_resource *res, unsigned flags) { @@ -38,7 +38,7 @@ nvc0_add_resident(struct nouveau_bufctx *bufctx, int bin, #define BCTX_REFN(bctx, bin, res, acc) \ nvc0_add_resident(bctx, NVC0_BIND_##bin, res, NOUVEAU_BO_##acc) -static INLINE void +static inline void PUSH_REFN(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t flags) { struct nouveau_pushbuf_refn ref = { bo, flags }; @@ -69,46 +69,46 @@ PUSH_REFN(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t flags) #define NVC0_3D_SERIALIZE NV50_GRAPH_SERIALIZE -static INLINE uint32_t +static inline uint32_t NVC0_FIFO_PKHDR_SQ(int subc, int mthd, unsigned size) { return 0x20000000 | (size << 16) | (subc << 13) | (mthd >> 2); } -static INLINE uint32_t +static inline uint32_t NVC0_FIFO_PKHDR_NI(int subc, int mthd, unsigned size) { return 0x60000000 | (size << 16) | (subc << 13) | (mthd >> 2); } -static INLINE uint32_t +static inline uint32_t NVC0_FIFO_PKHDR_IL(int subc, int mthd, uint16_t data) { assert(data < 0x2000); return 0x80000000 | (data << 16) | (subc << 13) | (mthd >> 2); } -static INLINE uint32_t +static inline uint32_t NVC0_FIFO_PKHDR_1I(int subc, int mthd, unsigned size) { return 0xa0000000 | (size << 16) | (subc << 13) | (mthd >> 2); } -static INLINE uint8_t +static inline uint8_t nouveau_bo_memtype(const struct nouveau_bo *bo) { return bo->config.nvc0.memtype; } -static INLINE void +static inline void PUSH_DATAh(struct nouveau_pushbuf *push, uint64_t data) { *push->cur++ = (uint32_t)(data >> 32); } -static INLINE void +static inline void BEGIN_NVC0(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size) { #ifndef NVC0_PUSH_EXPLICIT_SPACE_CHECKING @@ -117,7 +117,7 @@ BEGIN_NVC0(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size) PUSH_DATA (push, NVC0_FIFO_PKHDR_SQ(subc, mthd, size)); } -static INLINE void +static inline void BEGIN_NIC0(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size) { #ifndef NVC0_PUSH_EXPLICIT_SPACE_CHECKING @@ -126,7 +126,7 @@ BEGIN_NIC0(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size) PUSH_DATA (push, NVC0_FIFO_PKHDR_NI(subc, mthd, size)); } -static INLINE void +static inline void BEGIN_1IC0(struct 
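The NVC0_FIFO_PKHDR_* helpers above pack a Fermi FIFO command header as mode | (size << 16) | (subchannel << 13) | (method >> 2); method offsets are stored in 32-bit-word units, hence the shift. A worked example (subchannel 0 and method 0x1234 are made-up values for illustration):

    #include <assert.h>
    #include <stdint.h>

    static uint32_t
    pkhdr_sq(int subc, int mthd, unsigned size)
    {
       /* 0x20000000 selects the "increasing methods" packet mode. */
       return 0x20000000 | (size << 16) | (subc << 13) | (mthd >> 2);
    }

    int
    main(void)
    {
       /* Header for 2 data words at method 0x1234 on subchannel 0. */
       assert(pkhdr_sq(0, 0x1234, 2) == 0x2002048d);
       return 0;
    }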
nouveau_pushbuf *push, int subc, int mthd, unsigned size) { #ifndef NVC0_PUSH_EXPLICIT_SPACE_CHECKING @@ -135,7 +135,7 @@ BEGIN_1IC0(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size) PUSH_DATA (push, NVC0_FIFO_PKHDR_1I(subc, mthd, size)); } -static INLINE void +static inline void IMMED_NVC0(struct nouveau_pushbuf *push, int subc, int mthd, uint16_t data) { #ifndef NVC0_PUSH_EXPLICIT_SPACE_CHECKING diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c index fce02a7cc57..d3e5676873e 100644 --- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c +++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c @@ -250,7 +250,7 @@ nve4_compute_validate_surfaces(struct nvc0_context *nvc0) static void nve4_compute_validate_samplers(struct nvc0_context *nvc0) { - boolean need_flush = nve4_validate_tsc(nvc0, 5); + bool need_flush = nve4_validate_tsc(nvc0, 5); if (need_flush) { BEGIN_NVC0(nvc0->base.pushbuf, NVE4_COMPUTE(TSC_FLUSH), 1); PUSH_DATA (nvc0->base.pushbuf, 0); @@ -299,11 +299,11 @@ nve4_compute_set_tex_handles(struct nvc0_context *nvc0) } -static boolean +static bool nve4_compute_state_validate(struct nvc0_context *nvc0) { if (!nvc0_compute_validate_program(nvc0)) - return FALSE; + return false; if (nvc0->dirty_cp & NVC0_NEW_CP_TEXTURES) nve4_compute_validate_textures(nvc0); if (nvc0->dirty_cp & NVC0_NEW_CP_SAMPLERS) @@ -316,15 +316,15 @@ nve4_compute_state_validate(struct nvc0_context *nvc0) nvc0_validate_global_residents(nvc0, nvc0->bufctx_cp, NVC0_BIND_CP_GLOBAL); - nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, FALSE); + nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, false); nouveau_pushbuf_bufctx(nvc0->base.pushbuf, nvc0->bufctx_cp); if (unlikely(nouveau_pushbuf_validate(nvc0->base.pushbuf))) - return FALSE; + return false; if (unlikely(nvc0->state.flushed)) - nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, TRUE); + nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, true); - return TRUE; + return true; } @@ -364,7 +364,7 @@ nve4_compute_upload_input(struct nvc0_context *nvc0, const void *input, PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB); } -static INLINE uint8_t +static inline uint8_t nve4_compute_derive_cache_split(struct nvc0_context *nvc0, uint32_t shared_size) { if (shared_size > (32 << 10)) @@ -413,7 +413,7 @@ nve4_compute_setup_launch_desc(struct nvc0_context *nvc0, nve4_cp_launch_desc_set_cb(desc, 0, screen->parm, 0, NVE4_CP_INPUT_SIZE); } -static INLINE struct nve4_cp_launch_desc * +static inline struct nve4_cp_launch_desc * nve4_compute_alloc_launch_desc(struct nouveau_context *nv, struct nouveau_bo **pbo, uint64_t *pgpuaddr) { @@ -505,7 +505,7 @@ nve4_compute_validate_textures(struct nvc0_context *nvc0) for (i = 0; i < nvc0->num_textures[s]; ++i) { struct nv50_tic_entry *tic = nv50_tic_entry(nvc0->textures[s][i]); struct nv04_resource *res; - const boolean dirty = !!(nvc0->textures_dirty[s] & (1 << i)); + const bool dirty = !!(nvc0->textures_dirty[s] & (1 << i)); if (!tic) { nvc0->tex_handles[s][i] |= NVE4_TIC_ENTRY_INVALID; @@ -575,18 +575,18 @@ nve4_compute_dump_launch_desc(const struct nve4_cp_launch_desc *desc) { const uint32_t *data = (const uint32_t *)desc; unsigned i; - boolean zero = FALSE; + bool zero = false; debug_printf("COMPUTE LAUNCH DESCRIPTOR:\n"); for (i = 0; i < sizeof(*desc); i += 4) { if (data[i / 4]) { debug_printf("[%x]: 0x%08x\n", i, data[i / 4]); - zero = FALSE; + zero = false; } else if (!zero) { debug_printf("...\n"); - zero = TRUE; + zero = true; } } @@ -606,7 +606,7 @@ nve4_compute_dump_launch_desc(const struct 
nve4_cp_launch_desc *desc) for (i = 0; i < 8; ++i) { uint64_t address; uint32_t size = desc->cb[i].size; - boolean valid = !!(desc->cb_mask & (1 << i)); + bool valid = !!(desc->cb_mask & (1 << i)); address = ((uint64_t)desc->cb[i].address_h << 32) | desc->cb[i].address_l; diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.h b/src/gallium/drivers/nouveau/nvc0/nve4_compute.h index 4d7af54d860..7364a68a579 100644 --- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.h +++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.h @@ -68,7 +68,7 @@ struct nve4_cp_launch_desc u32 unk48[16]; }; -static INLINE void +static inline void nve4_cp_launch_desc_init_default(struct nve4_cp_launch_desc *desc) { memset(desc, 0, sizeof(*desc)); @@ -78,7 +78,7 @@ nve4_cp_launch_desc_init_default(struct nve4_cp_launch_desc *desc) desc->unk47_20 = 0x300; } -static INLINE void +static inline void nve4_cp_launch_desc_set_cb(struct nve4_cp_launch_desc *desc, unsigned index, struct nouveau_bo *bo, @@ -96,7 +96,7 @@ nve4_cp_launch_desc_set_cb(struct nve4_cp_launch_desc *desc, desc->cb_mask |= 1 << index; } -static INLINE void +static inline void nve4_cp_launch_desc_set_ctx_cb(struct nve4_cp_launch_desc *desc, unsigned index, const struct nvc0_constbuf *cb) diff --git a/src/gallium/drivers/r300/Makefile.am b/src/gallium/drivers/r300/Makefile.am index dd1a5ede19b..081f332683e 100644 --- a/src/gallium/drivers/r300/Makefile.am +++ b/src/gallium/drivers/r300/Makefile.am @@ -1,5 +1,3 @@ -AUTOMAKE_OPTIONS = subdir-objects - include Makefile.sources include $(top_srcdir)/src/gallium/Automake.inc diff --git a/src/gallium/drivers/r300/r300_blit.c b/src/gallium/drivers/r300/r300_blit.c index baf05cea965..6ea8f24cc14 100644 --- a/src/gallium/drivers/r300/r300_blit.c +++ b/src/gallium/drivers/r300/r300_blit.c @@ -382,7 +382,7 @@ static void r300_clear(struct pipe_context* pipe, r300_get_num_cs_end_dwords(r300); /* Reserve CS space. */ - if (dwords > (RADEON_MAX_CMDBUF_DWORDS - r300->cs->cdw)) { + if (dwords > (r300->cs->max_dw - r300->cs->cdw)) { r300_flush(&r300->context, RADEON_FLUSH_ASYNC, NULL); } diff --git a/src/gallium/drivers/r300/r300_context.c b/src/gallium/drivers/r300/r300_context.c index c35aa3b24aa..8c24ad6d98a 100644 --- a/src/gallium/drivers/r300/r300_context.c +++ b/src/gallium/drivers/r300/r300_context.c @@ -94,6 +94,8 @@ static void r300_destroy_context(struct pipe_context* context) if (r300->cs) r300->rws->cs_destroy(r300->cs); + if (r300->ctx) + r300->rws->ctx_destroy(r300->ctx); rc_destroy_regalloc_state(&r300->fs_regalloc_state); @@ -382,7 +384,11 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen, sizeof(struct pipe_transfer), 64, UTIL_SLAB_SINGLETHREADED); - r300->cs = rws->cs_create(rws, RING_GFX, r300_flush_callback, r300, NULL); + r300->ctx = rws->ctx_create(rws); + if (!r300->ctx) + goto fail; + + r300->cs = rws->cs_create(r300->ctx, RING_GFX, r300_flush_callback, r300, NULL); if (r300->cs == NULL) goto fail; diff --git a/src/gallium/drivers/r300/r300_context.h b/src/gallium/drivers/r300/r300_context.h index 3873c9a31c1..18ae11a3a24 100644 --- a/src/gallium/drivers/r300/r300_context.h +++ b/src/gallium/drivers/r300/r300_context.h @@ -449,6 +449,8 @@ struct r300_context { /* The interface to the windowing system, etc. */ struct radeon_winsys *rws; + /* The submission context. */ + struct radeon_winsys_ctx *ctx; /* The command stream. */ struct radeon_winsys_cs *cs; /* Screen. 
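The r300 hunks swap the compile-time RADEON_MAX_CMDBUF_DWORDS limit for the winsys-reported cs->max_dw, and create the command stream from a new per-context radeon_winsys_ctx. The reserve-or-flush pattern used in r300_blit.c above (and again in r300_render.c below), condensed into one sketch (r300_reserve_cs_space is an illustrative name):

    #include "r300_context.h"

    /* Flush if the command stream cannot take `dwords` more entries. */
    static void
    r300_reserve_cs_space(struct r300_context *r300, unsigned dwords)
    {
       if (dwords > r300->cs->max_dw - r300->cs->cdw)
          r300_flush(&r300->context, RADEON_FLUSH_ASYNC, NULL);
    }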
*/ @@ -647,32 +649,32 @@ struct r300_context { for (atom = r300->first_dirty; atom != r300->last_dirty; atom++) /* Convenience cast wrappers. */ -static INLINE struct r300_query* r300_query(struct pipe_query* q) +static inline struct r300_query* r300_query(struct pipe_query* q) { return (struct r300_query*)q; } -static INLINE struct r300_surface* r300_surface(struct pipe_surface* surf) +static inline struct r300_surface* r300_surface(struct pipe_surface* surf) { return (struct r300_surface*)surf; } -static INLINE struct r300_resource* r300_resource(struct pipe_resource* tex) +static inline struct r300_resource* r300_resource(struct pipe_resource* tex) { return (struct r300_resource*)tex; } -static INLINE struct r300_context* r300_context(struct pipe_context* context) +static inline struct r300_context* r300_context(struct pipe_context* context) { return (struct r300_context*)context; } -static INLINE struct r300_fragment_shader *r300_fs(struct r300_context *r300) +static inline struct r300_fragment_shader *r300_fs(struct r300_context *r300) { return (struct r300_fragment_shader*)r300->fs.state; } -static INLINE void r300_mark_atom_dirty(struct r300_context *r300, +static inline void r300_mark_atom_dirty(struct r300_context *r300, struct r300_atom *atom) { atom->dirty = TRUE; @@ -688,7 +690,7 @@ static INLINE void r300_mark_atom_dirty(struct r300_context *r300, } } -static INLINE struct pipe_surface * +static inline struct pipe_surface * r300_get_nonnull_cb(struct pipe_framebuffer_state *fb, unsigned i) { if (fb->cbufs[i]) @@ -777,12 +779,12 @@ void r300_update_derived_state(struct r300_context* r300); void r500_dump_rs_block(struct r300_rs_block *rs); -static INLINE boolean CTX_DBG_ON(struct r300_context * ctx, unsigned flags) +static inline boolean CTX_DBG_ON(struct r300_context * ctx, unsigned flags) { return SCREEN_DBG_ON(ctx->screen, flags); } -static INLINE void CTX_DBG(struct r300_context * ctx, unsigned flags, +static inline void CTX_DBG(struct r300_context * ctx, unsigned flags, const char * fmt, ...) { if (CTX_DBG_ON(ctx, flags)) { diff --git a/src/gallium/drivers/r300/r300_cs.h b/src/gallium/drivers/r300/r300_cs.h index 37f9641ab3e..fc150542d4b 100644 --- a/src/gallium/drivers/r300/r300_cs.h +++ b/src/gallium/drivers/r300/r300_cs.h @@ -46,7 +46,7 @@ #ifdef DEBUG #define BEGIN_CS(size) do { \ - assert(size <= (RADEON_MAX_CMDBUF_DWORDS - cs_copy->cdw)); \ + assert(size <= (cs_copy->max_dw - cs_copy->cdw)); \ cs_count = size; \ } while (0) diff --git a/src/gallium/drivers/r300/r300_fs.h b/src/gallium/drivers/r300/r300_fs.h index 39eb73da65d..b39624dad5f 100644 --- a/src/gallium/drivers/r300/r300_fs.h +++ b/src/gallium/drivers/r300/r300_fs.h @@ -77,14 +77,14 @@ void r300_shader_read_fs_inputs(struct tgsi_shader_info* info, /* Return TRUE if the shader was switched and should be re-emitted. */ boolean r300_pick_fragment_shader(struct r300_context* r300); -static INLINE boolean r300_fragment_shader_writes_depth(struct r300_fragment_shader *fs) +static inline boolean r300_fragment_shader_writes_depth(struct r300_fragment_shader *fs) { if (!fs) return FALSE; return (fs->shader->code.writes_depth) ? 
TRUE : FALSE; } -static INLINE boolean r300_fragment_shader_writes_all(struct r300_fragment_shader *fs) +static inline boolean r300_fragment_shader_writes_all(struct r300_fragment_shader *fs) { if (!fs) return FALSE; diff --git a/src/gallium/drivers/r300/r300_query.c b/src/gallium/drivers/r300/r300_query.c index 01b83b87fcf..4dd8156f616 100644 --- a/src/gallium/drivers/r300/r300_query.c +++ b/src/gallium/drivers/r300/r300_query.c @@ -146,10 +146,11 @@ static boolean r300_get_query_result(struct pipe_context* pipe, if (q->type == PIPE_QUERY_GPU_FINISHED) { if (wait) { - r300->rws->buffer_wait(q->buf, RADEON_USAGE_READWRITE); + r300->rws->buffer_wait(q->buf, PIPE_TIMEOUT_INFINITE, + RADEON_USAGE_READWRITE); vresult->b = TRUE; } else { - vresult->b = !r300->rws->buffer_is_busy(q->buf, RADEON_USAGE_READWRITE); + vresult->b = r300->rws->buffer_wait(q->buf, 0, RADEON_USAGE_READWRITE); } return vresult->b; } @@ -168,8 +169,6 @@ static boolean r300_get_query_result(struct pipe_context* pipe, map++; } - r300->rws->buffer_unmap(q->cs_buf); - if (q->type == PIPE_QUERY_OCCLUSION_PREDICATE) { vresult->b = temp != 0; } else { diff --git a/src/gallium/drivers/r300/r300_render.c b/src/gallium/drivers/r300/r300_render.c index 4c951d14f10..0487b11e775 100644 --- a/src/gallium/drivers/r300/r300_render.c +++ b/src/gallium/drivers/r300/r300_render.c @@ -215,7 +215,7 @@ static boolean r300_reserve_cs_dwords(struct r300_context *r300, cs_dwords += r300_get_num_cs_end_dwords(r300); /* Reserve requested CS space. */ - if (cs_dwords > (RADEON_MAX_CMDBUF_DWORDS - r300->cs->cdw)) { + if (cs_dwords > (r300->cs->max_dw - r300->cs->cdw)) { r300_flush(&r300->context, RADEON_FLUSH_ASYNC, NULL); flushed = TRUE; } @@ -871,7 +871,7 @@ struct r300_render { uint8_t *vbo_ptr; }; -static INLINE struct r300_render* +static inline struct r300_render* r300_render(struct vbuf_render* render) { return (struct r300_render*)render; diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c index a7bca915f57..4ca0b268bde 100644 --- a/src/gallium/drivers/r300/r300_screen.c +++ b/src/gallium/drivers/r300/r300_screen.c @@ -191,6 +191,10 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: + case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: + case PIPE_CAP_TEXTURE_FLOAT_LINEAR: + case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: + case PIPE_CAP_DEPTH_BOUNDS_TEST: return 0; /* SWTCL-only features. 
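
[Note: the r300_query change above is part of a winsys interface cleanup: the separate buffer_is_busy() entry point is folded into buffer_wait(), whose timeout argument selects between a non-blocking poll and a blocking wait. A sketch of the two call sites, assuming buffer_wait() returns true once the buffer is idle within the timeout:]

    if (wait) {
        /* Block until the GPU is done with the buffer. */
        r300->rws->buffer_wait(q->buf, PIPE_TIMEOUT_INFINITE,
                               RADEON_USAGE_READWRITE);
        vresult->b = TRUE;
    } else {
        /* Timeout 0 is a pure poll: true iff the buffer is idle
         * right now, i.e. the inverse of the old buffer_is_busy(). */
        vresult->b = r300->rws->buffer_wait(q->buf, 0,
                                            RADEON_USAGE_READWRITE);
    }
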
*/ @@ -427,7 +431,7 @@ static int r300_get_video_param(struct pipe_screen *screen, * Whether the format matches: * PIPE_FORMAT_?10?10?10?2_UNORM */ -static INLINE boolean +static inline boolean util_format_is_rgba1010102_variant(const struct util_format_description *desc) { static const unsigned size[4] = {10, 10, 10, 2}; @@ -660,14 +664,6 @@ static void r300_fence_reference(struct pipe_screen *screen, rws->fence_reference(ptr, fence); } -static boolean r300_fence_signalled(struct pipe_screen *screen, - struct pipe_fence_handle *fence) -{ - struct radeon_winsys *rws = r300_screen(screen)->rws; - - return rws->fence_wait(rws, fence, 0); -} - static boolean r300_fence_finish(struct pipe_screen *screen, struct pipe_fence_handle *fence, uint64_t timeout) @@ -712,7 +708,6 @@ struct pipe_screen* r300_screen_create(struct radeon_winsys *rws) r300screen->screen.is_video_format_supported = vl_video_buffer_is_format_supported; r300screen->screen.context_create = r300_create_context; r300screen->screen.fence_reference = r300_fence_reference; - r300screen->screen.fence_signalled = r300_fence_signalled; r300screen->screen.fence_finish = r300_fence_finish; r300_init_screen_resource_functions(r300screen); diff --git a/src/gallium/drivers/r300/r300_screen.h b/src/gallium/drivers/r300/r300_screen.h index 7bba39bf12b..e15c3c7de0c 100644 --- a/src/gallium/drivers/r300/r300_screen.h +++ b/src/gallium/drivers/r300/r300_screen.h @@ -51,11 +51,11 @@ struct r300_screen { /* Convenience cast wrappers. */ -static INLINE struct r300_screen* r300_screen(struct pipe_screen* screen) { +static inline struct r300_screen* r300_screen(struct pipe_screen* screen) { return (struct r300_screen*)screen; } -static INLINE struct radeon_winsys * +static inline struct radeon_winsys * radeon_winsys(struct pipe_screen *screen) { return r300_screen(screen)->rws; } @@ -102,12 +102,12 @@ radeon_winsys(struct pipe_screen *screen) { #define DBG_P_STAT (1 << 25) /*@}*/ -static INLINE boolean SCREEN_DBG_ON(struct r300_screen * screen, unsigned flags) +static inline boolean SCREEN_DBG_ON(struct r300_screen * screen, unsigned flags) { return (screen->debug & flags) ? TRUE : FALSE; } -static INLINE void SCREEN_DBG(struct r300_screen * screen, unsigned flags, +static inline void SCREEN_DBG(struct r300_screen * screen, unsigned flags, const char * fmt, ...) { if (SCREEN_DBG_ON(screen, flags)) { diff --git a/src/gallium/drivers/r300/r300_screen_buffer.c b/src/gallium/drivers/r300/r300_screen_buffer.c index de557b57776..6451a2c8df2 100644 --- a/src/gallium/drivers/r300/r300_screen_buffer.c +++ b/src/gallium/drivers/r300/r300_screen_buffer.c @@ -96,7 +96,7 @@ r300_buffer_transfer_map( struct pipe_context *context, /* Check if mapping this buffer would cause waiting for the GPU. */ if (r300->rws->cs_is_buffer_referenced(r300->cs, rbuf->cs_buf, RADEON_USAGE_READWRITE) || - r300->rws->buffer_is_busy(rbuf->buf, RADEON_USAGE_READWRITE)) { + !r300->rws->buffer_wait(rbuf->buf, 0, RADEON_USAGE_READWRITE)) { unsigned i; struct pb_buffer *new_buf; diff --git a/src/gallium/drivers/r300/r300_screen_buffer.h b/src/gallium/drivers/r300/r300_screen_buffer.h index b4c8520039b..14b849c8c93 100644 --- a/src/gallium/drivers/r300/r300_screen_buffer.h +++ b/src/gallium/drivers/r300/r300_screen_buffer.h @@ -46,7 +46,7 @@ struct pipe_resource *r300_buffer_create(struct pipe_screen *screen, /* Inline functions. 
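
[Note: removing r300_fence_signalled() above loses no functionality, since callers can get a non-blocking answer from fence_finish() with a zero timeout. A hypothetical helper, not driver code, assuming the (screen, fence, timeout) signature shown in this diff:]

    static boolean fence_is_signalled(struct pipe_screen *screen,
                                      struct pipe_fence_handle *fence)
    {
        /* Timeout 0 turns the blocking wait into a poll, which is
         * exactly what the deleted fence_signalled hook did. */
        return screen->fence_finish(screen, fence, 0);
    }
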
*/ -static INLINE struct r300_buffer *r300_buffer(struct pipe_resource *buffer) +static inline struct r300_buffer *r300_buffer(struct pipe_resource *buffer) { return (struct r300_buffer *)buffer; } diff --git a/src/gallium/drivers/r300/r300_shader_semantics.h b/src/gallium/drivers/r300/r300_shader_semantics.h index b756048c6c7..93bbc9d4a96 100644 --- a/src/gallium/drivers/r300/r300_shader_semantics.h +++ b/src/gallium/drivers/r300/r300_shader_semantics.h @@ -46,7 +46,7 @@ struct r300_shader_semantics { int num_generic; }; -static INLINE void r300_shader_semantics_reset( +static inline void r300_shader_semantics_reset( struct r300_shader_semantics* info) { int i; diff --git a/src/gallium/drivers/r300/r300_state.c b/src/gallium/drivers/r300/r300_state.c index e886df87a60..d99d5ae0152 100644 --- a/src/gallium/drivers/r300/r300_state.c +++ b/src/gallium/drivers/r300/r300_state.c @@ -844,7 +844,7 @@ static void r300_tex_set_tiling_flags(struct r300_context *r300, tex->tex.macrotile[level]) { r300->rws->buffer_set_tiling(tex->buf, r300->cs, tex->tex.microtile, tex->tex.macrotile[level], - 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, tex->tex.stride_in_bytes[0], false); tex->surface_level = level; diff --git a/src/gallium/drivers/r300/r300_state_inlines.h b/src/gallium/drivers/r300/r300_state_inlines.h index feec494c4dc..fbd91cda9fe 100644 --- a/src/gallium/drivers/r300/r300_state_inlines.h +++ b/src/gallium/drivers/r300/r300_state_inlines.h @@ -32,13 +32,13 @@ /* Some maths. These should probably find their way to u_math, if needed. */ -static INLINE int pack_float_16_6x(float f) { +static inline int pack_float_16_6x(float f) { return ((int)(f * 6.0) & 0xffff); } /* Blend state. */ -static INLINE uint32_t r300_translate_blend_function(int blend_func, +static inline uint32_t r300_translate_blend_function(int blend_func, boolean clamp) { switch (blend_func) { @@ -60,7 +60,7 @@ static INLINE uint32_t r300_translate_blend_function(int blend_func, return 0; } -static INLINE uint32_t r300_translate_blend_factor(int blend_fact) +static inline uint32_t r300_translate_blend_factor(int blend_fact) { switch (blend_fact) { case PIPE_BLENDFACTOR_ONE: @@ -113,7 +113,7 @@ static INLINE uint32_t r300_translate_blend_factor(int blend_fact) /* DSA state. */ -static INLINE uint32_t r300_translate_depth_stencil_function(int zs_func) +static inline uint32_t r300_translate_depth_stencil_function(int zs_func) { switch (zs_func) { case PIPE_FUNC_NEVER: @@ -141,7 +141,7 @@ static INLINE uint32_t r300_translate_depth_stencil_function(int zs_func) return 0; } -static INLINE uint32_t r300_translate_stencil_op(int s_op) +static inline uint32_t r300_translate_stencil_op(int s_op) { switch (s_op) { case PIPE_STENCIL_OP_KEEP: @@ -168,7 +168,7 @@ static INLINE uint32_t r300_translate_stencil_op(int s_op) return 0; } -static INLINE uint32_t r300_translate_alpha_function(int alpha_func) +static inline uint32_t r300_translate_alpha_function(int alpha_func) { switch (alpha_func) { case PIPE_FUNC_NEVER: @@ -195,7 +195,7 @@ static INLINE uint32_t r300_translate_alpha_function(int alpha_func) return 0; } -static INLINE uint32_t +static inline uint32_t r300_translate_polygon_mode_front(unsigned mode) { switch (mode) { @@ -213,7 +213,7 @@ r300_translate_polygon_mode_front(unsigned mode) { } } -static INLINE uint32_t +static inline uint32_t r300_translate_polygon_mode_back(unsigned mode) { switch (mode) { @@ -233,7 +233,7 @@ r300_translate_polygon_mode_back(unsigned mode) { /* Texture sampler state. 
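
[Note: pack_float_16_6x() above converts a float into a 16-bit fixed-point field, apparently in sixth-of-a-unit steps given the * 6.0 scale in its body. A worked check of the arithmetic:]

    /* 2.5 units * 6 = 15 -> 0x000F; the & 0xffff keeps the result
     * within the 16-bit register field. */
    assert(pack_float_16_6x(2.5f) == 15);
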
*/ -static INLINE uint32_t r300_translate_wrap(int wrap) +static inline uint32_t r300_translate_wrap(int wrap) { switch (wrap) { case PIPE_TEX_WRAP_REPEAT: @@ -259,7 +259,7 @@ static INLINE uint32_t r300_translate_wrap(int wrap) } } -static INLINE uint32_t r300_translate_tex_filters(int min, int mag, int mip, +static inline uint32_t r300_translate_tex_filters(int min, int mag, int mip, boolean is_anisotropic) { uint32_t retval = 0; @@ -308,7 +308,7 @@ static INLINE uint32_t r300_translate_tex_filters(int min, int mag, int mip, return retval; } -static INLINE uint32_t r300_anisotropy(unsigned max_aniso) +static inline uint32_t r300_anisotropy(unsigned max_aniso) { if (max_aniso >= 16) { return R300_TX_MAX_ANISO_16_TO_1; @@ -323,7 +323,7 @@ static INLINE uint32_t r300_anisotropy(unsigned max_aniso) } } -static INLINE uint32_t r500_anisotropy(unsigned max_aniso) +static inline uint32_t r500_anisotropy(unsigned max_aniso) { if (!max_aniso) { return 0; @@ -336,7 +336,7 @@ static INLINE uint32_t r500_anisotropy(unsigned max_aniso) } /* Translate pipe_formats into PSC vertex types. */ -static INLINE uint16_t +static inline uint16_t r300_translate_vertex_data_type(enum pipe_format format) { uint32_t result = 0; const struct util_format_description *desc; @@ -410,7 +410,7 @@ r300_translate_vertex_data_type(enum pipe_format format) { return result; } -static INLINE uint16_t +static inline uint16_t r300_translate_vertex_data_swizzle(enum pipe_format format) { const struct util_format_description *desc; unsigned i, swizzle = 0; diff --git a/src/gallium/drivers/r300/r300_texture.c b/src/gallium/drivers/r300/r300_texture.c index 6c01c0d21e4..5e4d50df27d 100644 --- a/src/gallium/drivers/r300/r300_texture.c +++ b/src/gallium/drivers/r300/r300_texture.c @@ -1063,7 +1063,7 @@ r300_texture_create_object(struct r300_screen *rscreen, rws->buffer_set_tiling(tex->buf, NULL, tex->tex.microtile, tex->tex.macrotile[0], - 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, tex->tex.stride_in_bytes[0], false); return tex; diff --git a/src/gallium/drivers/r300/r300_transfer.c b/src/gallium/drivers/r300/r300_transfer.c index b87164ba836..44303792f51 100644 --- a/src/gallium/drivers/r300/r300_transfer.c +++ b/src/gallium/drivers/r300/r300_transfer.c @@ -41,7 +41,7 @@ struct r300_transfer { }; /* Convenience cast wrapper. 
*/ -static INLINE struct r300_transfer* +static inline struct r300_transfer* r300_transfer(struct pipe_transfer* transfer) { return (struct r300_transfer*)transfer; @@ -120,7 +120,7 @@ r300_texture_transfer_map(struct pipe_context *ctx, referenced_hw = TRUE; } else { referenced_hw = - r300->rws->buffer_is_busy(tex->buf, RADEON_USAGE_READWRITE); + !r300->rws->buffer_wait(tex->buf, 0, RADEON_USAGE_READWRITE); } trans = CALLOC_STRUCT(r300_transfer); @@ -251,16 +251,12 @@ void r300_texture_transfer_unmap(struct pipe_context *ctx, struct r300_resource *tex = r300_resource(transfer->resource); if (trans->linear_texture) { - rws->buffer_unmap(trans->linear_texture->cs_buf); - if (transfer->usage & PIPE_TRANSFER_WRITE) { r300_copy_into_tiled_texture(ctx, trans); } pipe_resource_reference( (struct pipe_resource**)&trans->linear_texture, NULL); - } else { - rws->buffer_unmap(tex->cs_buf); } FREE(transfer); } diff --git a/src/gallium/drivers/r600/Makefile.am b/src/gallium/drivers/r600/Makefile.am index dc0d90d759b..8317da727a2 100644 --- a/src/gallium/drivers/r600/Makefile.am +++ b/src/gallium/drivers/r600/Makefile.am @@ -1,5 +1,3 @@ -AUTOMAKE_OPTIONS = subdir-objects - include Makefile.sources include $(top_srcdir)/src/gallium/Automake.inc diff --git a/src/gallium/drivers/r600/eg_asm.c b/src/gallium/drivers/r600/eg_asm.c index 295cb4d80b7..42e8b0b1761 100644 --- a/src/gallium/drivers/r600/eg_asm.c +++ b/src/gallium/drivers/r600/eg_asm.c @@ -160,6 +160,9 @@ int egcm_load_index_reg(struct r600_bytecode *bc, unsigned id, bool inside_alu_c alu.op = ALU_OP1_MOVA_INT; alu.src[0].sel = bc->index_reg[id]; alu.src[0].chan = 0; + if (bc->chip_class == CAYMAN) + alu.dst.sel = id == 0 ? CM_V_SQ_MOVA_DST_CF_IDX0 : CM_V_SQ_MOVA_DST_CF_IDX1; + alu.last = 1; r = r600_bytecode_add_alu(bc, &alu); if (r) @@ -167,12 +170,14 @@ int egcm_load_index_reg(struct r600_bytecode *bc, unsigned id, bool inside_alu_c bc->ar_loaded = 0; /* clobbered */ - memset(&alu, 0, sizeof(alu)); - alu.op = id == 0 ? ALU_OP0_SET_CF_IDX0 : ALU_OP0_SET_CF_IDX1; - alu.last = 1; - r = r600_bytecode_add_alu(bc, &alu); - if (r) - return r; + if (bc->chip_class == EVERGREEN) { + memset(&alu, 0, sizeof(alu)); + alu.op = id == 0 ? 
ALU_OP0_SET_CF_IDX0 : ALU_OP0_SET_CF_IDX1; + alu.last = 1; + r = r600_bytecode_add_alu(bc, &alu); + if (r) + return r; + } /* Must split ALU group as index only applies to following group */ if (inside_alu_clause) { diff --git a/src/gallium/drivers/r600/eg_sq.h b/src/gallium/drivers/r600/eg_sq.h index b534872f062..97e230f56c7 100644 --- a/src/gallium/drivers/r600/eg_sq.h +++ b/src/gallium/drivers/r600/eg_sq.h @@ -521,4 +521,11 @@ #define V_SQ_REL_ABSOLUTE 0 #define V_SQ_REL_RELATIVE 1 + +/* CAYMAN has special encoding for MOVA_INT destination */ +#define CM_V_SQ_MOVA_DST_AR_X 0 +#define CM_V_SQ_MOVA_DST_CF_PC 1 +#define CM_V_SQ_MOVA_DST_CF_IDX0 2 +#define CM_V_SQ_MOVA_DST_CF_IDX1 3 + #endif diff --git a/src/gallium/drivers/r600/evergreen_compute.c b/src/gallium/drivers/r600/evergreen_compute.c index 4c3c34cd664..c52e43e9c2a 100644 --- a/src/gallium/drivers/r600/evergreen_compute.c +++ b/src/gallium/drivers/r600/evergreen_compute.c @@ -163,7 +163,7 @@ static void evergreen_cs_set_vertex_buffer( rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE; state->enabled_mask |= 1 << vb_index; state->dirty_mask |= 1 << vb_index; - state->atom.dirty = true; + r600_mark_atom_dirty(rctx, &state->atom); } static void evergreen_cs_set_constant_buffer( @@ -226,7 +226,7 @@ void *evergreen_create_compute_state( } #else memset(&shader->binary, 0, sizeof(shader->binary)); - radeon_elf_read(code, header->num_bytes, &shader->binary, true); + radeon_elf_read(code, header->num_bytes, &shader->binary); r600_create_shader(&shader->bc, &shader->binary, &use_kill); shader->code_bo = r600_compute_buffer_alloc_vram(ctx->screen, @@ -487,6 +487,12 @@ static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout, /* Emit constant buffer state */ r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom); + /* Emit sampler state */ + r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].states.atom); + + /* Emit sampler view (texture resource) state */ + r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].views.atom); + /* Emit compute shader state */ r600_emit_atom(ctx, &ctx->cs_shader_state.atom); @@ -655,25 +661,6 @@ static void evergreen_set_compute_resources(struct pipe_context * ctx_, } } -void evergreen_set_cs_sampler_view(struct pipe_context *ctx_, - unsigned start_slot, unsigned count, - struct pipe_sampler_view **views) -{ - struct r600_pipe_sampler_view **resource = - (struct r600_pipe_sampler_view **)views; - - for (unsigned i = 0; i < count; i++) { - if (resource[i]) { - assert(i+1 < 12); - /* XXX: Implement */ - assert(!"Compute samplers not implemented."); - ///FETCH0 = VTX0 (param buffer), - //FETCH1 = VTX1 (global buffer pool), FETCH2... 
= TEX - } - } -} - - static void evergreen_set_global_binding( struct pipe_context *ctx_, unsigned first, unsigned n, struct pipe_resource **resources, diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c index 4ddbc0beba5..6a91d4709f4 100644 --- a/src/gallium/drivers/r600/evergreen_state.c +++ b/src/gallium/drivers/r600/evergreen_state.c @@ -32,7 +32,7 @@ #include "evergreen_compute.h" #include "util/u_math.h" -static INLINE unsigned evergreen_array_mode(unsigned mode) +static inline unsigned evergreen_array_mode(unsigned mode) { switch (mode) { case RADEON_SURF_MODE_LINEAR_ALIGNED: return V_028C70_ARRAY_LINEAR_ALIGNED; @@ -485,7 +485,7 @@ static void *evergreen_create_rs_state(struct pipe_context *ctx, /* offset */ rs->offset_units = state->offset_units; - rs->offset_scale = state->offset_scale * 12.0f; + rs->offset_scale = state->offset_scale * 16.0f; rs->offset_enable = state->offset_point || state->offset_line || state->offset_tri; if (state->point_size_per_vertex) { @@ -896,7 +896,7 @@ static void evergreen_set_scissor_states(struct pipe_context *ctx, for (i = start_slot; i < start_slot + num_scissors; i++) { rctx->scissor[i].scissor = state[i - start_slot]; - rctx->scissor[i].atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->scissor[i].atom); } } @@ -1028,7 +1028,10 @@ void evergreen_init_color_surface(struct r600_context *rctx, macro_aspect = rtex->surface.mtilea; bankw = rtex->surface.bankw; bankh = rtex->surface.bankh; - fmask_bankh = rtex->fmask.bank_height; + if (rtex->fmask.size) + fmask_bankh = rtex->fmask.bank_height; + else + fmask_bankh = rtex->surface.bankh; tile_split = eg_tile_split(tile_split); macro_aspect = eg_macro_tile_aspect(macro_aspect); bankw = eg_bank_wh(bankw); @@ -1149,10 +1152,11 @@ void evergreen_init_color_surface(struct r600_context *rctx, surf->cb_color_attrib = color_attrib; if (rtex->fmask.size) { surf->cb_color_fmask = (base_offset + rtex->fmask.offset) >> 8; + surf->cb_color_fmask_slice = S_028C88_TILE_MAX(rtex->fmask.slice_tile_max); } else { surf->cb_color_fmask = surf->cb_color_base; + surf->cb_color_fmask_slice = S_028C88_TILE_MAX(slice); } - surf->cb_color_fmask_slice = S_028C88_TILE_MAX(rtex->fmask.slice_tile_max); surf->color_initialized = true; } @@ -1342,11 +1346,11 @@ static void evergreen_set_framebuffer_state(struct pipe_context *ctx, if (rctx->alphatest_state.bypass != alphatest_bypass) { rctx->alphatest_state.bypass = alphatest_bypass; - rctx->alphatest_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->alphatest_state.atom); } if (rctx->alphatest_state.cb0_export_16bpc != export_16bpc) { rctx->alphatest_state.cb0_export_16bpc = export_16bpc; - rctx->alphatest_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->alphatest_state.atom); } } @@ -1362,28 +1366,28 @@ static void evergreen_set_framebuffer_state(struct pipe_context *ctx, if (state->zsbuf->format != rctx->poly_offset_state.zs_format) { rctx->poly_offset_state.zs_format = state->zsbuf->format; - rctx->poly_offset_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->poly_offset_state.atom); } if (rctx->db_state.rsurf != surf) { rctx->db_state.rsurf = surf; - rctx->db_state.atom.dirty = true; - rctx->db_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->db_state.atom); + r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom); } } else if (rctx->db_state.rsurf) { rctx->db_state.rsurf = NULL; - rctx->db_state.atom.dirty = true; - rctx->db_misc_state.atom.dirty = true; + 
r600_mark_atom_dirty(rctx, &rctx->db_state.atom); + r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom); } if (rctx->cb_misc_state.nr_cbufs != state->nr_cbufs) { rctx->cb_misc_state.nr_cbufs = state->nr_cbufs; - rctx->cb_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->cb_misc_state.atom); } if (state->nr_cbufs == 0 && rctx->alphatest_state.bypass) { rctx->alphatest_state.bypass = false; - rctx->alphatest_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->alphatest_state.atom); } log_samples = util_logbase2(rctx->framebuffer.nr_samples); @@ -1392,7 +1396,7 @@ static void evergreen_set_framebuffer_state(struct pipe_context *ctx, rctx->b.family == CHIP_RV770) && rctx->db_misc_state.log_samples != log_samples) { rctx->db_misc_state.log_samples = log_samples; - rctx->db_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom); } @@ -1420,7 +1424,7 @@ static void evergreen_set_framebuffer_state(struct pipe_context *ctx, rctx->framebuffer.atom.num_dw += 4; } - rctx->framebuffer.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->framebuffer.atom); r600_set_sample_locations_constant_buffer(rctx); } @@ -1434,7 +1438,7 @@ static void evergreen_set_min_samples(struct pipe_context *ctx, unsigned min_sam rctx->ps_iter_samples = min_samples; if (rctx->framebuffer.nr_samples > 1) { - rctx->framebuffer.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->framebuffer.atom); } } @@ -1732,10 +1736,10 @@ static void evergreen_emit_cb_misc_state(struct r600_context *rctx, struct r600_ r600_write_context_reg_seq(cs, R_028238_CB_TARGET_MASK, 2); radeon_emit(cs, a->blend_colormask & fb_colormask); /* R_028238_CB_TARGET_MASK */ - /* Always enable the first colorbuffer in CB_SHADER_MASK. This - * will assure that the alpha-test will work even if there is - * no colorbuffer bound. */ - radeon_emit(cs, 0xf | (a->dual_src_blend ? ps_colormask : 0) | fb_colormask); /* R_02823C_CB_SHADER_MASK */ + /* This must match the used export instructions exactly. + * Other values may lead to undefined behavior and hangs. + */ + radeon_emit(cs, ps_colormask); /* R_02823C_CB_SHADER_MASK */ } static void evergreen_emit_db_state(struct r600_context *rctx, struct r600_atom *atom) @@ -1980,7 +1984,7 @@ static void evergreen_emit_cs_constant_buffers(struct r600_context *rctx, struct static void evergreen_emit_sampler_views(struct r600_context *rctx, struct r600_samplerview_state *state, - unsigned resource_id_base) + unsigned resource_id_base, unsigned pkt_flags) { struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; uint32_t dirty_mask = state->dirty_mask; @@ -1993,7 +1997,7 @@ static void evergreen_emit_sampler_views(struct r600_context *rctx, rview = state->views[resource_index]; assert(rview); - radeon_emit(cs, PKT3(PKT3_SET_RESOURCE, 8, 0)); + radeon_emit(cs, PKT3(PKT3_SET_RESOURCE, 8, 0) | pkt_flags); radeon_emit(cs, (resource_id_base + resource_index) * 8); radeon_emit_array(cs, rview->tex_resource_words, 8); @@ -2002,11 +2006,11 @@ static void evergreen_emit_sampler_views(struct r600_context *rctx, rview->tex_resource->b.b.nr_samples > 1 ? 
RADEON_PRIO_SHADER_TEXTURE_MSAA : RADEON_PRIO_SHADER_TEXTURE_RO); - radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); + radeon_emit(cs, PKT3(PKT3_NOP, 0, 0) | pkt_flags); radeon_emit(cs, reloc); if (!rview->skip_mip_address_reloc) { - radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); + radeon_emit(cs, PKT3(PKT3_NOP, 0, 0) | pkt_flags); radeon_emit(cs, reloc); } } @@ -2015,23 +2019,33 @@ static void evergreen_emit_sampler_views(struct r600_context *rctx, static void evergreen_emit_vs_sampler_views(struct r600_context *rctx, struct r600_atom *atom) { - evergreen_emit_sampler_views(rctx, &rctx->samplers[PIPE_SHADER_VERTEX].views, 176 + R600_MAX_CONST_BUFFERS); + evergreen_emit_sampler_views(rctx, &rctx->samplers[PIPE_SHADER_VERTEX].views, + 176 + R600_MAX_CONST_BUFFERS, 0); } static void evergreen_emit_gs_sampler_views(struct r600_context *rctx, struct r600_atom *atom) { - evergreen_emit_sampler_views(rctx, &rctx->samplers[PIPE_SHADER_GEOMETRY].views, 336 + R600_MAX_CONST_BUFFERS); + evergreen_emit_sampler_views(rctx, &rctx->samplers[PIPE_SHADER_GEOMETRY].views, + 336 + R600_MAX_CONST_BUFFERS, 0); } static void evergreen_emit_ps_sampler_views(struct r600_context *rctx, struct r600_atom *atom) { - evergreen_emit_sampler_views(rctx, &rctx->samplers[PIPE_SHADER_FRAGMENT].views, R600_MAX_CONST_BUFFERS); + evergreen_emit_sampler_views(rctx, &rctx->samplers[PIPE_SHADER_FRAGMENT].views, + R600_MAX_CONST_BUFFERS, 0); +} + +static void evergreen_emit_cs_sampler_views(struct r600_context *rctx, struct r600_atom *atom) +{ + evergreen_emit_sampler_views(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views, + 816 + 2, RADEON_CP_PACKET3_COMPUTE_MODE); } static void evergreen_emit_sampler_states(struct r600_context *rctx, struct r600_textures_info *texinfo, unsigned resource_id_base, - unsigned border_index_reg) + unsigned border_index_reg, + unsigned pkt_flags) { struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; uint32_t dirty_mask = texinfo->states.dirty_mask; @@ -2043,7 +2057,7 @@ static void evergreen_emit_sampler_states(struct r600_context *rctx, rstate = texinfo->states.states[i]; assert(rstate); - radeon_emit(cs, PKT3(PKT3_SET_SAMPLER, 3, 0)); + radeon_emit(cs, PKT3(PKT3_SET_SAMPLER, 3, 0) | pkt_flags); radeon_emit(cs, (resource_id_base + i) * 3); radeon_emit_array(cs, rstate->tex_sampler_words, 3); @@ -2058,17 +2072,27 @@ static void evergreen_emit_sampler_states(struct r600_context *rctx, static void evergreen_emit_vs_sampler_states(struct r600_context *rctx, struct r600_atom *atom) { - evergreen_emit_sampler_states(rctx, &rctx->samplers[PIPE_SHADER_VERTEX], 18, R_00A414_TD_VS_SAMPLER0_BORDER_INDEX); + evergreen_emit_sampler_states(rctx, &rctx->samplers[PIPE_SHADER_VERTEX], 18, + R_00A414_TD_VS_SAMPLER0_BORDER_INDEX, 0); } static void evergreen_emit_gs_sampler_states(struct r600_context *rctx, struct r600_atom *atom) { - evergreen_emit_sampler_states(rctx, &rctx->samplers[PIPE_SHADER_GEOMETRY], 36, R_00A428_TD_GS_SAMPLER0_BORDER_INDEX); + evergreen_emit_sampler_states(rctx, &rctx->samplers[PIPE_SHADER_GEOMETRY], 36, + R_00A428_TD_GS_SAMPLER0_BORDER_INDEX, 0); } static void evergreen_emit_ps_sampler_states(struct r600_context *rctx, struct r600_atom *atom) { - evergreen_emit_sampler_states(rctx, &rctx->samplers[PIPE_SHADER_FRAGMENT], 0, R_00A400_TD_PS_SAMPLER0_BORDER_INDEX); + evergreen_emit_sampler_states(rctx, &rctx->samplers[PIPE_SHADER_FRAGMENT], 0, + R_00A400_TD_PS_SAMPLER0_BORDER_INDEX, 0); +} + +static void evergreen_emit_cs_sampler_states(struct r600_context *rctx, struct r600_atom *atom) +{ + 
evergreen_emit_sampler_states(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE], 90, + R_00A464_TD_CS_SAMPLER0_BORDER_INDEX, + RADEON_CP_PACKET3_COMPUTE_MODE); } static void evergreen_emit_sample_mask(struct r600_context *rctx, struct r600_atom *a) @@ -3176,7 +3200,7 @@ void evergreen_update_db_shader_control(struct r600_context * rctx) if (db_shader_control != rctx->db_misc_state.db_shader_control) { rctx->db_misc_state.db_shader_control = db_shader_control; - rctx->db_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom); } } @@ -3431,12 +3455,14 @@ void evergreen_init_state_functions(struct r600_context *rctx) r600_init_atom(rctx, &rctx->samplers[PIPE_SHADER_VERTEX].states.atom, id++, evergreen_emit_vs_sampler_states, 0); r600_init_atom(rctx, &rctx->samplers[PIPE_SHADER_GEOMETRY].states.atom, id++, evergreen_emit_gs_sampler_states, 0); r600_init_atom(rctx, &rctx->samplers[PIPE_SHADER_FRAGMENT].states.atom, id++, evergreen_emit_ps_sampler_states, 0); + r600_init_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom, id++, evergreen_emit_cs_sampler_states, 0); /* resources */ r600_init_atom(rctx, &rctx->vertex_buffer_state.atom, id++, evergreen_fs_emit_vertex_buffers, 0); r600_init_atom(rctx, &rctx->cs_vertex_buffer_state.atom, id++, evergreen_cs_emit_vertex_buffers, 0); r600_init_atom(rctx, &rctx->samplers[PIPE_SHADER_VERTEX].views.atom, id++, evergreen_emit_vs_sampler_views, 0); r600_init_atom(rctx, &rctx->samplers[PIPE_SHADER_GEOMETRY].views.atom, id++, evergreen_emit_gs_sampler_views, 0); r600_init_atom(rctx, &rctx->samplers[PIPE_SHADER_FRAGMENT].views.atom, id++, evergreen_emit_ps_sampler_views, 0); + r600_init_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom, id++, evergreen_emit_cs_sampler_views, 0); r600_init_atom(rctx, &rctx->vgt_state.atom, id++, r600_emit_vgt_state, 10); @@ -3466,8 +3492,8 @@ void evergreen_init_state_functions(struct r600_context *rctx) } r600_init_atom(rctx, &rctx->stencil_ref.atom, id++, r600_emit_stencil_ref, 4); r600_init_atom(rctx, &rctx->vertex_fetch_shader.atom, id++, evergreen_emit_vertex_fetch_shader, 5); - rctx->atoms[id++] = &rctx->b.streamout.begin_atom; - rctx->atoms[id++] = &rctx->b.streamout.enable_atom; + r600_add_atom(rctx, &rctx->b.streamout.begin_atom, id++); + r600_add_atom(rctx, &rctx->b.streamout.enable_atom, id++); r600_init_atom(rctx, &rctx->vertex_shader.atom, id++, r600_emit_shader, 23); r600_init_atom(rctx, &rctx->pixel_shader.atom, id++, r600_emit_shader, 0); r600_init_atom(rctx, &rctx->geometry_shader.atom, id++, r600_emit_shader, 0); diff --git a/src/gallium/drivers/r600/evergreend.h b/src/gallium/drivers/r600/evergreend.h index cd4ff46b103..ad6ad434b78 100644 --- a/src/gallium/drivers/r600/evergreend.h +++ b/src/gallium/drivers/r600/evergreend.h @@ -1253,6 +1253,11 @@ #define R_00A430_TD_GS_SAMPLER0_BORDER_GREEN 0x00A430 #define R_00A434_TD_GS_SAMPLER0_BORDER_BLUE 0x00A434 #define R_00A438_TD_GS_SAMPLER0_BORDER_ALPHA 0x00A438 +#define R_00A464_TD_CS_SAMPLER0_BORDER_INDEX 0x00A464 +#define R_00A468_TD_CS_SAMPLER0_BORDER_RED 0x00A468 +#define R_00A46C_TD_CS_SAMPLER0_BORDER_GREEN 0x00A46C +#define R_00A470_TD_CS_SAMPLER0_BORDER_BLUE 0x00A470 +#define R_00A474_TD_CS_SAMPLER0_BORDER_ALPHA 0x00A474 #define R_03C000_SQ_TEX_SAMPLER_WORD0_0 0x03C000 #define S_03C000_CLAMP_X(x) (((x) & 0x7) << 0) diff --git a/src/gallium/drivers/r600/r600_blit.c b/src/gallium/drivers/r600/r600_blit.c index 01262a59e90..b0002c3b50f 100644 --- a/src/gallium/drivers/r600/r600_blit.c +++ 
b/src/gallium/drivers/r600/r600_blit.c @@ -145,7 +145,7 @@ static void r600_blit_decompress_depth(struct pipe_context *ctx, rctx->db_misc_state.copy_depth = util_format_has_depth(desc); rctx->db_misc_state.copy_stencil = util_format_has_stencil(desc); rctx->db_misc_state.copy_sample = first_sample; - rctx->db_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom); for (level = first_level; level <= last_level; level++) { if (!staging && !(texture->dirty_level_mask & (1 << level))) @@ -162,7 +162,7 @@ static void r600_blit_decompress_depth(struct pipe_context *ctx, if (sample != rctx->db_misc_state.copy_sample) { rctx->db_misc_state.copy_sample = sample; - rctx->db_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom); } surf_tmpl.format = texture->resource.b.b.format; @@ -197,7 +197,7 @@ static void r600_blit_decompress_depth(struct pipe_context *ctx, /* reenable compression in DB_RENDER_CONTROL */ rctx->db_misc_state.flush_depthstencil_through_cb = false; - rctx->db_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom); } static void r600_blit_decompress_depth_in_place(struct r600_context *rctx, @@ -210,7 +210,7 @@ static void r600_blit_decompress_depth_in_place(struct r600_context *rctx, /* Enable decompression in DB_RENDER_CONTROL */ rctx->db_misc_state.flush_depthstencil_in_place = true; - rctx->db_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom); surf_tmpl.format = texture->resource.b.b.format; @@ -248,7 +248,7 @@ static void r600_blit_decompress_depth_in_place(struct r600_context *rctx, /* Disable decompression in DB_RENDER_CONTROL */ rctx->db_misc_state.flush_depthstencil_in_place = false; - rctx->db_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom); } void r600_decompress_depth_textures(struct r600_context *rctx, @@ -396,6 +396,8 @@ static void r600_clear(struct pipe_context *ctx, unsigned buffers, if (buffers & PIPE_CLEAR_COLOR && rctx->b.chip_class >= EVERGREEN) { evergreen_do_fast_color_clear(&rctx->b, fb, &rctx->framebuffer.atom, &buffers, color); + if (!buffers) + return; /* all buffers have been fast cleared */ } if (buffers & PIPE_CLEAR_COLOR) { @@ -435,10 +437,10 @@ static void r600_clear(struct pipe_context *ctx, unsigned buffers, fb->zsbuf->u.tex.last_layer == util_max_layer(&rtex->resource.b.b, level)) { if (rtex->depth_clear_value != depth) { rtex->depth_clear_value = depth; - rctx->db_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->db_state.atom); } rctx->db_misc_state.htile_clear = true; - rctx->db_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom); } } @@ -451,7 +453,7 @@ static void r600_clear(struct pipe_context *ctx, unsigned buffers, /* disable fast clear */ if (rctx->db_misc_state.htile_clear) { rctx->db_misc_state.htile_clear = false; - rctx->db_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom); } } diff --git a/src/gallium/drivers/r600/r600_formats.h b/src/gallium/drivers/r600/r600_formats.h index fa374d92e6f..9533aaa1378 100644 --- a/src/gallium/drivers/r600/r600_formats.h +++ b/src/gallium/drivers/r600/r600_formats.h @@ -64,7 +64,7 @@ #define ENDIAN_8IN32 2 #define ENDIAN_8IN64 3 -static INLINE unsigned r600_endian_swap(unsigned size) +static inline unsigned r600_endian_swap(unsigned size) { if (R600_BIG_ENDIAN) { switch (size) { @@ -82,7 +82,7 @@ static INLINE unsigned r600_endian_swap(unsigned size) } } 
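
[Note: the early return added to r600_clear() above relies on evergreen_do_fast_color_clear() clearing bits out of the `buffers` mask for every surface it managed to fast-clear, so the slow clear path only runs for whatever remains:]

    if (buffers & PIPE_CLEAR_COLOR && rctx->b.chip_class >= EVERGREEN) {
        /* May remove PIPE_CLEAR_COLOR bits from `buffers`. */
        evergreen_do_fast_color_clear(&rctx->b, fb,
                                      &rctx->framebuffer.atom,
                                      &buffers, color);
        if (!buffers)
            return; /* all buffers have been fast cleared */
    }
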
-static INLINE bool r600_is_vertex_format_supported(enum pipe_format format) +static inline bool r600_is_vertex_format_supported(enum pipe_format format) { const struct util_format_description *desc = util_format_description(format); unsigned i; diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c index 8eb0c6806b9..64451516c23 100644 --- a/src/gallium/drivers/r600/r600_hw_context.c +++ b/src/gallium/drivers/r600/r600_hw_context.c @@ -51,13 +51,13 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw, unsigned i; /* The number of dwords all the dirty states would take. */ - for (i = 0; i < R600_NUM_ATOMS; i++) { - if (ctx->atoms[i] && ctx->atoms[i]->dirty) { - num_dw += ctx->atoms[i]->num_dw; - if (ctx->screen->b.trace_bo) { - num_dw += R600_TRACE_CS_DWORDS; - } + i = r600_next_dirty_atom(ctx, 0); + while (i < R600_NUM_ATOMS) { + num_dw += ctx->atoms[i]->num_dw; + if (ctx->screen->b.trace_bo) { + num_dw += R600_TRACE_CS_DWORDS; } + i = r600_next_dirty_atom(ctx, i + 1); } /* The upper-bound of how much space a draw command would take. */ @@ -68,7 +68,8 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw, } /* Count in queries_suspend. */ - num_dw += ctx->b.num_cs_dw_nontimer_queries_suspend; + num_dw += ctx->b.num_cs_dw_nontimer_queries_suspend + + ctx->b.num_cs_dw_timer_queries_suspend; /* Count in streamout_end at the end of CS. */ if (ctx->b.streamout.begin_emitted) { @@ -92,7 +93,7 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw, num_dw += 10; /* Flush if there's not enough space. */ - if (num_dw > RADEON_MAX_CMDBUF_DWORDS) { + if (num_dw > ctx->b.rings.gfx.cs->max_dw) { ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); } } @@ -295,43 +296,45 @@ void r600_begin_new_cs(struct r600_context *ctx) r600_emit_command_buffer(ctx->b.rings.gfx.cs, &ctx->start_cs_cmd); /* Re-emit states. 
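
[Note: the bulk of this patch converts direct `atom.dirty = true` writes into r600_mark_atom_dirty() calls. The point is an invariant: atom->dirty and the per-context dirty_atoms bitmask (added in r600_pipe.h further down) must always change together, so r600_need_cs_space() above can walk only the set bits instead of scanning every atom slot. Condensed from r600_set_atom_dirty() in this diff:]

    /* Keep the flag and the bitmask in sync -- never write one alone. */
    atom->dirty = dirty;
    if (dirty)
        rctx->dirty_atoms[atom->id / R600_DIRTY_ATOM_WORD_BITS] |=
            1ul << (atom->id % R600_DIRTY_ATOM_WORD_BITS);
    else
        rctx->dirty_atoms[atom->id / R600_DIRTY_ATOM_WORD_BITS] &=
            ~(1ul << (atom->id % R600_DIRTY_ATOM_WORD_BITS));
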
*/ - ctx->alphatest_state.atom.dirty = true; - ctx->blend_color.atom.dirty = true; - ctx->cb_misc_state.atom.dirty = true; - ctx->clip_misc_state.atom.dirty = true; - ctx->clip_state.atom.dirty = true; - ctx->db_misc_state.atom.dirty = true; - ctx->db_state.atom.dirty = true; - ctx->framebuffer.atom.dirty = true; - ctx->pixel_shader.atom.dirty = true; - ctx->poly_offset_state.atom.dirty = true; - ctx->vgt_state.atom.dirty = true; - ctx->sample_mask.atom.dirty = true; + r600_mark_atom_dirty(ctx, &ctx->alphatest_state.atom); + r600_mark_atom_dirty(ctx, &ctx->blend_color.atom); + r600_mark_atom_dirty(ctx, &ctx->cb_misc_state.atom); + r600_mark_atom_dirty(ctx, &ctx->clip_misc_state.atom); + r600_mark_atom_dirty(ctx, &ctx->clip_state.atom); + r600_mark_atom_dirty(ctx, &ctx->db_misc_state.atom); + r600_mark_atom_dirty(ctx, &ctx->db_state.atom); + r600_mark_atom_dirty(ctx, &ctx->framebuffer.atom); + r600_mark_atom_dirty(ctx, &ctx->pixel_shader.atom); + r600_mark_atom_dirty(ctx, &ctx->poly_offset_state.atom); + r600_mark_atom_dirty(ctx, &ctx->vgt_state.atom); + r600_mark_atom_dirty(ctx, &ctx->sample_mask.atom); for (i = 0; i < R600_MAX_VIEWPORTS; i++) { - ctx->scissor[i].atom.dirty = true; - ctx->viewport[i].atom.dirty = true; - } - ctx->config_state.atom.dirty = true; - ctx->stencil_ref.atom.dirty = true; - ctx->vertex_fetch_shader.atom.dirty = true; - ctx->export_shader.atom.dirty = true; - ctx->shader_stages.atom.dirty = true; + r600_mark_atom_dirty(ctx, &ctx->scissor[i].atom); + r600_mark_atom_dirty(ctx, &ctx->viewport[i].atom); + } + if (ctx->b.chip_class < EVERGREEN) { + r600_mark_atom_dirty(ctx, &ctx->config_state.atom); + } + r600_mark_atom_dirty(ctx, &ctx->stencil_ref.atom); + r600_mark_atom_dirty(ctx, &ctx->vertex_fetch_shader.atom); + r600_mark_atom_dirty(ctx, &ctx->export_shader.atom); + r600_mark_atom_dirty(ctx, &ctx->shader_stages.atom); if (ctx->gs_shader) { - ctx->geometry_shader.atom.dirty = true; - ctx->gs_rings.atom.dirty = true; + r600_mark_atom_dirty(ctx, &ctx->geometry_shader.atom); + r600_mark_atom_dirty(ctx, &ctx->gs_rings.atom); } - ctx->vertex_shader.atom.dirty = true; - ctx->b.streamout.enable_atom.dirty = true; + r600_mark_atom_dirty(ctx, &ctx->vertex_shader.atom); + r600_mark_atom_dirty(ctx, &ctx->b.streamout.enable_atom); if (ctx->blend_state.cso) - ctx->blend_state.atom.dirty = true; + r600_mark_atom_dirty(ctx, &ctx->blend_state.atom); if (ctx->dsa_state.cso) - ctx->dsa_state.atom.dirty = true; + r600_mark_atom_dirty(ctx, &ctx->dsa_state.atom); if (ctx->rasterizer_state.cso) - ctx->rasterizer_state.atom.dirty = true; + r600_mark_atom_dirty(ctx, &ctx->rasterizer_state.atom); if (ctx->b.chip_class <= R700) { - ctx->seamless_cube_map.atom.dirty = true; + r600_mark_atom_dirty(ctx, &ctx->seamless_cube_map.atom); } ctx->vertex_buffer_state.dirty_mask = ctx->vertex_buffer_state.enabled_mask; diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c index 72e2dc42f7e..faf538ccbb5 100644 --- a/src/gallium/drivers/r600/r600_llvm.c +++ b/src/gallium/drivers/r600/r600_llvm.c @@ -84,7 +84,7 @@ static void llvm_load_system_value( #else LLVMValueRef reg = lp_build_const_int32( ctx->soa.bld_base.base.gallivm, chan); - ctx->system_values[index] = build_intrinsic( + ctx->system_values[index] = lp_build_intrinsic( ctx->soa.bld_base.base.gallivm->builder, "llvm.R600.load.input", ctx->soa.bld_base.base.elem_type, ®, 1, @@ -111,9 +111,9 @@ llvm_load_input_vector( Args[ArgCount++] = LLVMBuildExtractElement(ctx->gallivm.builder, IJIndex, 
lp_build_const_int32(&(ctx->gallivm), 2 * (ijregs % 2) + 1), ""); LLVMValueRef HalfVec[2] = { - build_intrinsic(ctx->gallivm.builder, "llvm.R600.interp.xy", + lp_build_intrinsic(ctx->gallivm.builder, "llvm.R600.interp.xy", VecType, Args, ArgCount, LLVMReadNoneAttribute), - build_intrinsic(ctx->gallivm.builder, "llvm.R600.interp.zw", + lp_build_intrinsic(ctx->gallivm.builder, "llvm.R600.interp.zw", VecType, Args, ArgCount, LLVMReadNoneAttribute) }; LLVMValueRef MaskInputs[4] = { @@ -127,7 +127,7 @@ llvm_load_input_vector( Mask, ""); } else { VecType = LLVMVectorType(ctx->soa.bld_base.base.elem_type, 4); - return build_intrinsic(ctx->gallivm.builder, "llvm.R600.interp.const", + return lp_build_intrinsic(ctx->gallivm.builder, "llvm.R600.interp.const", VecType, Args, ArgCount, LLVMReadNoneAttribute); } } @@ -153,7 +153,7 @@ llvm_load_input_helper( arg_count = 1; } - return build_intrinsic(bb->gallivm->builder, intrinsic, + return lp_build_intrinsic(bb->gallivm->builder, intrinsic, bb->elem_type, &arg[0], arg_count, LLVMReadNoneAttribute); } #endif @@ -332,7 +332,7 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base) args[2] = lp_build_const_int32(base->gallivm, so->output[i].output_buffer); args[3] = lp_build_const_int32(base->gallivm, ((1 << num_components) - 1) << start_component); lp_build_intrinsic(base->gallivm->builder, "llvm.R600.store.stream.output", - LLVMVoidTypeInContext(base->gallivm->context), args, 4); + LLVMVoidTypeInContext(base->gallivm->context), args, 4, 0); } } @@ -356,7 +356,7 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base) args[0] = output; args[1] = lp_build_const_int32(base->gallivm, next_pos++); args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS); - build_intrinsic( + lp_build_intrinsic( base->gallivm->builder, "llvm.R600.store.swizzle", LLVMVoidTypeInContext(base->gallivm->context), @@ -373,7 +373,7 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base) LLVMValueRef base_vector = llvm_load_const_buffer(bld_base, offset, CONSTANT_BUFFER_1_ADDR_SPACE); args[0] = output; args[1] = base_vector; - adjusted_elements[chan] = build_intrinsic(base->gallivm->builder, + adjusted_elements[chan] = lp_build_intrinsic(base->gallivm->builder, "llvm.AMDGPU.dp4", bld_base->base.elem_type, args, 2, LLVMReadNoneAttribute); } @@ -381,7 +381,7 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base) adjusted_elements, 4); args[1] = lp_build_const_int32(base->gallivm, next_pos++); args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS); - build_intrinsic( + lp_build_intrinsic( base->gallivm->builder, "llvm.R600.store.swizzle", LLVMVoidTypeInContext(base->gallivm->context), @@ -394,14 +394,14 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base) args[0] = output; args[1] = lp_build_const_int32(base->gallivm, next_pos++); args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS); - build_intrinsic( + lp_build_intrinsic( base->gallivm->builder, "llvm.R600.store.swizzle", LLVMVoidTypeInContext(base->gallivm->context), args, 3, 0); args[1] = lp_build_const_int32(base->gallivm, next_param++); args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM); - build_intrinsic( + lp_build_intrinsic( base->gallivm->builder, "llvm.R600.store.swizzle", LLVMVoidTypeInContext(base->gallivm->context), @@ -418,7 +418,7 @@ static void llvm_emit_epilogue(struct 
lp_build_tgsi_context * bld_base) args[0] = lp_build_gather_values(base->gallivm, elements, 4); args[1] = lp_build_const_int32(base->gallivm, next_param++); args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM); - build_intrinsic( + lp_build_intrinsic( base->gallivm->builder, "llvm.R600.store.swizzle", LLVMVoidTypeInContext(base->gallivm->context), @@ -430,7 +430,7 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base) args[0] = output; args[1] = lp_build_const_int32(base->gallivm, next_param++); args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM); - build_intrinsic( + lp_build_intrinsic( base->gallivm->builder, "llvm.R600.store.swizzle", LLVMVoidTypeInContext(base->gallivm->context), @@ -449,7 +449,7 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base) for (unsigned j = 0; j < ctx->color_buffer_count; j++) { args[1] = lp_build_const_int32(base->gallivm, j); args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL); - build_intrinsic( + lp_build_intrinsic( base->gallivm->builder, "llvm.R600.store.swizzle", LLVMVoidTypeInContext(base->gallivm->context), @@ -458,7 +458,7 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base) } else { args[1] = lp_build_const_int32(base->gallivm, color_count++); args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL); - build_intrinsic( + lp_build_intrinsic( base->gallivm->builder, "llvm.R600.store.swizzle", LLVMVoidTypeInContext(base->gallivm->context), @@ -543,7 +543,7 @@ static void llvm_emit_tex( case TGSI_OPCODE_TXF: { args[0] = LLVMBuildExtractElement(gallivm->builder, emit_data->args[0], lp_build_const_int32(gallivm, 0), ""); args[1] = lp_build_const_int32(gallivm, R600_MAX_CONST_BUFFERS); - emit_data->output[0] = build_intrinsic(gallivm->builder, + emit_data->output[0] = lp_build_intrinsic(gallivm->builder, "llvm.R600.load.texbuf", emit_data->dst_type, args, 2, LLVMReadNoneAttribute); if (ctx->chip_class >= EVERGREEN) @@ -658,7 +658,7 @@ static void llvm_emit_tex( lp_build_const_int32(gallivm, 1), lp_build_const_int32(gallivm, 1) }; - LLVMValueRef ptr = build_intrinsic(gallivm->builder, + LLVMValueRef ptr = lp_build_intrinsic(gallivm->builder, "llvm.R600.ldptr", emit_data->dst_type, ldptr_args, 10, LLVMReadNoneAttribute); LLVMValueRef Tmp = LLVMBuildExtractElement(gallivm->builder, args[0], @@ -679,7 +679,7 @@ static void llvm_emit_tex( } } - emit_data->output[0] = build_intrinsic(gallivm->builder, + emit_data->output[0] = lp_build_intrinsic(gallivm->builder, action->intr_name, emit_data->dst_type, args, c, LLVMReadNoneAttribute); @@ -754,7 +754,131 @@ static struct lp_build_tgsi_action dot_action = { .intr_name = "llvm.AMDGPU.dp4" }; +static void txd_fetch_args( + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + const struct tgsi_full_instruction * inst = emit_data->inst; + + LLVMValueRef coords[4]; + unsigned chan, src; + for (src = 0; src < 3; src++) { + for (chan = 0; chan < 4; chan++) + coords[chan] = lp_build_emit_fetch(bld_base, inst, src, chan); + + emit_data->args[src] = lp_build_gather_values(bld_base->base.gallivm, + coords, 4); + } + emit_data->arg_count = 3; + emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4); +} + + +static void txp_fetch_args( + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + const struct tgsi_full_instruction * inst = 
emit_data->inst; + LLVMValueRef src_w; + unsigned chan; + LLVMValueRef coords[5]; + + emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4); + src_w = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W); + + for (chan = 0; chan < 3; chan++ ) { + LLVMValueRef arg = lp_build_emit_fetch(bld_base, + emit_data->inst, 0, chan); + coords[chan] = lp_build_emit_llvm_binary(bld_base, + TGSI_OPCODE_DIV, arg, src_w); + } + coords[3] = bld_base->base.one; + + if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE || + inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || + inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE || + inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) && + inst->Instruction.Opcode != TGSI_OPCODE_TXQ && + inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) { + radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords, NULL); + } + emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm, + coords, 4); + emit_data->arg_count = 1; +} + +static void tex_fetch_args( + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + const struct tgsi_full_instruction * inst = emit_data->inst; + + LLVMValueRef coords[5]; + unsigned chan; + for (chan = 0; chan < 4; chan++) { + coords[chan] = lp_build_emit_fetch(bld_base, inst, 0, chan); + } + + if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 || + inst->Instruction.Opcode == TGSI_OPCODE_TXB2 || + inst->Instruction.Opcode == TGSI_OPCODE_TXL2) { + /* These instructions have additional operand that should be packed + * into the cube coord vector by radeon_llvm_emit_prepare_cube_coords. + * That operand should be passed as a float value in the args array + * right after the coord vector. After packing it's not used anymore, + * that's why arg_count is not increased */ + coords[4] = lp_build_emit_fetch(bld_base, inst, 1, 0); + } + + if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE || + inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || + inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE || + inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) && + inst->Instruction.Opcode != TGSI_OPCODE_TXQ && + inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) { + radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords, NULL); + } + + emit_data->arg_count = 1; + emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm, + coords, 4); + emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4); +} + +static void txf_fetch_args( + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + const struct tgsi_full_instruction * inst = emit_data->inst; + struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base); + const struct tgsi_texture_offset * off = inst->TexOffsets; + LLVMTypeRef offset_type = bld_base->int_bld.elem_type; + + /* fetch tex coords */ + tex_fetch_args(bld_base, emit_data); + + /* fetch tex offsets */ + if (inst->Texture.NumOffsets) { + assert(inst->Texture.NumOffsets == 1); + + emit_data->args[1] = LLVMConstBitCast( + bld->immediates[off->Index][off->SwizzleX], + offset_type); + emit_data->args[2] = LLVMConstBitCast( + bld->immediates[off->Index][off->SwizzleY], + offset_type); + emit_data->args[3] = LLVMConstBitCast( + bld->immediates[off->Index][off->SwizzleZ], + offset_type); + } else { + emit_data->args[1] = bld_base->int_bld.zero; + emit_data->args[2] = bld_base->int_bld.zero; + emit_data->args[3] = bld_base->int_bld.zero; + } + + emit_data->arg_count = 4; +} LLVMModuleRef r600_tgsi_llvm( struct radeon_llvm_context * ctx, 
@@ -783,7 +907,6 @@ LLVMModuleRef r600_tgsi_llvm( bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = llvm_fetch_const; bld_base->emit_prologue = llvm_emit_prologue; bld_base->emit_epilogue = llvm_emit_epilogue; - ctx->userdata = ctx; ctx->load_input = llvm_load_input; ctx->load_system_value = llvm_load_system_value; @@ -791,18 +914,42 @@ LLVMModuleRef r600_tgsi_llvm( bld_base->op_actions[TGSI_OPCODE_DP3] = dot_action; bld_base->op_actions[TGSI_OPCODE_DP4] = dot_action; bld_base->op_actions[TGSI_OPCODE_DPH] = dot_action; + bld_base->op_actions[TGSI_OPCODE_DDX].intr_name = "llvm.AMDGPU.ddx"; + bld_base->op_actions[TGSI_OPCODE_DDX].fetch_args = tex_fetch_args; bld_base->op_actions[TGSI_OPCODE_DDX].emit = llvm_emit_tex; + bld_base->op_actions[TGSI_OPCODE_DDY].intr_name = "llvm.AMDGPU.ddy"; + bld_base->op_actions[TGSI_OPCODE_DDY].fetch_args = tex_fetch_args; bld_base->op_actions[TGSI_OPCODE_DDY].emit = llvm_emit_tex; + bld_base->op_actions[TGSI_OPCODE_TEX].fetch_args = tex_fetch_args; + bld_base->op_actions[TGSI_OPCODE_TEX].intr_name = "llvm.AMDGPU.tex"; bld_base->op_actions[TGSI_OPCODE_TEX].emit = llvm_emit_tex; + bld_base->op_actions[TGSI_OPCODE_TEX2].fetch_args = tex_fetch_args; + bld_base->op_actions[TGSI_OPCODE_TEX2].intr_name = "llvm.AMDGPU.tex"; bld_base->op_actions[TGSI_OPCODE_TEX2].emit = llvm_emit_tex; + bld_base->op_actions[TGSI_OPCODE_TXB].fetch_args = tex_fetch_args; + bld_base->op_actions[TGSI_OPCODE_TXB].intr_name = "llvm.AMDGPU.txb"; bld_base->op_actions[TGSI_OPCODE_TXB].emit = llvm_emit_tex; + bld_base->op_actions[TGSI_OPCODE_TXB2].fetch_args = tex_fetch_args; + bld_base->op_actions[TGSI_OPCODE_TXB2].intr_name = "llvm.AMDGPU.txb"; bld_base->op_actions[TGSI_OPCODE_TXB2].emit = llvm_emit_tex; + bld_base->op_actions[TGSI_OPCODE_TXD].fetch_args = txd_fetch_args; + bld_base->op_actions[TGSI_OPCODE_TXD].intr_name = "llvm.AMDGPU.txd"; bld_base->op_actions[TGSI_OPCODE_TXD].emit = llvm_emit_tex; + bld_base->op_actions[TGSI_OPCODE_TXF].fetch_args = txf_fetch_args; + bld_base->op_actions[TGSI_OPCODE_TXF].intr_name = "llvm.AMDGPU.txf"; + bld_base->op_actions[TGSI_OPCODE_TXF].emit = llvm_emit_tex; + bld_base->op_actions[TGSI_OPCODE_TXL].fetch_args = tex_fetch_args; + bld_base->op_actions[TGSI_OPCODE_TXL].intr_name = "llvm.AMDGPU.txl"; bld_base->op_actions[TGSI_OPCODE_TXL].emit = llvm_emit_tex; + bld_base->op_actions[TGSI_OPCODE_TXL2].fetch_args = tex_fetch_args; + bld_base->op_actions[TGSI_OPCODE_TXL2].intr_name = "llvm.AMDGPU.txl"; bld_base->op_actions[TGSI_OPCODE_TXL2].emit = llvm_emit_tex; - bld_base->op_actions[TGSI_OPCODE_TXF].emit = llvm_emit_tex; - bld_base->op_actions[TGSI_OPCODE_TXQ].emit = llvm_emit_tex; + bld_base->op_actions[TGSI_OPCODE_TXP].fetch_args = txp_fetch_args; + bld_base->op_actions[TGSI_OPCODE_TXP].intr_name = "llvm.AMDGPU.tex"; bld_base->op_actions[TGSI_OPCODE_TXP].emit = llvm_emit_tex; + bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = tex_fetch_args; + bld_base->op_actions[TGSI_OPCODE_TXQ].intr_name = "llvm.AMDGPU.txq"; + bld_base->op_actions[TGSI_OPCODE_TXQ].emit = llvm_emit_tex; bld_base->op_actions[TGSI_OPCODE_CMP].emit = emit_cndlt; lp_build_tgsi_llvm(bld_base, tokens); @@ -881,7 +1028,7 @@ unsigned r600_llvm_compile( const char * gpu_family = r600_get_llvm_processor_name(family); memset(&binary, 0, sizeof(struct radeon_shader_binary)); - r = radeon_llvm_compile(mod, &binary, gpu_family, dump, NULL); + r = radeon_llvm_compile(mod, &binary, gpu_family, dump, dump, NULL); r = r600_create_shader(bc, &binary, use_kill); diff --git 
a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c index e122b607b86..6ffe5615fbf 100644 --- a/src/gallium/drivers/r600/r600_pipe.c +++ b/src/gallium/drivers/r600/r600_pipe.c @@ -120,6 +120,7 @@ static struct pipe_context *r600_create_context(struct pipe_screen *screen, void rctx->b.b.screen = screen; rctx->b.b.priv = priv; rctx->b.b.destroy = r600_destroy_context; + rctx->b.set_atom_dirty = (void *)r600_set_atom_dirty; if (!r600_common_context_init(&rctx->b, &rscreen->b)) goto fail; @@ -176,7 +177,7 @@ static struct pipe_context *r600_create_context(struct pipe_screen *screen, void goto fail; } - rctx->b.rings.gfx.cs = ws->cs_create(ws, RING_GFX, + rctx->b.rings.gfx.cs = ws->cs_create(rctx->b.ctx, RING_GFX, r600_context_gfx_flush, rctx, rscreen->b.trace_bo ? rscreen->b.trace_bo->cs_buf : NULL); @@ -268,8 +269,14 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_SAMPLE_SHADING: case PIPE_CAP_CLIP_HALFZ: case PIPE_CAP_POLYGON_OFFSET_CLAMP: + case PIPE_CAP_CONDITIONAL_RENDER_INVERTED: + case PIPE_CAP_TEXTURE_FLOAT_LINEAR: + case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: return 1; + case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: + return rscreen->b.info.drm_major == 2 && rscreen->b.info.drm_minor >= 43; + case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: return !R600_BIG_ENDIAN && rscreen->b.info.has_userptr; @@ -329,10 +336,10 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_VERTEX_COLOR_CLAMPED: case PIPE_CAP_USER_VERTEX_BUFFERS: case PIPE_CAP_TEXTURE_GATHER_OFFSETS: - case PIPE_CAP_CONDITIONAL_RENDER_INVERTED: case PIPE_CAP_SAMPLER_VIEW_TARGET: case PIPE_CAP_VERTEXID_NOBASE: - case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: + case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: + case PIPE_CAP_DEPTH_BOUNDS_TEST: return 0; /* Stream output. */ diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h index 4ea270d3839..9b66105641a 100644 --- a/src/gallium/drivers/r600/r600_pipe.h +++ b/src/gallium/drivers/r600/r600_pipe.h @@ -36,7 +36,7 @@ #include "util/list.h" #include "util/u_transfer.h" -#define R600_NUM_ATOMS 73 +#define R600_NUM_ATOMS 75 #define R600_MAX_VIEWPORTS 16 @@ -85,6 +85,9 @@ #define R600_BIG_ENDIAN 0 #endif +#define R600_DIRTY_ATOM_WORD_BITS (sizeof(unsigned long) * 8) +#define R600_DIRTY_ATOM_ARRAY_LEN DIV_ROUND_UP(R600_NUM_ATOMS, R600_DIRTY_ATOM_WORD_BITS) + struct r600_context; struct r600_bytecode; struct r600_shader_key; @@ -426,6 +429,8 @@ struct r600_context { /* State binding slots are here. */ struct r600_atom *atoms[R600_NUM_ATOMS]; + /* Dirty atom bitmask for fast tests */ + unsigned long dirty_atoms[R600_DIRTY_ATOM_ARRAY_LEN]; /* States for CS initialization. */ struct r600_command_buffer start_cs_cmd; /* invariant state mostly */ /** Compute specific registers initializations. 
The start_cs_cmd atom @@ -490,37 +495,92 @@ struct r600_context { struct r600_isa *isa; }; -static INLINE void r600_emit_command_buffer(struct radeon_winsys_cs *cs, +static inline void r600_emit_command_buffer(struct radeon_winsys_cs *cs, struct r600_command_buffer *cb) { - assert(cs->cdw + cb->num_dw <= RADEON_MAX_CMDBUF_DWORDS); + assert(cs->cdw + cb->num_dw <= cs->max_dw); memcpy(cs->buf + cs->cdw, cb->buf, 4 * cb->num_dw); cs->cdw += cb->num_dw; } +static inline void r600_set_atom_dirty(struct r600_context *rctx, + struct r600_atom *atom, + bool dirty) +{ + unsigned long mask; + unsigned int w; + + atom->dirty = dirty; + + assert(atom->id != 0); + w = atom->id / R600_DIRTY_ATOM_WORD_BITS; + mask = 1ul << (atom->id % R600_DIRTY_ATOM_WORD_BITS); + if (dirty) + rctx->dirty_atoms[w] |= mask; + else + rctx->dirty_atoms[w] &= ~mask; +} + +static inline void r600_mark_atom_dirty(struct r600_context *rctx, + struct r600_atom *atom) +{ + r600_set_atom_dirty(rctx, atom, true); +} + +static inline unsigned int r600_next_dirty_atom(struct r600_context *rctx, + unsigned int id) +{ +#if !defined(DEBUG) && defined(HAVE___BUILTIN_CTZ) + unsigned int w = id / R600_DIRTY_ATOM_WORD_BITS; + unsigned int bit = id % R600_DIRTY_ATOM_WORD_BITS; + unsigned long bits, mask = (1ul << bit) - 1; + + for (; w < R600_DIRTY_ATOM_ARRAY_LEN; w++, mask = 0ul) { + bits = rctx->dirty_atoms[w] & ~mask; + if (bits == 0) + continue; + return w * R600_DIRTY_ATOM_WORD_BITS + __builtin_ctzl(bits); + } + + return R600_NUM_ATOMS; +#else + for (; id < R600_NUM_ATOMS; id++) { + bool dirty = !!(rctx->dirty_atoms[id / R600_DIRTY_ATOM_WORD_BITS] & + (1ul << (id % R600_DIRTY_ATOM_WORD_BITS))); + assert(dirty == (rctx->atoms[id] && rctx->atoms[id]->dirty)); + if (dirty) + break; + } + + return id; +#endif +} + void r600_trace_emit(struct r600_context *rctx); -static INLINE void r600_emit_atom(struct r600_context *rctx, struct r600_atom *atom) +static inline void r600_emit_atom(struct r600_context *rctx, struct r600_atom *atom) { atom->emit(&rctx->b, atom); - atom->dirty = false; + r600_set_atom_dirty(rctx, atom, false); if (rctx->screen->b.trace_bo) { r600_trace_emit(rctx); } } -static INLINE void r600_set_cso_state(struct r600_cso_state *state, void *cso) +static inline void r600_set_cso_state(struct r600_context *rctx, + struct r600_cso_state *state, void *cso) { state->cso = cso; - state->atom.dirty = cso != NULL; + r600_set_atom_dirty(rctx, &state->atom, cso != NULL); } -static INLINE void r600_set_cso_state_with_cb(struct r600_cso_state *state, void *cso, +static inline void r600_set_cso_state_with_cb(struct r600_context *rctx, + struct r600_cso_state *state, void *cso, struct r600_command_buffer *cb) { state->cb = cb; state->atom.num_dw = cb ? 
cb->num_dw : 0; - r600_set_cso_state(state, cso); + r600_set_cso_state(rctx, state, cso); } /* compute_memory_pool.c */ @@ -529,11 +589,6 @@ void compute_memory_pool_delete(struct compute_memory_pool* pool); struct compute_memory_pool* compute_memory_pool_new( struct r600_screen *rscreen); -/* evergreen_compute.c */ -void evergreen_set_cs_sampler_view(struct pipe_context *ctx_, - unsigned start_slot, unsigned count, - struct pipe_sampler_view **views); - /* evergreen_state.c */ struct pipe_sampler_view * evergreen_create_sampler_view_custom(struct pipe_context *ctx, @@ -656,6 +711,7 @@ void r600_emit_clip_misc_state(struct r600_context *rctx, struct r600_atom *atom void r600_emit_stencil_ref(struct r600_context *rctx, struct r600_atom *atom); void r600_emit_viewport_state(struct r600_context *rctx, struct r600_atom *atom); void r600_emit_shader(struct r600_context *rctx, struct r600_atom *a); +void r600_add_atom(struct r600_context *rctx, struct r600_atom *atom, unsigned id); void r600_init_atom(struct r600_context *rctx, struct r600_atom *atom, unsigned id, void (*emit)(struct r600_context *ctx, struct r600_atom *state), unsigned num_dw); @@ -719,19 +775,19 @@ struct pipe_video_buffer *r600_video_buffer_create(struct pipe_context *pipe, /*Evergreen Compute packet3*/ #define PKT3C(op, count, predicate) (PKT_TYPE_S(3) | PKT3_IT_OPCODE_S(op) | PKT_COUNT_S(count) | PKT3_PREDICATE(predicate) | RADEON_CP_PACKET3_COMPUTE_MODE) -static INLINE void r600_store_value(struct r600_command_buffer *cb, unsigned value) +static inline void r600_store_value(struct r600_command_buffer *cb, unsigned value) { cb->buf[cb->num_dw++] = value; } -static INLINE void r600_store_array(struct r600_command_buffer *cb, unsigned num, unsigned *ptr) +static inline void r600_store_array(struct r600_command_buffer *cb, unsigned num, unsigned *ptr) { assert(cb->num_dw+num <= cb->max_num_dw); memcpy(&cb->buf[cb->num_dw], ptr, num * sizeof(ptr[0])); cb->num_dw += num; } -static INLINE void r600_store_config_reg_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num) +static inline void r600_store_config_reg_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num) { assert(reg < R600_CONTEXT_REG_OFFSET); assert(cb->num_dw+2+num <= cb->max_num_dw); @@ -743,7 +799,7 @@ static INLINE void r600_store_config_reg_seq(struct r600_command_buffer *cb, uns * Needs cb->pkt_flags set to RADEON_CP_PACKET3_COMPUTE_MODE for compute * shaders. */ -static INLINE void r600_store_context_reg_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num) +static inline void r600_store_context_reg_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num) { assert(reg >= R600_CONTEXT_REG_OFFSET && reg < R600_CTL_CONST_OFFSET); assert(cb->num_dw+2+num <= cb->max_num_dw); @@ -755,7 +811,7 @@ static INLINE void r600_store_context_reg_seq(struct r600_command_buffer *cb, un * Needs cb->pkt_flags set to RADEON_CP_PACKET3_COMPUTE_MODE for compute * shaders. 
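
The dirty-atom helpers above pair the existing per-atom flag with a word-array bitmask, so the draw path can jump straight to the next dirty atom with a count-trailing-zeros instruction instead of scanning all R600_NUM_ATOMS slots. A minimal standalone sketch of the same walk, assuming a GCC/Clang-style __builtin_ctzl and using stand-in names (NUM_ATOMS, dirty[], next_dirty) rather than the driver's:

#define NUM_ATOMS 75
#define WORD_BITS (sizeof(unsigned long) * 8)
#define NUM_WORDS ((NUM_ATOMS + WORD_BITS - 1) / WORD_BITS)

static unsigned long dirty[NUM_WORDS];      /* one bit per atom id */

static unsigned next_dirty(unsigned id)     /* first dirty atom >= id */
{
   unsigned w = id / WORD_BITS;
   unsigned long bits, mask = (1ul << (id % WORD_BITS)) - 1;

   for (; w < NUM_WORDS; w++, mask = 0ul) {
      bits = dirty[w] & ~mask;              /* drop bits below id */
      if (bits)
         return w * WORD_BITS + __builtin_ctzl(bits);
   }
   return NUM_ATOMS;                        /* nothing left */
}

The debug build keeps the slow linear scan and asserts that the bitmask agrees with each atom->dirty flag, which is why r600_set_atom_dirty still writes both.
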
*/ -static INLINE void r600_store_ctl_const_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num) +static inline void r600_store_ctl_const_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num) { assert(reg >= R600_CTL_CONST_OFFSET); assert(cb->num_dw+2+num <= cb->max_num_dw); @@ -763,7 +819,7 @@ static INLINE void r600_store_ctl_const_seq(struct r600_command_buffer *cb, unsi cb->buf[cb->num_dw++] = (reg - R600_CTL_CONST_OFFSET) >> 2; } -static INLINE void r600_store_loop_const_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num) +static inline void r600_store_loop_const_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num) { assert(reg >= R600_LOOP_CONST_OFFSET); assert(cb->num_dw+2+num <= cb->max_num_dw); @@ -775,7 +831,7 @@ static INLINE void r600_store_loop_const_seq(struct r600_command_buffer *cb, uns * Needs cb->pkt_flags set to RADEON_CP_PACKET3_COMPUTE_MODE for compute * shaders. */ -static INLINE void eg_store_loop_const_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num) +static inline void eg_store_loop_const_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num) { assert(reg >= EG_LOOP_CONST_OFFSET); assert(cb->num_dw+2+num <= cb->max_num_dw); @@ -783,31 +839,31 @@ static INLINE void eg_store_loop_const_seq(struct r600_command_buffer *cb, unsig cb->buf[cb->num_dw++] = (reg - EG_LOOP_CONST_OFFSET) >> 2; } -static INLINE void r600_store_config_reg(struct r600_command_buffer *cb, unsigned reg, unsigned value) +static inline void r600_store_config_reg(struct r600_command_buffer *cb, unsigned reg, unsigned value) { r600_store_config_reg_seq(cb, reg, 1); r600_store_value(cb, value); } -static INLINE void r600_store_context_reg(struct r600_command_buffer *cb, unsigned reg, unsigned value) +static inline void r600_store_context_reg(struct r600_command_buffer *cb, unsigned reg, unsigned value) { r600_store_context_reg_seq(cb, reg, 1); r600_store_value(cb, value); } -static INLINE void r600_store_ctl_const(struct r600_command_buffer *cb, unsigned reg, unsigned value) +static inline void r600_store_ctl_const(struct r600_command_buffer *cb, unsigned reg, unsigned value) { r600_store_ctl_const_seq(cb, reg, 1); r600_store_value(cb, value); } -static INLINE void r600_store_loop_const(struct r600_command_buffer *cb, unsigned reg, unsigned value) +static inline void r600_store_loop_const(struct r600_command_buffer *cb, unsigned reg, unsigned value) { r600_store_loop_const_seq(cb, reg, 1); r600_store_value(cb, value); } -static INLINE void eg_store_loop_const(struct r600_command_buffer *cb, unsigned reg, unsigned value) +static inline void eg_store_loop_const(struct r600_command_buffer *cb, unsigned reg, unsigned value) { eg_store_loop_const_seq(cb, reg, 1); r600_store_value(cb, value); @@ -816,28 +872,28 @@ static INLINE void eg_store_loop_const(struct r600_command_buffer *cb, unsigned void r600_init_command_buffer(struct r600_command_buffer *cb, unsigned num_dw); void r600_release_command_buffer(struct r600_command_buffer *cb); -static INLINE void r600_write_compute_context_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num) +static inline void r600_write_compute_context_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num) { r600_write_context_reg_seq(cs, reg, num); /* Set the compute bit on the packet header */ cs->buf[cs->cdw - 2] |= RADEON_CP_PACKET3_COMPUTE_MODE; } -static INLINE void r600_write_ctl_const_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num) +static inline void 
r600_write_ctl_const_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num) { assert(reg >= R600_CTL_CONST_OFFSET); - assert(cs->cdw+2+num <= RADEON_MAX_CMDBUF_DWORDS); + assert(cs->cdw+2+num <= cs->max_dw); cs->buf[cs->cdw++] = PKT3(PKT3_SET_CTL_CONST, num, 0); cs->buf[cs->cdw++] = (reg - R600_CTL_CONST_OFFSET) >> 2; } -static INLINE void r600_write_compute_context_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value) +static inline void r600_write_compute_context_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value) { r600_write_compute_context_reg_seq(cs, reg, 1); radeon_emit(cs, value); } -static INLINE void r600_write_context_reg_flag(struct radeon_winsys_cs *cs, unsigned reg, unsigned value, unsigned flag) +static inline void r600_write_context_reg_flag(struct radeon_winsys_cs *cs, unsigned reg, unsigned value, unsigned flag) { if (flag & RADEON_CP_PACKET3_COMPUTE_MODE) { r600_write_compute_context_reg(cs, reg, value); @@ -846,7 +902,7 @@ static INLINE void r600_write_context_reg_flag(struct radeon_winsys_cs *cs, unsi } } -static INLINE void r600_write_ctl_const(struct radeon_winsys_cs *cs, unsigned reg, unsigned value) +static inline void r600_write_ctl_const(struct radeon_winsys_cs *cs, unsigned reg, unsigned value) { r600_write_ctl_const_seq(cs, reg, 1); radeon_emit(cs, value); @@ -855,21 +911,21 @@ static INLINE void r600_write_ctl_const(struct radeon_winsys_cs *cs, unsigned re /* * common helpers */ -static INLINE uint32_t S_FIXED(float value, uint32_t frac_bits) +static inline uint32_t S_FIXED(float value, uint32_t frac_bits) { return value * (1 << frac_bits); } #define ALIGN_DIVUP(x, y) (((x) + (y) - 1) / (y)) /* 12.4 fixed-point */ -static INLINE unsigned r600_pack_float_12p4(float x) +static inline unsigned r600_pack_float_12p4(float x) { return x <= 0 ? 0 : x >= 4096 ? 0xffff : x * 16; } /* Return if the depth format can be read without the DB->CB copy on r6xx-r7xx. 
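
S_FIXED() and r600_pack_float_12p4() above are the float-to-fixed-point conversions used when filling registers; a few worked values make the encodings concrete:

/* S_FIXED(value, frac_bits) == value * (1 << frac_bits):
 *   S_FIXED(1.5f, 8)  -> 384  (1.5 * 256, i.e. an 8-bit fraction)
 *   S_FIXED(0.25f, 4) -> 4    (0.25 * 16, i.e. a 4-bit fraction)
 *
 * r600_pack_float_12p4() is the clamped 12.4 case:
 *   r600_pack_float_12p4(2.5f)    -> 40      (2.5 * 16)
 *   r600_pack_float_12p4(-1.0f)   -> 0       (clamped low)
 *   r600_pack_float_12p4(5000.0f) -> 0xffff  (clamped at 4096)
 */
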
*/ -static INLINE bool r600_can_read_depth(struct r600_texture *rtex) +static inline bool r600_can_read_depth(struct r600_texture *rtex) { return rtex->resource.b.b.nr_samples <= 1 && (rtex->resource.b.b.format == PIPE_FORMAT_Z16_UNORM || @@ -880,7 +936,7 @@ static INLINE bool r600_can_read_depth(struct r600_texture *rtex) #define V_028A6C_OUTPRIM_TYPE_LINESTRIP 1 #define V_028A6C_OUTPRIM_TYPE_TRISTRIP 2 -static INLINE unsigned r600_conv_prim_to_gs_out(unsigned mode) +static inline unsigned r600_conv_prim_to_gs_out(unsigned mode) { static const int prim_conv[] = { V_028A6C_OUTPRIM_TYPE_POINTLIST, diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index af7622e9b34..8d1f95abddc 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -310,6 +310,7 @@ struct r600_shader_ctx { int gs_next_vertex; struct r600_shader *gs_for_vs; int gs_export_gpr_treg; + unsigned enabled_stream_buffers_mask; }; struct r600_shader_tgsi_instruction { @@ -1402,6 +1403,9 @@ static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output * with MEM_STREAM instructions */ output.array_size = 0xFFF; output.comp_mask = ((1 << so->output[i].num_components) - 1) << so->output[i].start_component; + + ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer); + if (ctx->bc->chip_class >= EVERGREEN) { switch (so->output[i].output_buffer) { case 0: @@ -1718,6 +1722,8 @@ static int generate_gs_copy_shader(struct r600_context *rctx, gs->gs_copy_shader = cshader; ctx.bc->nstack = 1; + + cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask; cshader->shader.ring_item_size = ocnt * 16; return r600_bytecode_build(ctx.bc); @@ -1931,15 +1937,14 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL; ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] + ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1; + ctx.bc->index_reg[0] = ctx.bc->ar_reg + 1; + ctx.bc->index_reg[1] = ctx.bc->ar_reg + 2; + if (ctx.type == TGSI_PROCESSOR_GEOMETRY) { - ctx.gs_export_gpr_treg = ctx.bc->ar_reg + 1; - ctx.temp_reg = ctx.bc->ar_reg + 2; - ctx.bc->index_reg[0] = ctx.bc->ar_reg + 3; - ctx.bc->index_reg[1] = ctx.bc->ar_reg + 4; + ctx.gs_export_gpr_treg = ctx.bc->ar_reg + 3; + ctx.temp_reg = ctx.bc->ar_reg + 4; } else { - ctx.temp_reg = ctx.bc->ar_reg + 1; - ctx.bc->index_reg[0] = ctx.bc->ar_reg + 2; - ctx.bc->index_reg[1] = ctx.bc->ar_reg + 3; + ctx.temp_reg = ctx.bc->ar_reg + 3; } shader->max_arrays = 0; @@ -2086,7 +2091,6 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, radeon_llvm_ctx.chip_class = ctx.bc->chip_class; radeon_llvm_ctx.fs_color_all = shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN); radeon_llvm_ctx.stream_outputs = &so; - radeon_llvm_ctx.clip_vertex = ctx.cv_output; radeon_llvm_ctx.alpha_to_one = key.alpha_to_one; radeon_llvm_ctx.has_compressed_msaa_texturing = ctx.bc->has_compressed_msaa_texturing; @@ -2262,6 +2266,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, so.num_outputs && !use_llvm) emit_streamout(&ctx, &so); + pipeshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask; convert_edgeflag_to_int(&ctx); if (ring_outputs) { @@ -2485,6 +2490,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, output[j].array_base = 0; output[j].op = CF_OP_EXPORT; j++; + shader->nr_ps_color_exports++; } noutput = j; diff --git a/src/gallium/drivers/r600/r600_shader.h 
b/src/gallium/drivers/r600/r600_shader.h index dd359d7e959..5d05c8153d7 100644 --- a/src/gallium/drivers/r600/r600_shader.h +++ b/src/gallium/drivers/r600/r600_shader.h @@ -125,6 +125,7 @@ struct r600_pipe_shader { struct r600_shader_key key; unsigned db_shader_control; unsigned ps_depth_export; + unsigned enabled_stream_buffers_mask; }; /* return the table index 0-5 for TGSI_INTERPOLATE_LINEAR/PERSPECTIVE and diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c index 960dfcedfef..5cc2283792d 100644 --- a/src/gallium/drivers/r600/r600_state.c +++ b/src/gallium/drivers/r600/r600_state.c @@ -473,7 +473,7 @@ static void *r600_create_rs_state(struct pipe_context *ctx, /* offset */ rs->offset_units = state->offset_units; - rs->offset_scale = state->offset_scale * 12.0f; + rs->offset_scale = state->offset_scale * 16.0f; rs->offset_enable = state->offset_point || state->offset_line || state->offset_tri; if (state->point_size_per_vertex) { @@ -802,7 +802,7 @@ static void r600_set_scissor_states(struct pipe_context *ctx, return; for (i = start_slot ; i < start_slot + num_scissors; i++) { - rctx->scissor[i].atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->scissor[i].atom); } } @@ -1193,7 +1193,7 @@ static void r600_set_framebuffer_state(struct pipe_context *ctx, if (rctx->alphatest_state.bypass != alphatest_bypass) { rctx->alphatest_state.bypass = alphatest_bypass; - rctx->alphatest_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->alphatest_state.atom); } } @@ -1209,28 +1209,28 @@ static void r600_set_framebuffer_state(struct pipe_context *ctx, if (state->zsbuf->format != rctx->poly_offset_state.zs_format) { rctx->poly_offset_state.zs_format = state->zsbuf->format; - rctx->poly_offset_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->poly_offset_state.atom); } if (rctx->db_state.rsurf != surf) { rctx->db_state.rsurf = surf; - rctx->db_state.atom.dirty = true; - rctx->db_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->db_state.atom); + r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom); } } else if (rctx->db_state.rsurf) { rctx->db_state.rsurf = NULL; - rctx->db_state.atom.dirty = true; - rctx->db_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->db_state.atom); + r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom); } if (rctx->cb_misc_state.nr_cbufs != state->nr_cbufs) { rctx->cb_misc_state.nr_cbufs = state->nr_cbufs; - rctx->cb_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->cb_misc_state.atom); } if (state->nr_cbufs == 0 && rctx->alphatest_state.bypass) { rctx->alphatest_state.bypass = false; - rctx->alphatest_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->alphatest_state.atom); } /* Calculate the CS size. 
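
The enabled_stream_buffers_mask added here and in r600_shader.c records, one bit per buffer, which of the four streamout buffers a shader actually writes; emit_streamout() sets bit so->output[i].output_buffer for every declared output. A sketch of that accumulation (pipe_stream_output_info is the regular Gallium struct; how the mask is consumed lives in streamout code outside these hunks, so that part is an assumption):

#include "pipe/p_state.h"

/* mirrors the accumulation in emit_streamout() */
static unsigned written_buffers(const struct pipe_stream_output_info *so)
{
   unsigned mask = 0, i;

   for (i = 0; i < so->num_outputs; i++)
      mask |= 1u << so->output[i].output_buffer;  /* buffers 0-3 */

   return mask;
}

Presumably hw_enabled_mask, the new VGT_STRMOUT_BUFFER_CONFIG state in r600_streamout, ends up as this mask combined with the buffers that are actually bound.
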
*/ @@ -1250,7 +1250,7 @@ static void r600_set_framebuffer_state(struct pipe_context *ctx, rctx->framebuffer.atom.num_dw += 2; } - rctx->framebuffer.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->framebuffer.atom); r600_set_sample_locations_constant_buffer(rctx); } @@ -1541,9 +1541,9 @@ static void r600_set_min_samples(struct pipe_context *ctx, unsigned min_samples) rctx->ps_iter_samples = min_samples; if (rctx->framebuffer.nr_samples > 1) { - rctx->rasterizer_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->rasterizer_state.atom); if (rctx->b.chip_class == R600) - rctx->db_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom); } } @@ -2089,7 +2089,7 @@ bool r600_adjust_gprs(struct r600_context *rctx) if (rctx->config_state.sq_gpr_resource_mgmt_1 != tmp || rctx->config_state.sq_gpr_resource_mgmt_2 != tmp2) { rctx->config_state.sq_gpr_resource_mgmt_1 = tmp; rctx->config_state.sq_gpr_resource_mgmt_2 = tmp2; - rctx->config_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->config_state.atom); rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE; } return true; @@ -2796,11 +2796,11 @@ void r600_update_db_shader_control(struct r600_context * rctx) if (db_shader_control != rctx->db_misc_state.db_shader_control) { rctx->db_misc_state.db_shader_control = db_shader_control; - rctx->db_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom); } } -static INLINE unsigned r600_array_mode(unsigned mode) +static inline unsigned r600_array_mode(unsigned mode) { switch (mode) { case RADEON_SURF_MODE_LINEAR_ALIGNED: return V_0280A0_ARRAY_LINEAR_ALIGNED; @@ -3074,8 +3074,8 @@ void r600_init_state_functions(struct r600_context *rctx) r600_init_atom(rctx, &rctx->config_state.atom, id++, r600_emit_config_state, 3); r600_init_atom(rctx, &rctx->stencil_ref.atom, id++, r600_emit_stencil_ref, 4); r600_init_atom(rctx, &rctx->vertex_fetch_shader.atom, id++, r600_emit_vertex_fetch_shader, 5); - rctx->atoms[id++] = &rctx->b.streamout.begin_atom; - rctx->atoms[id++] = &rctx->b.streamout.enable_atom; + r600_add_atom(rctx, &rctx->b.streamout.begin_atom, id++); + r600_add_atom(rctx, &rctx->b.streamout.enable_atom, id++); r600_init_atom(rctx, &rctx->vertex_shader.atom, id++, r600_emit_shader, 23); r600_init_atom(rctx, &rctx->pixel_shader.atom, id++, r600_emit_shader, 0); r600_init_atom(rctx, &rctx->geometry_shader.atom, id++, r600_emit_shader, 0); diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c index 13dc9ee8c10..aa4a8d0240f 100644 --- a/src/gallium/drivers/r600/r600_state_common.c +++ b/src/gallium/drivers/r600/r600_state_common.c @@ -47,18 +47,26 @@ void r600_release_command_buffer(struct r600_command_buffer *cb) FREE(cb->buf); } +void r600_add_atom(struct r600_context *rctx, + struct r600_atom *atom, + unsigned id) +{ + assert(id < R600_NUM_ATOMS); + assert(rctx->atoms[id] == NULL); + rctx->atoms[id] = atom; + atom->id = id; + atom->dirty = false; +} + void r600_init_atom(struct r600_context *rctx, struct r600_atom *atom, unsigned id, void (*emit)(struct r600_context *ctx, struct r600_atom *state), unsigned num_dw) { - assert(id < R600_NUM_ATOMS); - assert(rctx->atoms[id] == NULL); - rctx->atoms[id] = atom; atom->emit = (void*)emit; atom->num_dw = num_dw; - atom->dirty = false; + r600_add_atom(rctx, atom, id); } void r600_emit_cso_state(struct r600_context *rctx, struct r600_atom *atom) @@ -127,11 +135,11 @@ static void r600_bind_blend_state_internal(struct r600_context *rctx, 
rctx->dual_src_blend = blend->dual_src_blend; if (!blend_disable) { - r600_set_cso_state_with_cb(&rctx->blend_state, blend, &blend->buffer); + r600_set_cso_state_with_cb(rctx, &rctx->blend_state, blend, &blend->buffer); color_control = blend->cb_color_control; } else { /* Blending is disabled. */ - r600_set_cso_state_with_cb(&rctx->blend_state, blend, &blend->buffer_no_blend); + r600_set_cso_state_with_cb(rctx, &rctx->blend_state, blend, &blend->buffer_no_blend); color_control = blend->cb_color_control_no_blend; } @@ -150,7 +158,7 @@ static void r600_bind_blend_state_internal(struct r600_context *rctx, update_cb = true; } if (update_cb) { - rctx->cb_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->cb_misc_state.atom); } } @@ -160,7 +168,7 @@ static void r600_bind_blend_state(struct pipe_context *ctx, void *state) struct r600_blend_state *blend = (struct r600_blend_state *)state; if (blend == NULL) { - r600_set_cso_state_with_cb(&rctx->blend_state, NULL, NULL); + r600_set_cso_state_with_cb(rctx, &rctx->blend_state, NULL, NULL); return; } @@ -173,7 +181,7 @@ static void r600_set_blend_color(struct pipe_context *ctx, struct r600_context *rctx = (struct r600_context *)ctx; rctx->blend_color.state = *state; - rctx->blend_color.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->blend_color.atom); } void r600_emit_blend_color(struct r600_context *rctx, struct r600_atom *atom) @@ -210,7 +218,7 @@ static void r600_set_clip_state(struct pipe_context *ctx, struct pipe_constant_buffer cb; rctx->clip_state.state = *state; - rctx->clip_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->clip_state.atom); cb.buffer = NULL; cb.user_buffer = state->ucp; @@ -226,7 +234,7 @@ static void r600_set_stencil_ref(struct pipe_context *ctx, struct r600_context *rctx = (struct r600_context *)ctx; rctx->stencil_ref.state = *state; - rctx->stencil_ref.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->stencil_ref.atom); } void r600_emit_stencil_ref(struct r600_context *rctx, struct r600_atom *atom) @@ -274,11 +282,11 @@ static void r600_bind_dsa_state(struct pipe_context *ctx, void *state) struct r600_stencil_ref ref; if (state == NULL) { - r600_set_cso_state_with_cb(&rctx->dsa_state, NULL, NULL); + r600_set_cso_state_with_cb(rctx, &rctx->dsa_state, NULL, NULL); return; } - r600_set_cso_state_with_cb(&rctx->dsa_state, dsa, &dsa->buffer); + r600_set_cso_state_with_cb(rctx, &rctx->dsa_state, dsa, &dsa->buffer); ref.ref_value[0] = rctx->stencil_ref.pipe_state.ref_value[0]; ref.ref_value[1] = rctx->stencil_ref.pipe_state.ref_value[1]; @@ -293,7 +301,7 @@ static void r600_bind_dsa_state(struct pipe_context *ctx, void *state) * we are having lockup on evergreen so do not enable * hyperz when not writing zbuffer */ - rctx->db_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom); } } @@ -304,7 +312,7 @@ static void r600_bind_dsa_state(struct pipe_context *ctx, void *state) rctx->alphatest_state.sx_alpha_ref != dsa->alpha_ref) { rctx->alphatest_state.sx_alpha_test_control = dsa->sx_alpha_test_control; rctx->alphatest_state.sx_alpha_ref = dsa->alpha_ref; - rctx->alphatest_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->alphatest_state.atom); } } @@ -318,14 +326,14 @@ static void r600_bind_rs_state(struct pipe_context *ctx, void *state) rctx->rasterizer = rs; - r600_set_cso_state_with_cb(&rctx->rasterizer_state, rs, &rs->buffer); + r600_set_cso_state_with_cb(rctx, &rctx->rasterizer_state, rs, &rs->buffer); if (rs->offset_enable && (rs->offset_units != 
rctx->poly_offset_state.offset_units || rs->offset_scale != rctx->poly_offset_state.offset_scale)) { rctx->poly_offset_state.offset_units = rs->offset_units; rctx->poly_offset_state.offset_scale = rs->offset_scale; - rctx->poly_offset_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->poly_offset_state.atom); } /* Update clip_misc_state. */ @@ -333,14 +341,14 @@ static void r600_bind_rs_state(struct pipe_context *ctx, void *state) rctx->clip_misc_state.clip_plane_enable != rs->clip_plane_enable) { rctx->clip_misc_state.pa_cl_clip_cntl = rs->pa_cl_clip_cntl; rctx->clip_misc_state.clip_plane_enable = rs->clip_plane_enable; - rctx->clip_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->clip_misc_state.atom); } /* Workaround for a missing scissor enable on r600. */ if (rctx->b.chip_class == R600 && rs->scissor_enable != rctx->scissor[0].enable) { rctx->scissor[0].enable = rs->scissor_enable; - rctx->scissor[0].atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->scissor[0].atom); } /* Re-emit PA_SC_LINE_STIPPLE. */ @@ -378,7 +386,7 @@ void r600_sampler_states_dirty(struct r600_context *rctx, state->atom.num_dw = util_bitcount(state->dirty_mask & state->has_bordercolor_mask) * 11 + util_bitcount(state->dirty_mask & ~state->has_bordercolor_mask) * 5; - state->atom.dirty = true; + r600_mark_atom_dirty(rctx, &state->atom); } } @@ -399,9 +407,9 @@ static void r600_bind_sampler_states(struct pipe_context *pipe, assert(start == 0); /* XXX fix below */ - if (shader != PIPE_SHADER_VERTEX && - shader != PIPE_SHADER_FRAGMENT) { - return; + if (!states) { + disable_mask = ~0u; + count = 0; } for (i = 0; i < count; i++) { @@ -443,7 +451,7 @@ static void r600_bind_sampler_states(struct pipe_context *pipe, /* change in TA_CNTL_AUX need a pipeline flush */ rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE; rctx->seamless_cube_map.enabled = seamless_cube_map; - rctx->seamless_cube_map.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->seamless_cube_map.atom); } } @@ -483,7 +491,7 @@ static void r600_bind_vertex_elements(struct pipe_context *ctx, void *state) { struct r600_context *rctx = (struct r600_context *)ctx; - r600_set_cso_state(&rctx->vertex_fetch_shader, state); + r600_set_cso_state(rctx, &rctx->vertex_fetch_shader, state); } static void r600_delete_vertex_elements(struct pipe_context *ctx, void *state) @@ -513,7 +521,7 @@ void r600_vertex_buffers_dirty(struct r600_context *rctx) rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE; rctx->vertex_buffer_state.atom.num_dw = (rctx->b.chip_class >= EVERGREEN ? 12 : 11) * util_bitcount(rctx->vertex_buffer_state.dirty_mask); - rctx->vertex_buffer_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->vertex_buffer_state.atom); } } @@ -570,7 +578,7 @@ void r600_sampler_views_dirty(struct r600_context *rctx, rctx->b.flags |= R600_CONTEXT_INV_TEX_CACHE; state->atom.num_dw = (rctx->b.chip_class >= EVERGREEN ? 
14 : 13) * util_bitcount(state->dirty_mask); - state->atom.dirty = true; + r600_mark_atom_dirty(rctx, &state->atom); } } @@ -593,9 +601,9 @@ static void r600_set_sampler_views(struct pipe_context *pipe, unsigned shader, assert(start == 0); /* XXX fix below */ - if (shader == PIPE_SHADER_COMPUTE) { - evergreen_set_cs_sampler_view(pipe, start, count, views); - return; + if (!views) { + disable_mask = ~0u; + count = 0; } remaining_mask = dst->views.enabled_mask & disable_mask; @@ -673,7 +681,7 @@ static void r600_set_viewport_states(struct pipe_context *ctx, for (i = start_slot; i < start_slot + num_viewports; i++) { rctx->viewport[i].state = state[i - start_slot]; - rctx->viewport[i].atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->viewport[i].atom); } } @@ -694,7 +702,7 @@ void r600_emit_viewport_state(struct r600_context *rctx, struct r600_atom *atom) } /* Compute the key for the hw shader variant */ -static INLINE struct r600_shader_key r600_shader_selector_key(struct pipe_context * ctx, +static inline struct r600_shader_key r600_shader_selector_key(struct pipe_context * ctx, struct r600_pipe_shader_selector * sel) { struct r600_context *rctx = (struct r600_context *)ctx; @@ -913,7 +921,7 @@ void r600_constant_buffers_dirty(struct r600_context *rctx, struct r600_constbuf rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE; state->atom.num_dw = rctx->b.chip_class >= EVERGREEN ? util_bitcount(state->dirty_mask)*20 : util_bitcount(state->dirty_mask)*19; - state->atom.dirty = true; + r600_mark_atom_dirty(rctx, &state->atom); } } @@ -982,7 +990,7 @@ static void r600_set_sample_mask(struct pipe_context *pipe, unsigned sample_mask return; rctx->sample_mask.sample_mask = sample_mask; - rctx->sample_mask.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->sample_mask.atom); } /* @@ -1107,27 +1115,28 @@ static void update_shader_atom(struct pipe_context *ctx, struct r600_shader_state *state, struct r600_pipe_shader *shader) { + struct r600_context *rctx = (struct r600_context *)ctx; + state->shader = shader; if (shader) { state->atom.num_dw = shader->command_buffer.num_dw; - state->atom.dirty = true; r600_context_add_resource_size(ctx, (struct pipe_resource *)shader->bo); } else { state->atom.num_dw = 0; - state->atom.dirty = false; } + r600_mark_atom_dirty(rctx, &state->atom); } static void update_gs_block_state(struct r600_context *rctx, unsigned enable) { if (rctx->shader_stages.geom_enable != enable) { rctx->shader_stages.geom_enable = enable; - rctx->shader_stages.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->shader_stages.atom); } if (rctx->gs_rings.enable != enable) { rctx->gs_rings.enable = enable; - rctx->gs_rings.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->gs_rings.atom); if (enable && !rctx->gs_rings.esgs_ring.buffer) { unsigned size = 0x1C000; @@ -1192,7 +1201,7 @@ static bool r600_update_derived_state(struct r600_context *rctx) if (!rctx->shader_stages.geom_enable) { rctx->shader_stages.geom_enable = true; - rctx->shader_stages.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->shader_stages.atom); } /* gs_shader provides GS and VS (copy shader) */ @@ -1206,8 +1215,9 @@ static bool r600_update_derived_state(struct r600_context *rctx) rctx->clip_misc_state.pa_cl_vs_out_cntl = rctx->gs_shader->current->gs_copy_shader->pa_cl_vs_out_cntl; rctx->clip_misc_state.clip_dist_write = rctx->gs_shader->current->gs_copy_shader->shader.clip_dist_write; rctx->clip_misc_state.clip_disable = rctx->gs_shader->current->shader.vs_position_window_space; - 
rctx->clip_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->clip_misc_state.atom); } + rctx->b.streamout.enabled_stream_buffers_mask = rctx->gs_shader->current->gs_copy_shader->enabled_stream_buffers_mask; } r600_shader_select(ctx, rctx->vs_shader, &vs_dirty); @@ -1223,7 +1233,7 @@ static bool r600_update_derived_state(struct r600_context *rctx) update_shader_atom(ctx, &rctx->geometry_shader, NULL); update_shader_atom(ctx, &rctx->export_shader, NULL); rctx->shader_stages.geom_enable = false; - rctx->shader_stages.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->shader_stages.atom); } r600_shader_select(ctx, rctx->vs_shader, &vs_dirty); @@ -1240,8 +1250,9 @@ static bool r600_update_derived_state(struct r600_context *rctx) rctx->clip_misc_state.pa_cl_vs_out_cntl = rctx->vs_shader->current->pa_cl_vs_out_cntl; rctx->clip_misc_state.clip_dist_write = rctx->vs_shader->current->shader.clip_dist_write; rctx->clip_misc_state.clip_disable = rctx->vs_shader->current->shader.vs_position_window_space; - rctx->clip_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->clip_misc_state.atom); } + rctx->b.streamout.enabled_stream_buffers_mask = rctx->vs_shader->current->enabled_stream_buffers_mask; } } @@ -1252,7 +1263,7 @@ static bool r600_update_derived_state(struct r600_context *rctx) if (rctx->cb_misc_state.nr_ps_color_outputs != rctx->ps_shader->current->nr_ps_color_outputs) { rctx->cb_misc_state.nr_ps_color_outputs = rctx->ps_shader->current->nr_ps_color_outputs; - rctx->cb_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->cb_misc_state.atom); } if (rctx->b.chip_class <= R700) { @@ -1260,7 +1271,7 @@ static bool r600_update_derived_state(struct r600_context *rctx) if (rctx->cb_misc_state.multiwrite != multiwrite) { rctx->cb_misc_state.multiwrite = multiwrite; - rctx->cb_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->cb_misc_state.atom); } } @@ -1274,7 +1285,7 @@ static bool r600_update_derived_state(struct r600_context *rctx) r600_update_ps_state(ctx, rctx->ps_shader->current); } - rctx->shader_stages.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->shader_stages.atom); update_shader_atom(ctx, &rctx->pixel_shader, rctx->ps_shader->current); } @@ -1409,7 +1420,6 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info data += info.indirect_offset / sizeof(unsigned); start = data[2] * ib.index_size; count = data[0]; - rctx->b.ws->buffer_unmap(indirect_resource->cs_buf); } else { start = 0; @@ -1454,24 +1464,23 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info rctx->vgt_state.vgt_multi_prim_ib_reset_en = info.primitive_restart; rctx->vgt_state.vgt_multi_prim_ib_reset_indx = info.restart_index; rctx->vgt_state.vgt_indx_offset = info.index_bias; - rctx->vgt_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->vgt_state.atom); } /* Workaround for hardware deadlock on certain R600 ASICs: write into a CB register. */ if (rctx->b.chip_class == R600) { rctx->b.flags |= R600_CONTEXT_PS_PARTIAL_FLUSH; - rctx->cb_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->cb_misc_state.atom); } /* Emit states. */ r600_need_cs_space(rctx, ib.user_buffer ? 
5 : 0, TRUE); r600_flush_emit(rctx); - for (i = 0; i < R600_NUM_ATOMS; i++) { - if (rctx->atoms[i] == NULL || !rctx->atoms[i]->dirty) { - continue; - } + i = r600_next_dirty_atom(rctx, 0); + while (i < R600_NUM_ATOMS) { r600_emit_atom(rctx, rctx->atoms[i]); + i = r600_next_dirty_atom(rctx, i + 1); } if (rctx->b.chip_class == CAYMAN) { @@ -2490,7 +2499,7 @@ static void r600_set_occlusion_query_state(struct pipe_context *ctx, bool enable if (rctx->db_misc_state.occlusion_query_enabled != enable) { rctx->db_misc_state.occlusion_query_enabled = enable; - rctx->db_misc_state.atom.dirty = true; + r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom); } } diff --git a/src/gallium/drivers/r600/sb/sb_sched.cpp b/src/gallium/drivers/r600/sb/sb_sched.cpp index 2e38a62c05a..62680788c5e 100644 --- a/src/gallium/drivers/r600/sb/sb_sched.cpp +++ b/src/gallium/drivers/r600/sb/sb_sched.cpp @@ -489,7 +489,7 @@ bool alu_group_tracker::try_reserve(alu_node* n) { n->bc.bank_swizzle = 0; - if (!trans & fbs) + if (!trans && fbs) n->bc.bank_swizzle = VEC_210; if (gpr.try_reserve(n)) { diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c index fc5f6c29870..cb9809f2449 100644 --- a/src/gallium/drivers/radeon/r600_buffer_common.c +++ b/src/gallium/drivers/radeon/r600_buffer_common.c @@ -84,7 +84,7 @@ void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx, } } - if (busy || ctx->ws->buffer_is_busy(resource->buf, rusage)) { + if (busy || !ctx->ws->buffer_wait(resource->buf, 0, rusage)) { if (usage & PIPE_TRANSFER_DONTBLOCK) { return NULL; } else { @@ -121,7 +121,8 @@ bool r600_init_resource(struct r600_common_screen *rscreen, /* Older kernels didn't always flush the HDP cache before * CS execution */ - if (rscreen->info.drm_minor < 40) { + if (rscreen->info.drm_major == 2 && + rscreen->info.drm_minor < 40) { res->domains = RADEON_DOMAIN_GTT; flags |= RADEON_FLAG_GTT_WC; break; @@ -147,7 +148,8 @@ bool r600_init_resource(struct r600_common_screen *rscreen, * Write-combined CPU mappings are fine, the kernel ensures all CPU * writes finish before the GPU executes a command stream. */ - if (rscreen->info.drm_minor < 40) + if (rscreen->info.drm_major == 2 && + rscreen->info.drm_minor < 40) res->domains = RADEON_DOMAIN_GTT; else if (res->domains & RADEON_DOMAIN_VRAM) flags |= RADEON_FLAG_CPU_ACCESS; @@ -161,6 +163,9 @@ bool r600_init_resource(struct r600_common_screen *rscreen, flags |= RADEON_FLAG_NO_CPU_ACCESS; } + if (rscreen->debug_flags & DBG_NO_WC) + flags &= ~RADEON_FLAG_GTT_WC; + /* Allocate a new resource. */ new_buf = rscreen->ws->buffer_create(rscreen->ws, size, alignment, use_reusable_pool, @@ -274,7 +279,7 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx, /* Check if mapping this buffer would cause waiting for the GPU. */ if (r600_rings_is_buffer_referenced(rctx, rbuffer->cs_buf, RADEON_USAGE_READWRITE) || - rctx->ws->buffer_is_busy(rbuffer->buf, RADEON_USAGE_READWRITE)) { + !rctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) { rctx->invalidate_buffer(&rctx->b, &rbuffer->b.b); } /* At this point, the buffer is always idle. */ @@ -288,7 +293,7 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx, /* Check if mapping this buffer would cause waiting for the GPU. 
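
The winsys change running through these hunks replaces the dedicated busy probe with a zero-timeout wait, so one entry point covers both polling and blocking. Stated as a comment, since the winsys structures are not reproduced here:

/* old:  busy = ws->buffer_is_busy(buf, usage);
 * new:  busy = !ws->buffer_wait(buf, 0, usage);   timeout 0 = poll
 *
 * A nonzero timeout turns the same call into a bounded wait, and an
 * infinite one blocks until the buffer is idle for the given usage,
 * returning true on success. */
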
*/ if (r600_rings_is_buffer_referenced(rctx, rbuffer->cs_buf, RADEON_USAGE_READWRITE) || - rctx->ws->buffer_is_busy(rbuffer->buf, RADEON_USAGE_READWRITE)) { + !rctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) { /* Do a wait-free write-only transfer using a temporary buffer. */ unsigned offset; struct r600_resource *staging = NULL; diff --git a/src/gallium/drivers/radeon/r600_cs.h b/src/gallium/drivers/radeon/r600_cs.h index b51eebbc68e..03a04b754d6 100644 --- a/src/gallium/drivers/radeon/r600_cs.h +++ b/src/gallium/drivers/radeon/r600_cs.h @@ -33,7 +33,7 @@ #include "r600_pipe_common.h" #include "r600d_common.h" -static INLINE unsigned r600_context_bo_reloc(struct r600_common_context *rctx, +static inline unsigned r600_context_bo_reloc(struct r600_common_context *rctx, struct r600_ring *ring, struct r600_resource *rbo, enum radeon_bo_usage usage, @@ -59,7 +59,7 @@ static INLINE unsigned r600_context_bo_reloc(struct r600_common_context *rctx, rbo->domains, priority) * 4; } -static INLINE void r600_emit_reloc(struct r600_common_context *rctx, +static inline void r600_emit_reloc(struct r600_common_context *rctx, struct r600_ring *ring, struct r600_resource *rbo, enum radeon_bo_usage usage, enum radeon_bo_priority priority) @@ -74,57 +74,57 @@ static INLINE void r600_emit_reloc(struct r600_common_context *rctx, } } -static INLINE void r600_write_config_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num) +static inline void r600_write_config_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num) { assert(reg < R600_CONTEXT_REG_OFFSET); - assert(cs->cdw+2+num <= RADEON_MAX_CMDBUF_DWORDS); + assert(cs->cdw+2+num <= cs->max_dw); radeon_emit(cs, PKT3(PKT3_SET_CONFIG_REG, num, 0)); radeon_emit(cs, (reg - R600_CONFIG_REG_OFFSET) >> 2); } -static INLINE void r600_write_config_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value) +static inline void r600_write_config_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value) { r600_write_config_reg_seq(cs, reg, 1); radeon_emit(cs, value); } -static INLINE void r600_write_context_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num) +static inline void r600_write_context_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num) { assert(reg >= R600_CONTEXT_REG_OFFSET); - assert(cs->cdw+2+num <= RADEON_MAX_CMDBUF_DWORDS); + assert(cs->cdw+2+num <= cs->max_dw); radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, num, 0)); radeon_emit(cs, (reg - R600_CONTEXT_REG_OFFSET) >> 2); } -static INLINE void r600_write_context_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value) +static inline void r600_write_context_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value) { r600_write_context_reg_seq(cs, reg, 1); radeon_emit(cs, value); } -static INLINE void si_write_sh_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num) +static inline void si_write_sh_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num) { assert(reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END); - assert(cs->cdw+2+num <= RADEON_MAX_CMDBUF_DWORDS); + assert(cs->cdw+2+num <= cs->max_dw); radeon_emit(cs, PKT3(PKT3_SET_SH_REG, num, 0)); radeon_emit(cs, (reg - SI_SH_REG_OFFSET) >> 2); } -static INLINE void si_write_sh_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value) +static inline void si_write_sh_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value) { si_write_sh_reg_seq(cs, reg, 1); radeon_emit(cs, value); } -static INLINE void cik_write_uconfig_reg_seq(struct radeon_winsys_cs 
*cs, unsigned reg, unsigned num) +static inline void cik_write_uconfig_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num) { assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END); - assert(cs->cdw+2+num <= RADEON_MAX_CMDBUF_DWORDS); + assert(cs->cdw+2+num <= cs->max_dw); radeon_emit(cs, PKT3(PKT3_SET_UCONFIG_REG, num, 0)); radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2); } -static INLINE void cik_write_uconfig_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value) +static inline void cik_write_uconfig_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value) { cik_write_uconfig_reg_seq(cs, reg, 1); radeon_emit(cs, value); diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c index 3def4446882..ed5d1dabdc3 100644 --- a/src/gallium/drivers/radeon/r600_pipe_common.c +++ b/src/gallium/drivers/radeon/r600_pipe_common.c @@ -108,9 +108,9 @@ void r600_draw_rectangle(struct blitter_context *blitter, void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw) { /* Flush if there's not enough space. */ - if ((num_dw + ctx->rings.dma.cs->cdw) > RADEON_MAX_CMDBUF_DWORDS) { + if ((num_dw + ctx->rings.dma.cs->cdw) > ctx->rings.dma.cs->max_dw) { ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL); - assert((num_dw + ctx->rings.dma.cs->cdw) <= RADEON_MAX_CMDBUF_DWORDS); + assert((num_dw + ctx->rings.dma.cs->cdw) <= ctx->rings.dma.cs->max_dw); } } @@ -132,10 +132,11 @@ void r600_preflush_suspend_features(struct r600_common_context *ctx) } /* suspend queries */ - ctx->nontimer_queries_suspended = false; + ctx->queries_suspended_for_flush = false; if (ctx->num_cs_dw_nontimer_queries_suspend) { r600_suspend_nontimer_queries(ctx); - ctx->nontimer_queries_suspended = true; + r600_suspend_timer_queries(ctx); + ctx->queries_suspended_for_flush = true; } ctx->streamout.suspended = false; @@ -153,8 +154,9 @@ void r600_postflush_resume_features(struct r600_common_context *ctx) } /* resume queries */ - if (ctx->nontimer_queries_suspended) { + if (ctx->queries_suspended_for_flush) { r600_resume_nontimer_queries(ctx); + r600_resume_timer_queries(ctx); } /* Re-enable render condition. 
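
r600_need_dma_space() above shows the sizing pattern now that each command stream carries its own limit (cs->max_dw) instead of the compile-time RADEON_MAX_CMDBUF_DWORDS: flush whenever the next packet would not fit, then assert it fits in the fresh IB. A self-contained sketch with stub types:

#include <assert.h>

struct cs { unsigned cdw, max_dw; };          /* stand-in for radeon_winsys_cs */

static void flush(struct cs *cs) { cs->cdw = 0; }  /* submit + reset (stub) */

static void need_space(struct cs *cs, unsigned num_dw)
{
   if (cs->cdw + num_dw > cs->max_dw)
      flush(cs);
   assert(cs->cdw + num_dw <= cs->max_dw);    /* must fit in an empty IB */
}
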
*/ @@ -196,6 +198,19 @@ static void r600_flush_dma_ring(void *ctx, unsigned flags, rctx->rings.dma.flushing = false; } +static enum pipe_reset_status r600_get_reset_status(struct pipe_context *ctx) +{ + struct r600_common_context *rctx = (struct r600_common_context *)ctx; + unsigned latest = rctx->ws->query_value(rctx->ws, + RADEON_GPU_RESET_COUNTER); + + if (rctx->gpu_reset_counter == latest) + return PIPE_NO_RESET; + + rctx->gpu_reset_counter = latest; + return PIPE_UNKNOWN_CONTEXT_RESET; +} + bool r600_common_context_init(struct r600_common_context *rctx, struct r600_common_screen *rscreen) { @@ -222,6 +237,13 @@ bool r600_common_context_init(struct r600_common_context *rctx, rctx->b.memory_barrier = r600_memory_barrier; rctx->b.flush = r600_flush_from_st; + if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 43) { + rctx->b.get_device_reset_status = r600_get_reset_status; + rctx->gpu_reset_counter = + rctx->ws->query_value(rctx->ws, + RADEON_GPU_RESET_COUNTER); + } + LIST_INITHEAD(&rctx->texture_buffers); r600_init_context_texture_functions(rctx); @@ -240,8 +262,12 @@ bool r600_common_context_init(struct r600_common_context *rctx, if (!rctx->uploader) return false; + rctx->ctx = rctx->ws->ctx_create(rctx->ws); + if (!rctx->ctx) + return false; + if (rscreen->info.r600_has_dma && !(rscreen->debug_flags & DBG_NO_ASYNC_DMA)) { - rctx->rings.dma.cs = rctx->ws->cs_create(rctx->ws, RING_DMA, + rctx->rings.dma.cs = rctx->ws->cs_create(rctx->ctx, RING_DMA, r600_flush_dma_ring, rctx, NULL); rctx->rings.dma.flush = r600_flush_dma_ring; @@ -252,12 +278,12 @@ bool r600_common_context_init(struct r600_common_context *rctx, void r600_common_context_cleanup(struct r600_common_context *rctx) { - if (rctx->rings.gfx.cs) { + if (rctx->rings.gfx.cs) rctx->ws->cs_destroy(rctx->rings.gfx.cs); - } - if (rctx->rings.dma.cs) { + if (rctx->rings.dma.cs) rctx->ws->cs_destroy(rctx->rings.dma.cs); - } + if (rctx->ctx) + rctx->ws->ctx_destroy(rctx->ctx); if (rctx->uploader) { u_upload_destroy(rctx->uploader); @@ -313,6 +339,11 @@ static const struct debug_named_value common_debug_options[] = { { "gs", DBG_GS, "Print geometry shaders" }, { "ps", DBG_PS, "Print pixel shaders" }, { "cs", DBG_CS, "Print compute shaders" }, + { "tcs", DBG_TCS, "Print tessellation control shaders" }, + { "tes", DBG_TES, "Print tessellation evaluation shaders" }, + { "noir", DBG_NO_IR, "Don't print the LLVM IR"}, + { "notgsi", DBG_NO_TGSI, "Don't print the TGSI"}, + { "noasm", DBG_NO_ASM, "Don't print disassembled shaders"}, /* features */ { "nodma", DBG_NO_ASYNC_DMA, "Disable asynchronous DMA" }, @@ -324,6 +355,7 @@ static const struct debug_named_value common_debug_options[] = { { "switch_on_eop", DBG_SWITCH_ON_EOP, "Program WD/IA to switch on end-of-packet." }, { "forcedma", DBG_FORCE_DMA, "Use asynchronous DMA for all operations when possible." }, { "precompile", DBG_PRECOMPILE, "Compile one shader variant at shader creation." 
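
r600_get_reset_status() above reduces robustness reporting to a counter compare: the kernel bumps RADEON_GPU_RESET_COUNTER on every GPU reset, the context caches the value it saw at creation, and any difference is reported once. The skeleton of that pattern, with query_counter() as a hypothetical stand-in for ws->query_value():

unsigned query_counter(void);       /* hypothetical kernel query */

static unsigned cached;             /* seeded at context creation */

static int context_was_reset(void)
{
   unsigned latest = query_counter();

   if (latest == cached)
      return 0;                     /* PIPE_NO_RESET */

   cached = latest;                 /* report each reset only once */
   return 1;                        /* PIPE_UNKNOWN_CONTEXT_RESET */
}
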
}, + { "nowc", DBG_NO_WC, "Disable GTT write combining" }, DEBUG_NAMED_VALUE_END /* must be last */ }; @@ -338,11 +370,9 @@ static const char* r600_get_device_vendor(struct pipe_screen* pscreen) return "AMD"; } -static const char* r600_get_name(struct pipe_screen* pscreen) +static const char* r600_get_chip_name(struct r600_common_screen *rscreen) { - struct r600_common_screen *rscreen = (struct r600_common_screen*)pscreen; - - switch (rscreen->family) { + switch (rscreen->info.family) { case CHIP_R600: return "AMD R600"; case CHIP_RV610: return "AMD RV610"; case CHIP_RV630: return "AMD RV630"; @@ -378,10 +408,21 @@ static const char* r600_get_name(struct pipe_screen* pscreen) case CHIP_KABINI: return "AMD KABINI"; case CHIP_HAWAII: return "AMD HAWAII"; case CHIP_MULLINS: return "AMD MULLINS"; + case CHIP_TONGA: return "AMD TONGA"; + case CHIP_ICELAND: return "AMD ICELAND"; + case CHIP_CARRIZO: return "AMD CARRIZO"; + case CHIP_FIJI: return "AMD FIJI"; default: return "AMD unknown"; } } +static const char* r600_get_name(struct pipe_screen* pscreen) +{ + struct r600_common_screen *rscreen = (struct r600_common_screen*)pscreen; + + return rscreen->renderer_string; +} + static float r600_get_paramf(struct pipe_screen* pscreen, enum pipe_capf param) { @@ -495,6 +536,10 @@ const char *r600_get_llvm_processor_name(enum radeon_family family) #else return "kabini"; #endif + case CHIP_TONGA: return "tonga"; + case CHIP_ICELAND: return "iceland"; + case CHIP_CARRIZO: return "carrizo"; + case CHIP_FIJI: return "fiji"; default: return ""; } } @@ -636,6 +681,12 @@ static int r600_get_compute_param(struct pipe_screen *screen, return sizeof(uint32_t); case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE: break; /* unused */ + case PIPE_COMPUTE_CAP_SUBGROUP_SIZE: + if (ret) { + uint32_t *subgroup_size = ret; + *subgroup_size = r600_wavefront_size(rscreen->family); + } + return sizeof(uint32_t); } fprintf(stderr, "unknown PIPE_COMPUTE_CAP %d\n", param); @@ -656,25 +707,33 @@ static int r600_get_driver_query_info(struct pipe_screen *screen, { struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; struct pipe_driver_query_info list[] = { + {"num-compilations", R600_QUERY_NUM_COMPILATIONS, {0}, PIPE_DRIVER_QUERY_TYPE_UINT64, + PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE}, + {"num-shaders-created", R600_QUERY_NUM_SHADERS_CREATED, {0}, PIPE_DRIVER_QUERY_TYPE_UINT64, + PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE}, {"draw-calls", R600_QUERY_DRAW_CALLS, {0}}, {"requested-VRAM", R600_QUERY_REQUESTED_VRAM, {rscreen->info.vram_size}, PIPE_DRIVER_QUERY_TYPE_BYTES}, {"requested-GTT", R600_QUERY_REQUESTED_GTT, {rscreen->info.gart_size}, PIPE_DRIVER_QUERY_TYPE_BYTES}, - {"buffer-wait-time", R600_QUERY_BUFFER_WAIT_TIME, {0}}, + {"buffer-wait-time", R600_QUERY_BUFFER_WAIT_TIME, {0}, PIPE_DRIVER_QUERY_TYPE_MICROSECONDS, + PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE}, {"num-cs-flushes", R600_QUERY_NUM_CS_FLUSHES, {0}}, - {"num-bytes-moved", R600_QUERY_NUM_BYTES_MOVED, {0}, PIPE_DRIVER_QUERY_TYPE_BYTES}, + {"num-bytes-moved", R600_QUERY_NUM_BYTES_MOVED, {0}, PIPE_DRIVER_QUERY_TYPE_BYTES, + PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE}, {"VRAM-usage", R600_QUERY_VRAM_USAGE, {rscreen->info.vram_size}, PIPE_DRIVER_QUERY_TYPE_BYTES}, {"GTT-usage", R600_QUERY_GTT_USAGE, {rscreen->info.gart_size}, PIPE_DRIVER_QUERY_TYPE_BYTES}, + {"GPU-load", R600_QUERY_GPU_LOAD, {100}}, {"temperature", R600_QUERY_GPU_TEMPERATURE, {100}}, - {"shader-clock", R600_QUERY_CURRENT_GPU_SCLK, {0}}, - {"memory-clock", R600_QUERY_CURRENT_GPU_MCLK, {0}}, - 
{"GPU-load", R600_QUERY_GPU_LOAD, {100}} + {"shader-clock", R600_QUERY_CURRENT_GPU_SCLK, {0}, PIPE_DRIVER_QUERY_TYPE_HZ}, + {"memory-clock", R600_QUERY_CURRENT_GPU_MCLK, {0}, PIPE_DRIVER_QUERY_TYPE_HZ}, }; unsigned num_queries; if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 42) num_queries = Elements(list); + else if (rscreen->info.drm_major == 3) + num_queries = Elements(list) - 3; else - num_queries = 8; + num_queries = Elements(list) - 4; if (!info) return num_queries; @@ -695,14 +754,6 @@ static void r600_fence_reference(struct pipe_screen *screen, rws->fence_reference(ptr, fence); } -static boolean r600_fence_signalled(struct pipe_screen *screen, - struct pipe_fence_handle *fence) -{ - struct radeon_winsys *rws = ((struct r600_common_screen*)screen)->ws; - - return rws->fence_wait(rws, fence, 0); -} - static boolean r600_fence_finish(struct pipe_screen *screen, struct pipe_fence_handle *fence, uint64_t timeout) @@ -837,8 +888,22 @@ struct pipe_resource *r600_resource_create_common(struct pipe_screen *screen, bool r600_common_screen_init(struct r600_common_screen *rscreen, struct radeon_winsys *ws) { + char llvm_string[32] = {}; + ws->query_info(ws, &rscreen->info); +#if HAVE_LLVM + snprintf(llvm_string, sizeof(llvm_string), + ", LLVM %i.%i.%i", (HAVE_LLVM >> 8) & 0xff, + HAVE_LLVM & 0xff, MESA_LLVM_VERSION_PATCH); +#endif + + snprintf(rscreen->renderer_string, sizeof(rscreen->renderer_string), + "%s (DRM %i.%i.%i%s)", + r600_get_chip_name(rscreen), rscreen->info.drm_major, + rscreen->info.drm_minor, rscreen->info.drm_patchlevel, + llvm_string); + rscreen->b.get_name = r600_get_name; rscreen->b.get_vendor = r600_get_vendor; rscreen->b.get_device_vendor = r600_get_device_vendor; @@ -848,7 +913,6 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen, rscreen->b.get_timestamp = r600_get_timestamp; rscreen->b.fence_finish = r600_fence_finish; rscreen->b.fence_reference = r600_fence_reference; - rscreen->b.fence_signalled = r600_fence_signalled; rscreen->b.resource_destroy = u_resource_destroy_vtbl; rscreen->b.resource_from_user_memory = r600_buffer_from_user_memory; @@ -874,7 +938,9 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen, pipe_mutex_init(rscreen->aux_context_lock); pipe_mutex_init(rscreen->gpu_load_mutex); - if (rscreen->info.drm_minor >= 28 && (rscreen->debug_flags & DBG_TRACE_CS)) { + if (((rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 28) || + rscreen->info.drm_major == 3) && + (rscreen->debug_flags & DBG_TRACE_CS)) { rscreen->trace_bo = (struct r600_resource*)pipe_buffer_create(&rscreen->b, PIPE_BIND_CUSTOM, PIPE_USAGE_STAGING, @@ -922,10 +988,8 @@ void r600_destroy_common_screen(struct r600_common_screen *rscreen) pipe_mutex_destroy(rscreen->aux_context_lock); rscreen->aux_context->destroy(rscreen->aux_context); - if (rscreen->trace_bo) { - rscreen->ws->buffer_unmap(rscreen->trace_bo->cs_buf); + if (rscreen->trace_bo) pipe_resource_reference((struct pipe_resource**)&rscreen->trace_bo, NULL); - } rscreen->ws->destroy(rscreen->ws); FREE(rscreen); @@ -941,6 +1005,10 @@ bool r600_can_dump_shader(struct r600_common_screen *rscreen, switch (tgsi_get_processor_type(tokens)) { case TGSI_PROCESSOR_VERTEX: return (rscreen->debug_flags & DBG_VS) != 0; + case TGSI_PROCESSOR_TESS_CTRL: + return (rscreen->debug_flags & DBG_TCS) != 0; + case TGSI_PROCESSOR_TESS_EVAL: + return (rscreen->debug_flags & DBG_TES) != 0; case TGSI_PROCESSOR_GEOMETRY: return (rscreen->debug_flags & DBG_GS) != 0; case TGSI_PROCESSOR_FRAGMENT: diff --git 
a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h index 6ce81d33ddd..29db1cc4e07 100644 --- a/src/gallium/drivers/radeon/r600_pipe_common.h +++ b/src/gallium/drivers/radeon/r600_pipe_common.h @@ -59,6 +59,8 @@ #define R600_QUERY_CURRENT_GPU_SCLK (PIPE_QUERY_DRIVER_SPECIFIC + 9) #define R600_QUERY_CURRENT_GPU_MCLK (PIPE_QUERY_DRIVER_SPECIFIC + 10) #define R600_QUERY_GPU_LOAD (PIPE_QUERY_DRIVER_SPECIFIC + 11) +#define R600_QUERY_NUM_COMPILATIONS (PIPE_QUERY_DRIVER_SPECIFIC + 12) +#define R600_QUERY_NUM_SHADERS_CREATED (PIPE_QUERY_DRIVER_SPECIFIC + 13) #define R600_CONTEXT_STREAMOUT_FLUSH (1u << 0) #define R600_CONTEXT_PRIVATE_FLAG (1u << 1) @@ -79,17 +81,23 @@ #define DBG_GS (1 << 7) #define DBG_PS (1 << 8) #define DBG_CS (1 << 9) +#define DBG_TCS (1 << 10) +#define DBG_TES (1 << 11) +#define DBG_NO_IR (1 << 12) +#define DBG_NO_TGSI (1 << 13) +#define DBG_NO_ASM (1 << 14) +/* Bits 21-31 are reserved for the r600g driver. */ /* features */ -#define DBG_NO_ASYNC_DMA (1 << 10) -#define DBG_NO_HYPERZ (1 << 11) -#define DBG_NO_DISCARD_RANGE (1 << 12) -#define DBG_NO_2D_TILING (1 << 13) -#define DBG_NO_TILING (1 << 14) -#define DBG_SWITCH_ON_EOP (1 << 15) -#define DBG_FORCE_DMA (1 << 16) -#define DBG_PRECOMPILE (1 << 17) -#define DBG_INFO (1 << 18) -/* The maximum allowed bit is 20. */ +#define DBG_NO_ASYNC_DMA (1llu << 32) +#define DBG_NO_HYPERZ (1llu << 33) +#define DBG_NO_DISCARD_RANGE (1llu << 34) +#define DBG_NO_2D_TILING (1llu << 35) +#define DBG_NO_TILING (1llu << 36) +#define DBG_SWITCH_ON_EOP (1llu << 37) +#define DBG_FORCE_DMA (1llu << 38) +#define DBG_PRECOMPILE (1llu << 39) +#define DBG_INFO (1llu << 40) +#define DBG_NO_WC (1llu << 41) #define R600_MAP_BUFFER_ALIGNMENT 64 @@ -127,9 +135,8 @@ struct radeon_shader_binary { struct radeon_shader_reloc *relocs; unsigned reloc_count; - /** Set to 1 if the disassembly for this binary has been dumped to - * stderr. */ - int disassembled; + /** Disassembled shader in a string. */ + char *disasm_string; }; struct r600_resource { @@ -214,7 +221,6 @@ struct r600_texture { float depth_clear_value; bool non_disp_tiling; /* R600-Cayman only */ - unsigned mipmap_shift; }; struct r600_surface { @@ -236,6 +242,7 @@ struct r600_surface { unsigned cb_color_pitch; /* EG and later */ unsigned cb_color_slice; /* EG and later */ unsigned cb_color_attrib; /* EG and later */ + unsigned cb_dcc_control; /* VI and later */ unsigned cb_color_fmask; /* CB_COLORn_FMASK (EG and later) or CB_COLORn_FRAG (r600) */ unsigned cb_color_fmask_slice; /* EG and later */ unsigned cb_color_cmask; /* CB_COLORn_TILE (r600 only) */ @@ -272,7 +279,7 @@ struct r600_common_screen { enum chip_class chip_class; struct radeon_info info; struct r600_tiling_info tiling_info; - unsigned debug_flags; + uint64_t debug_flags; bool has_cp_dma; bool has_streamout; @@ -285,12 +292,23 @@ struct r600_common_screen { uint32_t *trace_ptr; unsigned cs_count; + /* This must be in the screen, because UE4 uses one context for + * compilation and another one for rendering. + */ + unsigned num_compilations; + /* Along with ST_DEBUG=precompile, this should show if applications + * are loading shaders on demand. This is a monotonic counter. + */ + unsigned num_shaders_created; + /* GPU load thread. 
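
Moving the feature flags up to bits 32-41 is why every definition switches from (1 << n) to (1llu << n) and why debug_flags widens from unsigned to uint64_t: shifting a 32-bit int past bit 31 is undefined behavior, and a 32-bit holder would drop the new bits in any case. A worked comparison:

/*   1    << 33   undefined: shift count >= width of int
 *   1u   << 33   still undefined for a 32-bit unsigned
 *   1llu << 33   0x200000000, a well-defined 64-bit constant
 *
 * so tests such as (rscreen->debug_flags & DBG_NO_WC) only behave once
 * debug_flags itself is uint64_t. */
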
*/ pipe_mutex gpu_load_mutex; pipe_thread gpu_load_thread; unsigned gpu_load_counter_busy; unsigned gpu_load_counter_idle; - unsigned gpu_load_stop_thread; /* bool */ + volatile unsigned gpu_load_stop_thread; /* bool */ + + char renderer_string[64]; }; /* This encapsulates a state or an operation which can emitted into the GPU @@ -298,6 +316,7 @@ struct r600_common_screen { struct r600_atom { void (*emit)(struct r600_common_context *ctx, struct r600_atom *state); unsigned num_dw; + unsigned short id; /* used by r600 only */ bool dirty; }; @@ -327,6 +346,10 @@ struct r600_streamout { /* External state which comes from the vertex shader, * it must be set explicitly when binding a shader. */ unsigned *stride_in_dw; + unsigned enabled_stream_buffers_mask; /* stream0 buffers0-3 in 4 LSB */ + + /* The state of VGT_STRMOUT_BUFFER_(CONFIG|EN). */ + unsigned hw_enabled_mask; /* The state of VGT_STRMOUT_(CONFIG|EN). */ struct r600_atom enable_atom; @@ -352,10 +375,12 @@ struct r600_common_context { struct r600_common_screen *screen; struct radeon_winsys *ws; + struct radeon_winsys_ctx *ctx; enum radeon_family family; enum chip_class chip_class; struct r600_rings rings; unsigned initial_gfx_cs_size; + unsigned gpu_reset_counter; struct u_upload_mgr *uploader; struct u_suballocator *allocator_so_filled_size; @@ -376,11 +401,14 @@ struct r600_common_context { int num_occlusion_queries; /* Keep track of non-timer queries, because they should be suspended * during context flushing. - * The timer queries (TIME_ELAPSED) shouldn't be suspended. */ + * The timer queries (TIME_ELAPSED) shouldn't be suspended for blits, + * but they should be suspended between IBs. */ struct list_head active_nontimer_queries; + struct list_head active_timer_queries; unsigned num_cs_dw_nontimer_queries_suspend; + unsigned num_cs_dw_timer_queries_suspend; /* If queries have been suspended. */ - bool nontimer_queries_suspended; + bool queries_suspended_for_flush; /* Additional hardware info. */ unsigned backend_mask; unsigned max_db; /* for OQ */ @@ -441,6 +469,9 @@ struct r600_common_context { /* This ensures there is enough space in the command stream. */ void (*need_gfx_cs_space)(struct pipe_context *ctx, unsigned num_dw, bool include_draw_vbo); + + void (*set_atom_dirty)(struct r600_common_context *ctx, + struct r600_atom *atom, bool dirty); }; /* r600_buffer.c */ @@ -495,6 +526,8 @@ unsigned r600_gpu_load_end(struct r600_common_screen *rscreen, uint64_t begin); void r600_query_init(struct r600_common_context *rctx); void r600_suspend_nontimer_queries(struct r600_common_context *ctx); void r600_resume_nontimer_queries(struct r600_common_context *ctx); +void r600_suspend_timer_queries(struct r600_common_context *ctx); +void r600_resume_timer_queries(struct r600_common_context *ctx); void r600_query_init_backend_mask(struct r600_common_context *ctx); /* r600_streamout.c */ @@ -549,12 +582,12 @@ void cayman_emit_msaa_config(struct radeon_winsys_cs *cs, int nr_samples, /* Inline helpers. 
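
The set_atom_dirty hook added to r600_common_context exists because the dirty-atom bitmask lives in r600-only state that the shared radeon code cannot reach; r600_create_context installs r600_set_atom_dirty behind it, with a (void *) cast bridging the r600_context vs. r600_common_context parameter types. A sketch of the indirection with illustrative names:

#include <stdbool.h>

struct atom;                        /* opaque to the shared code */

struct common_ctx {
   void (*set_atom_dirty)(struct common_ctx *ctx,
                          struct atom *atom, bool dirty);
};

/* shared code no longer pokes atom->dirty directly: */
static void shared_path(struct common_ctx *ctx, struct atom *a)
{
   ctx->set_atom_dirty(ctx, a, true);   /* backend keeps its bitmask in sync */
}
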
*/ -static INLINE struct r600_resource *r600_resource(struct pipe_resource *r) +static inline struct r600_resource *r600_resource(struct pipe_resource *r) { return (struct r600_resource*)r; } -static INLINE void +static inline void r600_resource_reference(struct r600_resource **ptr, struct r600_resource *res) { pipe_resource_reference((struct pipe_resource **)ptr, @@ -570,6 +603,26 @@ static inline unsigned r600_tex_aniso_filter(unsigned filter) /* else */ return 4; } +static inline unsigned r600_wavefront_size(enum radeon_family family) +{ + switch (family) { + case CHIP_RV610: + case CHIP_RS780: + case CHIP_RV620: + case CHIP_RS880: + return 16; + case CHIP_RV630: + case CHIP_RV635: + case CHIP_RV730: + case CHIP_RV710: + case CHIP_PALM: + case CHIP_CEDAR: + return 32; + default: + return 64; + } +} + #define COMPUTE_DBG(rscreen, fmt, args...) \ do { \ if ((rscreen->b.debug_flags & DBG_COMPUTE)) fprintf(stderr, fmt, ##args); \ diff --git a/src/gallium/drivers/radeon/r600_query.c b/src/gallium/drivers/radeon/r600_query.c index 71f4a1522f9..7057aa19a7c 100644 --- a/src/gallium/drivers/radeon/r600_query.c +++ b/src/gallium/drivers/radeon/r600_query.c @@ -54,6 +54,8 @@ struct r600_query { uint64_t end_result; /* Fence for GPU_FINISHED. */ struct pipe_fence_handle *fence; + /* For transform feedback: which stream the query is for */ + unsigned stream; }; @@ -90,6 +92,8 @@ static struct r600_resource *r600_new_query_buffer(struct r600_common_context *c case R600_QUERY_CURRENT_GPU_SCLK: case R600_QUERY_CURRENT_GPU_MCLK: case R600_QUERY_GPU_LOAD: + case R600_QUERY_NUM_COMPILATIONS: + case R600_QUERY_NUM_SHADERS_CREATED: return NULL; } @@ -118,7 +122,6 @@ static struct r600_resource *r600_new_query_buffer(struct r600_common_context *c } results += 4 * ctx->max_db; } - ctx->ws->buffer_unmap(buf->cs_buf); break; case PIPE_QUERY_TIME_ELAPSED: case PIPE_QUERY_TIMESTAMP: @@ -130,7 +133,6 @@ static struct r600_resource *r600_new_query_buffer(struct r600_common_context *c case PIPE_QUERY_PIPELINE_STATISTICS: results = r600_buffer_map_sync_with_rings(ctx, buf, PIPE_TRANSFER_WRITE); memset(results, 0, buf_size); - ctx->ws->buffer_unmap(buf->cs_buf); break; default: assert(0); @@ -157,6 +159,17 @@ static void r600_update_occlusion_query_state(struct r600_common_context *rctx, } } +static unsigned event_type_for_stream(struct r600_query *query) +{ + switch (query->stream) { + default: + case 0: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS; + case 1: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS1; + case 2: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS2; + case 3: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS3; + } +} + static void r600_emit_query_begin(struct r600_common_context *ctx, struct r600_query *query) { struct radeon_winsys_cs *cs = ctx->rings.gfx.cs; @@ -191,7 +204,7 @@ static void r600_emit_query_begin(struct r600_common_context *ctx, struct r600_q case PIPE_QUERY_SO_STATISTICS: case PIPE_QUERY_SO_OVERFLOW_PREDICATE: radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); - radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3)); + radeon_emit(cs, EVENT_TYPE(event_type_for_stream(query)) | EVENT_INDEX(3)); radeon_emit(cs, va); radeon_emit(cs, (va >> 32UL) & 0xFF); break; @@ -215,9 +228,10 @@ static void r600_emit_query_begin(struct r600_common_context *ctx, struct r600_q r600_emit_reloc(ctx, &ctx->rings.gfx, query->buffer.buf, RADEON_USAGE_WRITE, RADEON_PRIO_MIN); - if (!r600_is_timer_query(query->type)) { + if (r600_is_timer_query(query->type)) + ctx->num_cs_dw_timer_queries_suspend += query->num_cs_dw; 
+ else ctx->num_cs_dw_nontimer_queries_suspend += query->num_cs_dw; - } } static void r600_emit_query_end(struct r600_common_context *ctx, struct r600_query *query) @@ -248,7 +262,7 @@ case PIPE_QUERY_SO_OVERFLOW_PREDICATE: va += query->buffer.results_end + query->result_size/2; radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); - radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3)); + radeon_emit(cs, EVENT_TYPE(event_type_for_stream(query)) | EVENT_INDEX(3)); radeon_emit(cs, va); radeon_emit(cs, (va >> 32UL) & 0xFF); break; @@ -279,9 +293,10 @@ query->buffer.results_end += query->result_size; if (r600_query_needs_begin(query->type)) { - if (!r600_is_timer_query(query->type)) { + if (r600_is_timer_query(query->type)) + ctx->num_cs_dw_timer_queries_suspend -= query->num_cs_dw; + else ctx->num_cs_dw_nontimer_queries_suspend -= query->num_cs_dw; - } } r600_update_occlusion_query_state(ctx, query->type, -1); @@ -292,6 +307,13 @@ static void r600_emit_query_predication(struct r600_common_context *ctx, struct int operation, bool flag_wait) { struct radeon_winsys_cs *cs = ctx->rings.gfx.cs; + uint32_t op = PRED_OP(operation); + + /* if true then invert, see GL_ARB_conditional_render_inverted */ + if (ctx->current_render_cond_cond) + op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible/overflow */ + else + op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible/overflow */ if (operation == PREDICATION_OP_CLEAR) { ctx->need_gfx_cs_space(&ctx->b, 3, FALSE); @@ -302,24 +324,21 @@ } else { struct r600_query_buffer *qbuf; unsigned count; - uint32_t op; - /* Find how many results there are. */ count = 0; for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) { count += qbuf->results_end / query->result_size; } - + ctx->need_gfx_cs_space(&ctx->b, 5 * count, TRUE); - - op = PRED_OP(operation) | PREDICATION_DRAW_VISIBLE | - (flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW); - + + op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW; + /* emit predicate packets for all data blocks */ for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) { unsigned results_base = 0; uint64_t va = qbuf->buf->gpu_address; - + while (results_base < qbuf->results_end) { radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0)); radeon_emit(cs, (va + results_base) & 0xFFFFFFFFUL); @@ -327,7 +346,7 @@ r600_emit_reloc(ctx, &ctx->rings.gfx, qbuf->buf, RADEON_USAGE_READ, RADEON_PRIO_MIN); results_base += query->result_size; - + /* set CONTINUE bit for all packets except the first */ op |= PREDICATION_CONTINUE; } @@ -369,6 +388,7 @@ static struct pipe_query *r600_create_query(struct pipe_context *ctx, unsigned q /* NumPrimitivesWritten, PrimitiveStorageNeeded. */ query->result_size = 32; query->num_cs_dw = 6; + query->stream = index; break; case PIPE_QUERY_PIPELINE_STATISTICS: /* 11 values on EG, 8 on R600.
*/ @@ -390,6 +410,8 @@ static struct pipe_query *r600_create_query(struct pipe_context *ctx, unsigned q case R600_QUERY_CURRENT_GPU_SCLK: case R600_QUERY_CURRENT_GPU_MCLK: case R600_QUERY_GPU_LOAD: + case R600_QUERY_NUM_COMPILATIONS: + case R600_QUERY_NUM_SHADERS_CREATED: skip_allocation = true; break; default: @@ -454,7 +476,7 @@ static boolean r600_begin_query(struct pipe_context *ctx, rquery->begin_result = 0; return true; case R600_QUERY_BUFFER_WAIT_TIME: - rquery->begin_result = rctx->ws->query_value(rctx->ws, RADEON_BUFFER_WAIT_TIME_NS); + rquery->begin_result = rctx->ws->query_value(rctx->ws, RADEON_BUFFER_WAIT_TIME_NS) / 1000; return true; case R600_QUERY_NUM_CS_FLUSHES: rquery->begin_result = rctx->ws->query_value(rctx->ws, RADEON_NUM_CS_FLUSHES); @@ -465,6 +487,12 @@ static boolean r600_begin_query(struct pipe_context *ctx, case R600_QUERY_GPU_LOAD: rquery->begin_result = r600_gpu_load_begin(rctx->screen); return true; + case R600_QUERY_NUM_COMPILATIONS: + rquery->begin_result = p_atomic_read(&rctx->screen->num_compilations); + return true; + case R600_QUERY_NUM_SHADERS_CREATED: + rquery->begin_result = p_atomic_read(&rctx->screen->num_shaders_created); + return true; } /* Discard the old query buffers. */ @@ -477,7 +505,7 @@ static boolean r600_begin_query(struct pipe_context *ctx, /* Obtain a new buffer if the current one can't be mapped without a stall. */ if (r600_rings_is_buffer_referenced(rctx, rquery->buffer.buf->cs_buf, RADEON_USAGE_READWRITE) || - rctx->ws->buffer_is_busy(rquery->buffer.buf->buf, RADEON_USAGE_READWRITE)) { + !rctx->ws->buffer_wait(rquery->buffer.buf->buf, 0, RADEON_USAGE_READWRITE)) { pipe_resource_reference((struct pipe_resource**)&rquery->buffer.buf, NULL); rquery->buffer.buf = r600_new_query_buffer(rctx, rquery->type); } @@ -487,9 +515,10 @@ static boolean r600_begin_query(struct pipe_context *ctx, r600_emit_query_begin(rctx, rquery); - if (!r600_is_timer_query(rquery->type)) { + if (r600_is_timer_query(rquery->type)) + LIST_ADDTAIL(&rquery->list, &rctx->active_timer_queries); + else LIST_ADDTAIL(&rquery->list, &rctx->active_nontimer_queries); - } return true; } @@ -515,7 +544,7 @@ static void r600_end_query(struct pipe_context *ctx, struct pipe_query *query) rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_REQUESTED_GTT_MEMORY); return; case R600_QUERY_BUFFER_WAIT_TIME: - rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_BUFFER_WAIT_TIME_NS); + rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_BUFFER_WAIT_TIME_NS) / 1000; return; case R600_QUERY_NUM_CS_FLUSHES: rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_NUM_CS_FLUSHES); @@ -541,13 +570,18 @@ static void r600_end_query(struct pipe_context *ctx, struct pipe_query *query) case R600_QUERY_GPU_LOAD: rquery->end_result = r600_gpu_load_end(rctx->screen, rquery->begin_result); return; + case R600_QUERY_NUM_COMPILATIONS: + rquery->end_result = p_atomic_read(&rctx->screen->num_compilations); + return; + case R600_QUERY_NUM_SHADERS_CREATED: + rquery->end_result = p_atomic_read(&rctx->screen->num_shaders_created); + return; } r600_emit_query_end(rctx, rquery); - if (r600_query_needs_begin(rquery->type) && !r600_is_timer_query(rquery->type)) { + if (r600_query_needs_begin(rquery->type)) LIST_DELINIT(&rquery->list); - } } static unsigned r600_query_read_result(char *map, unsigned start_index, unsigned end_index, @@ -601,6 +635,8 @@ static boolean r600_get_query_buffer_result(struct r600_common_context *ctx, case R600_QUERY_GPU_TEMPERATURE: case 
R600_QUERY_CURRENT_GPU_SCLK: case R600_QUERY_CURRENT_GPU_MCLK: + case R600_QUERY_NUM_COMPILATIONS: + case R600_QUERY_NUM_SHADERS_CREATED: result->u64 = query->end_result - query->begin_result; return TRUE; case R600_QUERY_GPU_LOAD: @@ -751,7 +787,6 @@ static boolean r600_get_query_buffer_result(struct r600_common_context *ctx, assert(0); } - ctx->ws->buffer_unmap(qbuf->buf->cs_buf); return TRUE; } @@ -823,22 +858,37 @@ static void r600_render_condition(struct pipe_context *ctx, } } -void r600_suspend_nontimer_queries(struct r600_common_context *ctx) +static void r600_suspend_queries(struct r600_common_context *ctx, + struct list_head *query_list, + unsigned *num_cs_dw_queries_suspend) { struct r600_query *query; - LIST_FOR_EACH_ENTRY(query, &ctx->active_nontimer_queries, list) { + LIST_FOR_EACH_ENTRY(query, query_list, list) { r600_emit_query_end(ctx, query); } - assert(ctx->num_cs_dw_nontimer_queries_suspend == 0); + assert(*num_cs_dw_queries_suspend == 0); +} + +void r600_suspend_nontimer_queries(struct r600_common_context *ctx) +{ + r600_suspend_queries(ctx, &ctx->active_nontimer_queries, + &ctx->num_cs_dw_nontimer_queries_suspend); +} + +void r600_suspend_timer_queries(struct r600_common_context *ctx) +{ + r600_suspend_queries(ctx, &ctx->active_timer_queries, + &ctx->num_cs_dw_timer_queries_suspend); } -static unsigned r600_queries_num_cs_dw_for_resuming(struct r600_common_context *ctx) +static unsigned r600_queries_num_cs_dw_for_resuming(struct r600_common_context *ctx, + struct list_head *query_list) { struct r600_query *query; unsigned num_dw = 0; - LIST_FOR_EACH_ENTRY(query, &ctx->active_nontimer_queries, list) { + LIST_FOR_EACH_ENTRY(query, query_list, list) { /* begin + end */ num_dw += query->num_cs_dw * 2; @@ -857,21 +907,35 @@ static unsigned r600_queries_num_cs_dw_for_resuming(struct r600_common_context * return num_dw; } -void r600_resume_nontimer_queries(struct r600_common_context *ctx) +static void r600_resume_queries(struct r600_common_context *ctx, + struct list_head *query_list, + unsigned *num_cs_dw_queries_suspend) { struct r600_query *query; + unsigned num_cs_dw = r600_queries_num_cs_dw_for_resuming(ctx, query_list); - assert(ctx->num_cs_dw_nontimer_queries_suspend == 0); + assert(*num_cs_dw_queries_suspend == 0); /* Check CS space here. Resuming must not be interrupted by flushes. 
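The refactor here folds the timer and non-timer suspend/resume paths into one worker parameterized by the query list and its dword counter, so the public entry points shrink to wrappers. The shape of that refactor, reduced to a standalone toy (the example_ names are hypothetical and the GPU work is a printf):

#include <stdio.h>

struct example_ctx {
	unsigned num_cs_dw_nontimer_suspend;
	unsigned num_cs_dw_timer_suspend;
};

/* Shared worker: ends the queries tracked by one counter. */
static void example_suspend(unsigned *num_cs_dw_suspend, const char *what)
{
	printf("suspending %s queries (%u reserved dwords)\n",
	       what, *num_cs_dw_suspend);
	*num_cs_dw_suspend = 0;
}

/* The two entry points become one-line wrappers, mirroring
 * r600_suspend_nontimer_queries() and r600_suspend_timer_queries(). */
static void example_suspend_nontimer(struct example_ctx *ctx)
{
	example_suspend(&ctx->num_cs_dw_nontimer_suspend, "non-timer");
}

static void example_suspend_timer(struct example_ctx *ctx)
{
	example_suspend(&ctx->num_cs_dw_timer_suspend, "timer");
}

int main(void)
{
	struct example_ctx ctx = { 24, 12 };

	/* Per the comment above: a blit only pauses the non-timer class,
	 * while a flush pauses both so no query spans an IB boundary. */
	example_suspend_nontimer(&ctx);
	example_suspend_timer(&ctx);
	return 0;
}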
*/ - ctx->need_gfx_cs_space(&ctx->b, - r600_queries_num_cs_dw_for_resuming(ctx), TRUE); + ctx->need_gfx_cs_space(&ctx->b, num_cs_dw, TRUE); - LIST_FOR_EACH_ENTRY(query, &ctx->active_nontimer_queries, list) { + LIST_FOR_EACH_ENTRY(query, query_list, list) { r600_emit_query_begin(ctx, query); } } +void r600_resume_nontimer_queries(struct r600_common_context *ctx) +{ + r600_resume_queries(ctx, &ctx->active_nontimer_queries, + &ctx->num_cs_dw_nontimer_queries_suspend); +} + +void r600_resume_timer_queries(struct r600_common_context *ctx) +{ + r600_resume_queries(ctx, &ctx->active_timer_queries, + &ctx->num_cs_dw_timer_queries_suspend); +} + /* Get backends mask */ void r600_query_init_backend_mask(struct r600_common_context *ctx) { @@ -919,7 +983,6 @@ void r600_query_init_backend_mask(struct r600_common_context *ctx) results = r600_buffer_map_sync_with_rings(ctx, buffer, PIPE_TRANSFER_WRITE); if (results) { memset(results, 0, ctx->max_db * 4 * 4); - ctx->ws->buffer_unmap(buffer->cs_buf); /* emit EVENT_WRITE for ZPASS_DONE */ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); @@ -937,7 +1000,6 @@ void r600_query_init_backend_mask(struct r600_common_context *ctx) if (results[i*4 + 1]) mask |= (1<<i); } - ctx->ws->buffer_unmap(buffer->cs_buf); } } @@ -966,4 +1028,5 @@ void r600_query_init(struct r600_common_context *rctx) rctx->b.render_condition = r600_render_condition; LIST_INITHEAD(&rctx->active_nontimer_queries); + LIST_INITHEAD(&rctx->active_timer_queries); } diff --git a/src/gallium/drivers/radeon/r600_streamout.c b/src/gallium/drivers/radeon/r600_streamout.c index bc8bf97ef89..0853f636a27 100644 --- a/src/gallium/drivers/radeon/r600_streamout.c +++ b/src/gallium/drivers/radeon/r600_streamout.c @@ -88,8 +88,7 @@ void r600_streamout_buffers_dirty(struct r600_common_context *rctx) 12 + /* flush_vgt_streamout */ num_bufs * 11; /* STRMOUT_BUFFER_UPDATE, BUFFER_SIZE */ - begin->num_dw = 12 + /* flush_vgt_streamout */ - 3; /* VGT_STRMOUT_BUFFER_CONFIG */ + begin->num_dw = 12; /* flush_vgt_streamout */ if (rctx->chip_class >= SI) { begin->num_dw += num_bufs * 4; /* SET_CONTEXT_REG */ @@ -105,7 +104,7 @@ void r600_streamout_buffers_dirty(struct r600_common_context *rctx) (num_bufs - num_bufs_appended) * 6 + /* STRMOUT_BUFFER_UPDATE */ (rctx->family > CHIP_R600 && rctx->family < CHIP_RS780 ? 2 : 0); /* SURFACE_BASE_UPDATE */ - begin->dirty = true; + rctx->set_atom_dirty(rctx, begin, true); r600_set_streamout_enable(rctx, true); } @@ -146,7 +145,7 @@ void r600_set_streamout_targets(struct pipe_context *ctx, if (num_targets) { r600_streamout_buffers_dirty(rctx); } else { - rctx->streamout.begin_atom.dirty = false; + rctx->set_atom_dirty(rctx, &rctx->streamout.begin_atom, false); r600_set_streamout_enable(rctx, false); } } @@ -192,11 +191,6 @@ static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r r600_flush_vgt_streamout(rctx); - r600_write_context_reg(cs, rctx->chip_class >= EVERGREEN ? - R_028B98_VGT_STRMOUT_BUFFER_CONFIG : - R_028B20_VGT_STRMOUT_BUFFER_EN, - rctx->streamout.enabled_mask); - for (i = 0; i < rctx->streamout.num_targets; i++) { if (!t[i]) continue; @@ -326,20 +320,42 @@ static bool r600_get_strmout_en(struct r600_common_context *rctx) static void r600_emit_streamout_enable(struct r600_common_context *rctx, struct r600_atom *atom) { - r600_write_context_reg(rctx->rings.gfx.cs, - rctx->chip_class >= EVERGREEN ? 
- R_028B94_VGT_STRMOUT_CONFIG : - R_028AB0_VGT_STRMOUT_EN, - S_028B94_STREAMOUT_0_EN(r600_get_strmout_en(rctx))); + unsigned strmout_config_reg = R_028AB0_VGT_STRMOUT_EN; + unsigned strmout_config_val = S_028B94_STREAMOUT_0_EN(r600_get_strmout_en(rctx)); + unsigned strmout_buffer_reg = R_028B20_VGT_STRMOUT_BUFFER_EN; + unsigned strmout_buffer_val = rctx->streamout.hw_enabled_mask & + rctx->streamout.enabled_stream_buffers_mask; + + if (rctx->chip_class >= EVERGREEN) { + strmout_buffer_reg = R_028B98_VGT_STRMOUT_BUFFER_CONFIG; + + strmout_config_reg = R_028B94_VGT_STRMOUT_CONFIG; + strmout_config_val |= + S_028B94_RAST_STREAM(0) | + S_028B94_STREAMOUT_1_EN(r600_get_strmout_en(rctx)) | + S_028B94_STREAMOUT_2_EN(r600_get_strmout_en(rctx)) | + S_028B94_STREAMOUT_3_EN(r600_get_strmout_en(rctx)); + } + r600_write_context_reg(rctx->rings.gfx.cs, strmout_buffer_reg, strmout_buffer_val); + r600_write_context_reg(rctx->rings.gfx.cs, strmout_config_reg, strmout_config_val); } static void r600_set_streamout_enable(struct r600_common_context *rctx, bool enable) { bool old_strmout_en = r600_get_strmout_en(rctx); + unsigned old_hw_enabled_mask = rctx->streamout.hw_enabled_mask; rctx->streamout.streamout_enabled = enable; - if (old_strmout_en != r600_get_strmout_en(rctx)) - rctx->streamout.enable_atom.dirty = true; + + rctx->streamout.hw_enabled_mask = rctx->streamout.enabled_mask | + (rctx->streamout.enabled_mask << 4) | + (rctx->streamout.enabled_mask << 8) | + (rctx->streamout.enabled_mask << 12); + + if ((old_strmout_en != r600_get_strmout_en(rctx)) || + (old_hw_enabled_mask != rctx->streamout.hw_enabled_mask)) { + rctx->set_atom_dirty(rctx, &rctx->streamout.enable_atom, true); + } } void r600_update_prims_generated_query_state(struct r600_common_context *rctx, @@ -354,8 +370,9 @@ void r600_update_prims_generated_query_state(struct r600_common_context *rctx, rctx->streamout.prims_gen_query_enabled = rctx->streamout.num_prims_gen_queries != 0; - if (old_strmout_en != r600_get_strmout_en(rctx)) - rctx->streamout.enable_atom.dirty = true; + if (old_strmout_en != r600_get_strmout_en(rctx)) { + rctx->set_atom_dirty(rctx, &rctx->streamout.enable_atom, true); + } } } @@ -365,5 +382,5 @@ void r600_streamout_init(struct r600_common_context *rctx) rctx->b.stream_output_target_destroy = r600_so_target_destroy; rctx->streamout.begin_atom.emit = r600_emit_streamout_begin; rctx->streamout.enable_atom.emit = r600_emit_streamout_enable; - rctx->streamout.enable_atom.num_dw = 3; + rctx->streamout.enable_atom.num_dw = 6; } diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c index dc510c99749..54696910e43 100644 --- a/src/gallium/drivers/radeon/r600_texture.c +++ b/src/gallium/drivers/radeon/r600_texture.c @@ -243,10 +243,11 @@ static boolean r600_texture_get_handle(struct pipe_screen* screen, RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR, surface->level[0].mode >= RADEON_SURF_MODE_2D ? 
RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR, + surface->pipe_config, surface->bankw, surface->bankh, surface->tile_split, surface->stencil_tile_split, - surface->mtilea, + surface->mtilea, surface->num_banks, surface->level[0].pitch_bytes, (surface->flags & RADEON_SURF_SCANOUT) != 0); @@ -489,7 +490,7 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen, unsigned num_pipes = rscreen->tiling_info.num_channels; if (rscreen->chip_class <= EVERGREEN && - rscreen->info.drm_minor < 26) + rscreen->info.drm_major == 2 && rscreen->info.drm_minor < 26) return 0; /* HW bug on R6xx. */ @@ -501,7 +502,7 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen, /* HTILE is broken with 1D tiling on old kernels and CIK. */ if (rscreen->chip_class >= CIK && rtex->surface.level[0].mode == RADEON_SURF_MODE_1D && - rscreen->info.drm_minor < 38) + rscreen->info.drm_major == 2 && rscreen->info.drm_minor < 38) return 0; switch (num_pipes) { @@ -706,6 +707,7 @@ static unsigned r600_choose_tiling(struct r600_common_screen *rscreen, const struct pipe_resource *templ) { const struct util_format_description *desc = util_format_description(templ->format); + bool force_tiling = templ->flags & R600_RESOURCE_FLAG_FORCE_TILING; /* MSAA resources must be 2D tiled. */ if (templ->nr_samples > 1) @@ -715,10 +717,16 @@ static unsigned r600_choose_tiling(struct r600_common_screen *rscreen, if (templ->flags & R600_RESOURCE_FLAG_TRANSFER) return RADEON_SURF_MODE_LINEAR_ALIGNED; + /* r600g: force tiling on TEXTURE_2D and TEXTURE_3D compute resources. */ + if (rscreen->chip_class >= R600 && rscreen->chip_class <= CAYMAN && + (templ->bind & PIPE_BIND_COMPUTE_RESOURCE) && + (templ->target == PIPE_TEXTURE_2D || + templ->target == PIPE_TEXTURE_3D)) + force_tiling = true; + /* Handle common candidates for the linear mode. * Compressed textures must always be tiled. */ - if (!(templ->flags & R600_RESOURCE_FLAG_FORCE_TILING) && - !util_format_is_compressed(templ->format)) { + if (!force_tiling && !util_format_is_compressed(templ->format)) { /* Not everything can be linear, so we cannot enforce it * for all textures. */ if ((rscreen->debug_flags & DBG_NO_TILING) && @@ -934,7 +942,7 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx, use_staging_texture = TRUE; } else if (!(usage & PIPE_TRANSFER_READ) && (r600_rings_is_buffer_referenced(rctx, rtex->resource.cs_buf, RADEON_USAGE_READWRITE) || - rctx->ws->buffer_is_busy(rtex->resource.buf, RADEON_USAGE_READWRITE))) { + !rctx->ws->buffer_wait(rtex->resource.buf, 0, RADEON_USAGE_READWRITE))) { /* Use a staging texture for uploads if the underlying BO is busy. 
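The buffer_is_busy winsys call is replaced throughout this series by buffer_wait with a zero timeout: a wait that returns immediately without success means the buffer is still busy, which is exactly what the staging-texture fallback needs to know. A toy model of the idiom (the real call queries kernel fence state):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for the winsys entry point: returns true once the buffer is
 * idle, false if it is still busy when the timeout expires. */
static bool example_buffer_wait(bool buffer_idle, uint64_t timeout_ns)
{
	if (buffer_idle)
		return true;
	return timeout_ns != 0; /* toy: pretend any nonzero wait succeeds */
}

int main(void)
{
	bool idle = false;

	/* wait(0) used as a busy probe, as in the hunk above. */
	if (!example_buffer_wait(idle, 0))
		printf("buffer busy: upload through a staging texture\n");
	return 0;
}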
*/ use_staging_texture = TRUE; } @@ -1059,18 +1067,9 @@ static void r600_texture_transfer_unmap(struct pipe_context *ctx, struct pipe_transfer* transfer) { struct r600_transfer *rtransfer = (struct r600_transfer*)transfer; - struct r600_common_context *rctx = (struct r600_common_context*)ctx; - struct radeon_winsys_cs_handle *buf; struct pipe_resource *texture = transfer->resource; struct r600_texture *rtex = (struct r600_texture*)texture; - if (rtransfer->staging) { - buf = rtransfer->staging->cs_buf; - } else { - buf = r600_resource(transfer->resource)->cs_buf; - } - rctx->ws->buffer_unmap(buf); - if ((transfer->usage & PIPE_TRANSFER_WRITE) && rtransfer->staging) { if (rtex->is_depth && rtex->resource.b.b.nr_samples <= 1) { ctx->resource_copy_region(ctx, texture, transfer->level, @@ -1262,7 +1261,9 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx, /* fast color clear with 1D tiling doesn't work on old kernels and CIK */ if (tex->surface.level[0].mode == RADEON_SURF_MODE_1D && - rctx->chip_class >= CIK && rctx->screen->info.drm_minor < 38) { + rctx->chip_class >= CIK && + rctx->screen->info.drm_major == 2 && + rctx->screen->info.drm_minor < 38) { continue; } @@ -1278,7 +1279,7 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx, tex->cmask.offset, tex->cmask.size, 0, true); tex->dirty_level_mask |= 1 << fb->cbufs[i]->u.tex.level; - fb_state->dirty = true; + rctx->set_atom_dirty(rctx, fb_state, true); *buffers &= ~clear_bit; } } diff --git a/src/gallium/drivers/radeon/r600d_common.h b/src/gallium/drivers/radeon/r600d_common.h index 74c8d8782a6..115042d153e 100644 --- a/src/gallium/drivers/radeon/r600d_common.h +++ b/src/gallium/drivers/radeon/r600d_common.h @@ -66,6 +66,9 @@ #define PKT3_SET_SH_REG 0x76 /* SI and later */ #define PKT3_SET_UCONFIG_REG 0x79 /* CIK and later */ +#define EVENT_TYPE_SAMPLE_STREAMOUTSTATS1 0x1 /* EG and later */ +#define EVENT_TYPE_SAMPLE_STREAMOUTSTATS2 0x2 /* EG and later */ +#define EVENT_TYPE_SAMPLE_STREAMOUTSTATS3 0x3 /* EG and later */ #define EVENT_TYPE_PS_PARTIAL_FLUSH 0x10 #define EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT 0x14 #define EVENT_TYPE_ZPASS_DONE 0x15 @@ -177,7 +180,7 @@ #define S_028804_INTERPOLATE_SRC_Z(x) (((x) & 0x1) << 19) #define S_028804_STATIC_ANCHOR_ASSOCIATIONS(x) (((x) & 0x1) << 20) #define S_028804_ALPHA_TO_MASK_EQAA_DISABLE(x) (((x) & 0x1) << 21) -#define S_028804_OVERRASTERIZATION_AMOUNT(x) (((x) & 0x7) << 24) +#define S_028804_OVERRASTERIZATION_AMOUNT(x) (((x) & 0x07) << 24) #define S_028804_ENABLE_POSTZ_OVERRASTERIZATION(x) (((x) & 0x1) << 27) #define CM_R_028BDC_PA_SC_LINE_CNTL 0x28bdc #define S_028BDC_EXPAND_LINE_WIDTH(x) (((x) & 0x1) << 9) diff --git a/src/gallium/drivers/radeon/radeon_elf_util.c b/src/gallium/drivers/radeon/radeon_elf_util.c index 9b508227fd4..2e45d439e7a 100644 --- a/src/gallium/drivers/radeon/radeon_elf_util.c +++ b/src/gallium/drivers/radeon/radeon_elf_util.c @@ -103,8 +103,7 @@ static void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols, } void radeon_elf_read(const char *elf_data, unsigned elf_size, - struct radeon_shader_binary *binary, - unsigned debug) + struct radeon_shader_binary *binary) { char *elf_buffer; Elf *elf; @@ -124,7 +123,6 @@ void radeon_elf_read(const char *elf_data, unsigned elf_size, elf = elf_memory(elf_buffer, elf_size); elf_getshdrstrndx(elf, §ion_str_index); - binary->disassembled = 0; while ((section = elf_nextscn(elf, section))) { const char *name; @@ -145,12 +143,11 @@ void radeon_elf_read(const char *elf_data, unsigned 
elf_size, binary->config_size = section_data->d_size; binary->config = MALLOC(binary->config_size * sizeof(unsigned char)); memcpy(binary->config, section_data->d_buf, binary->config_size); - } else if (debug && !strcmp(name, ".AMDGPU.disasm")) { - binary->disassembled = 1; + } else if (!strcmp(name, ".AMDGPU.disasm")) { + /* Always read disassembly if it's available. */ section_data = elf_getdata(section, section_data); - fprintf(stderr, "\nShader Disassembly:\n\n"); - fprintf(stderr, "%.*s\n", (int)section_data->d_size, - (char *)section_data->d_buf); + binary->disasm_string = strndup(section_data->d_buf, + section_data->d_size); } else if (!strncmp(name, ".rodata", 7)) { section_data = elf_getdata(section, section_data); binary->rodata_size = section_data->d_size; diff --git a/src/gallium/drivers/radeon/radeon_elf_util.h b/src/gallium/drivers/radeon/radeon_elf_util.h index ab83f98ea69..ea4ab2f14b2 100644 --- a/src/gallium/drivers/radeon/radeon_elf_util.h +++ b/src/gallium/drivers/radeon/radeon_elf_util.h @@ -37,7 +37,7 @@ struct radeon_shader_reloc; * radeon_shader_binary object. */ void radeon_elf_read(const char *elf_data, unsigned elf_size, - struct radeon_shader_binary *binary, unsigned debug); + struct radeon_shader_binary *binary); /** * @returns A pointer to the start of the configuration information for diff --git a/src/gallium/drivers/radeon/radeon_llvm.h b/src/gallium/drivers/radeon/radeon_llvm.h index 6a9557b0b73..e967ad2214e 100644 --- a/src/gallium/drivers/radeon/radeon_llvm.h +++ b/src/gallium/drivers/radeon/radeon_llvm.h @@ -58,7 +58,6 @@ struct radeon_llvm_context { unsigned type; unsigned face_gpr; unsigned two_side; - unsigned clip_vertex; unsigned inputs_count; struct r600_shader_io * r600_inputs; struct r600_shader_io * r600_outputs; @@ -72,21 +71,6 @@ struct radeon_llvm_context { /*=== Front end configuration ===*/ - /* Special Intrinsics */ - - /** Write to an output register: float store_output(float, i32) */ - const char * store_output_intr; - - /** Swizzle a vector value: <4 x float> swizzle(<4 x float>, i32) - * The swizzle is an unsigned integer that encodes a TGSI_SWIZZLE_* value - * in 2-bits. - * Swizzle{0-1} = X Channel - * Swizzle{2-3} = Y Channel - * Swizzle{4-5} = Z Channel - * Swizzle{6-7} = W Channel - */ - const char * swizzle_intr; - /* Instructions that are not described by any of the TGSI opcodes. */ /** This function is responsible for initilizing the inputs array and will be @@ -100,9 +84,6 @@ struct radeon_llvm_context { unsigned index, const struct tgsi_full_declaration *decl); - /** User data to use with the callbacks */ - void * userdata; - /** This array contains the input values for the shader. Typically these * values will be in the form of a target intrinsic that will inform the * backend how to load the actual inputs to the shader. 
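Rather than printing the .AMDGPU.disasm section to stderr while parsing the ELF, the payload is now kept in binary->disasm_string so the caller decides when, and whether, to show it. ELF section data is not NUL-terminated, which is why strndup() is the right copy primitive; a standalone sketch of that step:

#define _GNU_SOURCE /* strndup() is POSIX.1-2008 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	/* Stand-in for a section payload: raw bytes, no terminator. */
	const char section_buf[] = { 's', '_', 'e', 'n', 'd', 'p', 'g', 'm' };
	size_t section_size = sizeof(section_buf);

	/* strndup() copies at most section_size bytes and appends the
	 * NUL, so the disassembly can be handled as an ordinary string. */
	char *disasm_string = strndup(section_buf, section_size);
	if (disasm_string) {
		printf("%s\n", disasm_string);
		free(disasm_string);
	}
	return 0;
}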
@@ -146,6 +127,8 @@ static inline LLVMTypeRef tgsi2llvmtype( case TGSI_TYPE_UNSIGNED: case TGSI_TYPE_SIGNED: return LLVMInt32TypeInContext(ctx); + case TGSI_TYPE_DOUBLE: + return LLVMDoubleTypeInContext(ctx); case TGSI_TYPE_UNTYPED: case TGSI_TYPE_FLOAT: return LLVMFloatTypeInContext(ctx); @@ -171,8 +154,9 @@ static inline LLVMValueRef bitcast( void radeon_llvm_emit_prepare_cube_coords(struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data, - LLVMValueRef *coords_arg); + struct lp_build_emit_data * emit_data, + LLVMValueRef *coords_arg, + LLVMValueRef *derivs_arg); void radeon_llvm_context_init(struct radeon_llvm_context * ctx); @@ -191,20 +175,29 @@ unsigned radeon_llvm_reg_index_soa(unsigned index, unsigned chan); void radeon_llvm_finalize_module(struct radeon_llvm_context * ctx); -LLVMValueRef -build_intrinsic(LLVMBuilderRef builder, - const char *name, - LLVMTypeRef ret_type, - LLVMValueRef *args, - unsigned num_args, - LLVMAttribute attr); - void build_tgsi_intrinsic_nomem( const struct lp_build_tgsi_action * action, struct lp_build_tgsi_context * bld_base, struct lp_build_emit_data * emit_data); - +LLVMValueRef +radeon_llvm_emit_fetch_double(struct lp_build_tgsi_context *bld_base, + LLVMValueRef ptr, + LLVMValueRef ptr2); + +LLVMValueRef radeon_llvm_saturate(struct lp_build_tgsi_context *bld_base, + LLVMValueRef value); + +LLVMValueRef radeon_llvm_emit_fetch(struct lp_build_tgsi_context *bld_base, + const struct tgsi_full_src_register *reg, + enum tgsi_opcode_type type, + unsigned swizzle); + +void radeon_llvm_emit_store( + struct lp_build_tgsi_context * bld_base, + const struct tgsi_full_instruction * inst, + const struct tgsi_opcode_info * info, + LLVMValueRef dst[4]); #endif /* RADEON_LLVM_H */ diff --git a/src/gallium/drivers/radeon/radeon_llvm_emit.c b/src/gallium/drivers/radeon/radeon_llvm_emit.c index 25580b6bd4c..00025590137 100644 --- a/src/gallium/drivers/radeon/radeon_llvm_emit.c +++ b/src/gallium/drivers/radeon/radeon_llvm_emit.c @@ -62,6 +62,8 @@ void radeon_llvm_shader_type(LLVMValueRef F, unsigned type) switch (type) { case TGSI_PROCESSOR_VERTEX: + case TGSI_PROCESSOR_TESS_CTRL: + case TGSI_PROCESSOR_TESS_EVAL: llvm_type = RADEON_LLVM_SHADER_VS; break; case TGSI_PROCESSOR_GEOMETRY: @@ -142,7 +144,8 @@ static void radeonDiagnosticHandler(LLVMDiagnosticInfoRef di, void *context) * @returns 0 for success, 1 for failure */ unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binary, - const char *gpu_family, unsigned dump, LLVMTargetMachineRef tm) + const char *gpu_family, bool dump_ir, bool dump_asm, + LLVMTargetMachineRef tm) { char cpu[CPU_STRING_LEN]; @@ -165,17 +168,15 @@ unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binar } strncpy(cpu, gpu_family, CPU_STRING_LEN); memset(fs, 0, sizeof(fs)); - if (dump) { + if (dump_asm) strncpy(fs, "+DumpCode", FS_STRING_LEN); - } tm = LLVMCreateTargetMachine(target, triple, cpu, fs, LLVMCodeGenLevelDefault, LLVMRelocDefault, LLVMCodeModelDefault); dispose_tm = true; } - if (dump) { + if (dump_ir) LLVMDumpModule(M); - } /* Setup Diagnostic Handler*/ llvm_ctx = LLVMGetModuleContext(M); @@ -204,7 +205,7 @@ unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binar buffer_size = LLVMGetBufferSize(out_buffer); buffer_data = LLVMGetBufferStart(out_buffer); - radeon_elf_read(buffer_data, buffer_size, binary, dump); + radeon_elf_read(buffer_data, buffer_size, binary); /* Clean up */ LLVMDisposeMemoryBuffer(out_buffer); diff --git 
a/src/gallium/drivers/radeon/radeon_llvm_emit.h b/src/gallium/drivers/radeon/radeon_llvm_emit.h index 3ccef78e36d..e20aed94c6b 100644 --- a/src/gallium/drivers/radeon/radeon_llvm_emit.h +++ b/src/gallium/drivers/radeon/radeon_llvm_emit.h @@ -29,6 +29,7 @@ #include <llvm-c/Core.h> #include <llvm-c/TargetMachine.h> +#include <stdbool.h> struct radeon_shader_binary; @@ -36,11 +37,8 @@ void radeon_llvm_shader_type(LLVMValueRef F, unsigned type); LLVMTargetRef radeon_llvm_get_r600_target(const char *triple); -unsigned radeon_llvm_compile( - LLVMModuleRef M, - struct radeon_shader_binary *binary, - const char * gpu_family, - unsigned dump, - LLVMTargetMachineRef tm); +unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binary, + const char *gpu_family, bool dump_ir, bool dump_asm, + LLVMTargetMachineRef tm); #endif /* RADEON_LLVM_EMIT_H */ diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c index c8c980d9d32..56694700a47 100644 --- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c +++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c @@ -109,12 +109,27 @@ emit_array_index( return LLVMBuildAdd(gallivm->builder, addr, lp_build_const_int32(gallivm, offset), ""); } -static LLVMValueRef -emit_fetch( +LLVMValueRef +radeon_llvm_emit_fetch_double( struct lp_build_tgsi_context *bld_base, - const struct tgsi_full_src_register *reg, - enum tgsi_opcode_type type, - unsigned swizzle); + LLVMValueRef ptr, + LLVMValueRef ptr2) +{ + LLVMBuilderRef builder = bld_base->base.gallivm->builder; + LLVMValueRef result; + + result = LLVMGetUndef(LLVMVectorType(LLVMIntTypeInContext(bld_base->base.gallivm->context, 32), bld_base->base.type.length * 2)); + + result = LLVMBuildInsertElement(builder, + result, + bitcast(bld_base, TGSI_TYPE_UNSIGNED, ptr), + bld_base->int_bld.zero, ""); + result = LLVMBuildInsertElement(builder, + result, + bitcast(bld_base, TGSI_TYPE_UNSIGNED, ptr2), + bld_base->int_bld.one, ""); + return bitcast(bld_base, TGSI_TYPE_DOUBLE, result); +} static LLVMValueRef emit_array_fetch( @@ -136,7 +151,7 @@ emit_array_fetch( for (i = 0; i < size; ++i) { tmp_reg.Register.Index = i + range.First; - LLVMValueRef temp = emit_fetch(bld_base, &tmp_reg, type, swizzle); + LLVMValueRef temp = radeon_llvm_emit_fetch(bld_base, &tmp_reg, type, swizzle); result = LLVMBuildInsertElement(builder, result, temp, lp_build_const_int32(gallivm, i), ""); } @@ -150,23 +165,21 @@ static bool uses_temp_indirect_addressing( return (bld->indirect_files & (1 << TGSI_FILE_TEMPORARY)); } -static LLVMValueRef -emit_fetch( - struct lp_build_tgsi_context *bld_base, - const struct tgsi_full_src_register *reg, - enum tgsi_opcode_type type, - unsigned swizzle) +LLVMValueRef radeon_llvm_emit_fetch(struct lp_build_tgsi_context *bld_base, + const struct tgsi_full_src_register *reg, + enum tgsi_opcode_type type, + unsigned swizzle) { struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base); struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base); LLVMBuilderRef builder = bld_base->base.gallivm->builder; - LLVMValueRef result = NULL, ptr; + LLVMValueRef result = NULL, ptr, ptr2; if (swizzle == ~0) { LLVMValueRef values[TGSI_NUM_CHANNELS]; unsigned chan; for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { - values[chan] = emit_fetch(bld_base, reg, type, chan); + values[chan] = radeon_llvm_emit_fetch(bld_base, reg, type, chan); } return lp_build_gather_values(bld_base->base.gallivm, values, TGSI_NUM_CHANNELS); @@ -184,11 +197,27 @@ 
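radeon_llvm_emit_fetch_double below reassembles a 64-bit double from two 32-bit register channels by inserting them into an i32 vector and bitcasting the pair. The same reinterpretation written on the CPU for illustration, assuming the low channel occupies the low 32 bits (matching the element order used in the IR):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static double example_pack_double(uint32_t lo, uint32_t hi)
{
	uint64_t bits = ((uint64_t)hi << 32) | lo;
	double d;

	memcpy(&d, &bits, sizeof(d)); /* a bitcast, not a value conversion */
	return d;
}

int main(void)
{
	/* 0x3FF0000000000000 is 1.0 in IEEE-754 binary64. */
	printf("%f\n", example_pack_double(0x00000000u, 0x3FF00000u));
	return 0;
}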
emit_fetch( switch(reg->Register.File) { case TGSI_FILE_IMMEDIATE: { LLVMTypeRef ctype = tgsi2llvmtype(bld_base, type); - return LLVMConstBitCast(bld->immediates[reg->Register.Index][swizzle], ctype); + if (type == TGSI_TYPE_DOUBLE) { + result = LLVMGetUndef(LLVMVectorType(LLVMIntTypeInContext(bld_base->base.gallivm->context, 32), bld_base->base.type.length * 2)); + result = LLVMConstInsertElement(result, + bld->immediates[reg->Register.Index][swizzle], + bld_base->int_bld.zero); + result = LLVMConstInsertElement(result, + bld->immediates[reg->Register.Index][swizzle + 1], + bld_base->int_bld.one); + return LLVMConstBitCast(result, ctype); + } else { + return LLVMConstBitCast(bld->immediates[reg->Register.Index][swizzle], ctype); + } } case TGSI_FILE_INPUT: result = ctx->inputs[radeon_llvm_reg_index_soa(reg->Register.Index, swizzle)]; + if (type == TGSI_TYPE_DOUBLE) { + ptr = result; + ptr2 = ctx->inputs[radeon_llvm_reg_index_soa(reg->Register.Index, swizzle + 1)]; + return radeon_llvm_emit_fetch_double(bld_base, ptr, ptr2); + } break; case TGSI_FILE_TEMPORARY: @@ -199,11 +228,23 @@ emit_fetch( break; } ptr = ctx->temps[reg->Register.Index * TGSI_NUM_CHANNELS + swizzle]; + if (type == TGSI_TYPE_DOUBLE) { + ptr2 = ctx->temps[reg->Register.Index * TGSI_NUM_CHANNELS + swizzle + 1]; + return radeon_llvm_emit_fetch_double(bld_base, + LLVMBuildLoad(builder, ptr, ""), + LLVMBuildLoad(builder, ptr2, "")); + } result = LLVMBuildLoad(builder, ptr, ""); break; case TGSI_FILE_OUTPUT: ptr = lp_get_output_ptr(bld, reg->Register.Index, swizzle); + if (type == TGSI_TYPE_DOUBLE) { + ptr2 = lp_get_output_ptr(bld, reg->Register.Index, swizzle + 1); + return radeon_llvm_emit_fetch_double(bld_base, + LLVMBuildLoad(builder, ptr, ""), + LLVMBuildLoad(builder, ptr2, "")); + } result = LLVMBuildLoad(builder, ptr, ""); break; @@ -321,8 +362,8 @@ static void emit_declaration( } } -static LLVMValueRef radeon_llvm_saturate(struct lp_build_tgsi_context *bld_base, - LLVMValueRef value) +LLVMValueRef radeon_llvm_saturate(struct lp_build_tgsi_context *bld_base, + LLVMValueRef value) { struct lp_build_emit_data clamp_emit_data; @@ -336,8 +377,7 @@ static LLVMValueRef radeon_llvm_saturate(struct lp_build_tgsi_context *bld_base, &clamp_emit_data); } -static void -emit_store( +void radeon_llvm_emit_store( struct lp_build_tgsi_context * bld_base, const struct tgsi_full_instruction * inst, const struct tgsi_opcode_info * info, @@ -348,9 +388,10 @@ emit_store( struct gallivm_state *gallivm = bld->bld_base.base.gallivm; const struct tgsi_full_dst_register *reg = &inst->Dst[0]; LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder; - LLVMValueRef temp_ptr; + LLVMValueRef temp_ptr, temp_ptr2 = NULL; unsigned chan, chan_index; boolean is_vec_store = FALSE; + enum tgsi_opcode_type dtype = tgsi_opcode_infer_dst_type(inst->Instruction.Opcode); if (dst[0]) { LLVMTypeKind k = LLVMGetTypeKind(LLVMTypeOf(dst[0])); @@ -371,6 +412,8 @@ emit_store( TGSI_FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) { LLVMValueRef value = dst[chan_index]; + if (dtype == TGSI_TYPE_DOUBLE && (chan_index == 1 || chan_index == 3)) + continue; if (inst->Instruction.Saturate) value = radeon_llvm_saturate(bld_base, value); @@ -379,8 +422,9 @@ emit_store( LLVMBuildStore(builder, value, temp_ptr); continue; } - - value = bitcast(bld_base, TGSI_TYPE_FLOAT, value); + + if (dtype != TGSI_TYPE_DOUBLE) + value = bitcast(bld_base, TGSI_TYPE_FLOAT, value); if (reg->Register.Indirect) { struct tgsi_declaration_range range = get_array_range(bld_base, @@ -418,6 +462,8 
@@ emit_store( switch(reg->Register.File) { case TGSI_FILE_OUTPUT: temp_ptr = bld->outputs[reg->Register.Index][chan_index]; + if (dtype == TGSI_TYPE_DOUBLE) + temp_ptr2 = bld->outputs[reg->Register.Index][chan_index + 1]; break; case TGSI_FILE_TEMPORARY: @@ -428,12 +474,28 @@ emit_store( break; } temp_ptr = ctx->temps[ TGSI_NUM_CHANNELS * reg->Register.Index + chan_index]; + if (dtype == TGSI_TYPE_DOUBLE) + temp_ptr2 = ctx->temps[ TGSI_NUM_CHANNELS * reg->Register.Index + chan_index + 1]; + break; default: return; } - LLVMBuildStore(builder, value, temp_ptr); + if (dtype != TGSI_TYPE_DOUBLE) + LLVMBuildStore(builder, value, temp_ptr); + else { + LLVMValueRef ptr = LLVMBuildBitCast(builder, value, + LLVMVectorType(LLVMIntTypeInContext(bld_base->base.gallivm->context, 32), 2), ""); + LLVMValueRef val2; + value = LLVMBuildExtractElement(builder, ptr, + bld_base->uint_bld.zero, ""); + val2 = LLVMBuildExtractElement(builder, ptr, + bld_base->uint_bld.one, ""); + + LLVMBuildStore(builder, bitcast(bld_base, TGSI_TYPE_FLOAT, value), temp_ptr); + LLVMBuildStore(builder, bitcast(bld_base, TGSI_TYPE_FLOAT, val2), temp_ptr2); + } } } } @@ -686,34 +748,26 @@ static void kil_emit( } } -void radeon_llvm_emit_prepare_cube_coords( - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data, - LLVMValueRef *coords_arg) +static void radeon_llvm_cube_to_2d_coords(struct lp_build_tgsi_context *bld_base, + LLVMValueRef *in, LLVMValueRef *out) { - - unsigned target = emit_data->inst->Texture.Texture; - unsigned opcode = emit_data->inst->Instruction.Opcode; struct gallivm_state * gallivm = bld_base->base.gallivm; LLVMBuilderRef builder = gallivm->builder; LLVMTypeRef type = bld_base->base.elem_type; LLVMValueRef coords[4]; LLVMValueRef mad_args[3]; - LLVMValueRef idx; - struct LLVMOpaqueValue *cube_vec; - LLVMValueRef v; + LLVMValueRef v, cube_vec; unsigned i; - cube_vec = lp_build_gather_values(bld_base->base.gallivm, coords_arg, 4); - v = build_intrinsic(builder, "llvm.AMDGPU.cube", LLVMVectorType(type, 4), + cube_vec = lp_build_gather_values(bld_base->base.gallivm, in, 4); + v = lp_build_intrinsic(builder, "llvm.AMDGPU.cube", LLVMVectorType(type, 4), &cube_vec, 1, LLVMReadNoneAttribute); - for (i = 0; i < 4; ++i) { - idx = lp_build_const_int32(gallivm, i); - coords[i] = LLVMBuildExtractElement(builder, v, idx, ""); - } + for (i = 0; i < 4; ++i) + coords[i] = LLVMBuildExtractElement(builder, v, + lp_build_const_int32(gallivm, i), ""); - coords[2] = build_intrinsic(builder, "fabs", + coords[2] = lp_build_intrinsic(builder, "llvm.fabs.f32", type, &coords[2], 1, LLVMReadNoneAttribute); coords[2] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_RCP, coords[2]); @@ -729,10 +783,60 @@ void radeon_llvm_emit_prepare_cube_coords( mad_args[0], mad_args[1], mad_args[2]); /* apply xyz = yxw swizzle to coords */ - coords[2] = coords[3]; - coords[3] = coords[1]; - coords[1] = coords[0]; - coords[0] = coords[3]; + out[0] = coords[1]; + out[1] = coords[0]; + out[2] = coords[3]; +} + +void radeon_llvm_emit_prepare_cube_coords( + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data, + LLVMValueRef *coords_arg, + LLVMValueRef *derivs_arg) +{ + + unsigned target = emit_data->inst->Texture.Texture; + unsigned opcode = emit_data->inst->Instruction.Opcode; + struct gallivm_state * gallivm = bld_base->base.gallivm; + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef coords[4]; + unsigned i; + + radeon_llvm_cube_to_2d_coords(bld_base, coords_arg, coords); + + if (opcode ==
TGSI_OPCODE_TXD && derivs_arg) { + LLVMValueRef derivs[4]; + int axis; + + /* Convert cube derivatives to 2D derivatives. */ + for (axis = 0; axis < 2; axis++) { + LLVMValueRef shifted_cube_coords[4], shifted_coords[4]; + + /* Shift the cube coordinates by the derivatives to get + * the cube coordinates of the "neighboring pixel". + */ + for (i = 0; i < 3; i++) + shifted_cube_coords[i] = + LLVMBuildFAdd(builder, coords_arg[i], + derivs_arg[axis*3+i], ""); + shifted_cube_coords[3] = LLVMGetUndef(bld_base->base.elem_type); + + /* Project the shifted cube coordinates onto the face. */ + radeon_llvm_cube_to_2d_coords(bld_base, shifted_cube_coords, + shifted_coords); + + /* Subtract both sets of 2D coordinates to get 2D derivatives. + * This won't work if the shifted coordinates ended up + * in a different face. + */ + for (i = 0; i < 2; i++) + derivs[axis * 2 + i] = + LLVMBuildFSub(builder, shifted_coords[i], + coords[i], ""); + } + + memcpy(derivs_arg, derivs, sizeof(derivs)); + } if (target == TGSI_TEXTURE_CUBE_ARRAY || target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) { @@ -756,140 +860,6 @@ void radeon_llvm_emit_prepare_cube_coords( memcpy(coords_arg, coords, sizeof(coords)); } -static void txd_fetch_args( - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data) -{ - const struct tgsi_full_instruction * inst = emit_data->inst; - - LLVMValueRef coords[4]; - unsigned chan, src; - for (src = 0; src < 3; src++) { - for (chan = 0; chan < 4; chan++) - coords[chan] = lp_build_emit_fetch(bld_base, inst, src, chan); - - emit_data->args[src] = lp_build_gather_values(bld_base->base.gallivm, - coords, 4); - } - emit_data->arg_count = 3; - emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4); -} - - -static void txp_fetch_args( - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data) -{ - const struct tgsi_full_instruction * inst = emit_data->inst; - LLVMValueRef src_w; - unsigned chan; - LLVMValueRef coords[5]; - - emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4); - src_w = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W); - - for (chan = 0; chan < 3; chan++ ) { - LLVMValueRef arg = lp_build_emit_fetch(bld_base, - emit_data->inst, 0, chan); - coords[chan] = lp_build_emit_llvm_binary(bld_base, - TGSI_OPCODE_DIV, arg, src_w); - } - coords[3] = bld_base->base.one; - - if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE || - inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || - inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE || - inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) && - inst->Instruction.Opcode != TGSI_OPCODE_TXQ && - inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) { - radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords); - } - - emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm, - coords, 4); - emit_data->arg_count = 1; -} - -static void tex_fetch_args( - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data) -{ - /* XXX: lp_build_swizzle_aos() was failing with wrong arg types, - * when we used CHAN_ALL. 
We should be able to get this to work, - * but for now we will swizzle it ourselves - emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst, - 0, CHAN_ALL); - - */ - - const struct tgsi_full_instruction * inst = emit_data->inst; - - LLVMValueRef coords[5]; - unsigned chan; - for (chan = 0; chan < 4; chan++) { - coords[chan] = lp_build_emit_fetch(bld_base, inst, 0, chan); - } - - if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 || - inst->Instruction.Opcode == TGSI_OPCODE_TXB2 || - inst->Instruction.Opcode == TGSI_OPCODE_TXL2) { - /* These instructions have additional operand that should be packed - * into the cube coord vector by radeon_llvm_emit_prepare_cube_coords. - * That operand should be passed as a float value in the args array - * right after the coord vector. After packing it's not used anymore, - * that's why arg_count is not increased */ - coords[4] = lp_build_emit_fetch(bld_base, inst, 1, 0); - } - - if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE || - inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || - inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE || - inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) && - inst->Instruction.Opcode != TGSI_OPCODE_TXQ && - inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) { - radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords); - } - - emit_data->arg_count = 1; - emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm, - coords, 4); - emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4); -} - -static void txf_fetch_args( - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data) -{ - const struct tgsi_full_instruction * inst = emit_data->inst; - struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base); - const struct tgsi_texture_offset * off = inst->TexOffsets; - LLVMTypeRef offset_type = bld_base->int_bld.elem_type; - - /* fetch tex coords */ - tex_fetch_args(bld_base, emit_data); - - /* fetch tex offsets */ - if (inst->Texture.NumOffsets) { - assert(inst->Texture.NumOffsets == 1); - - emit_data->args[1] = LLVMConstBitCast( - bld->immediates[off->Index][off->SwizzleX], - offset_type); - emit_data->args[2] = LLVMConstBitCast( - bld->immediates[off->Index][off->SwizzleY], - offset_type); - emit_data->args[3] = LLVMConstBitCast( - bld->immediates[off->Index][off->SwizzleZ], - offset_type); - } else { - emit_data->args[1] = bld_base->int_bld.zero; - emit_data->args[2] = bld_base->int_bld.zero; - emit_data->args[3] = bld_base->int_bld.zero; - } - - emit_data->arg_count = 4; -} - static void emit_icmp( const struct lp_build_tgsi_action * action, struct lp_build_tgsi_context * bld_base, @@ -996,6 +966,35 @@ static void emit_fcmp( emit_data->output[emit_data->chan] = v; } +static void emit_dcmp( + const struct lp_build_tgsi_action *action, + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + LLVMBuilderRef builder = bld_base->base.gallivm->builder; + LLVMContextRef context = bld_base->base.gallivm->context; + LLVMRealPredicate pred; + + /* Use ordered for everything but NE (which is usual for + * float comparisons) + */ + switch (emit_data->inst->Instruction.Opcode) { + case TGSI_OPCODE_DSEQ: pred = LLVMRealOEQ; break; + case TGSI_OPCODE_DSGE: pred = LLVMRealOGE; break; + case TGSI_OPCODE_DSLT: pred = LLVMRealOLT; break; + case TGSI_OPCODE_DSNE: pred = LLVMRealUNE; break; + default: assert(!"unknown instruction"); pred = 0; break; + } + + LLVMValueRef v = LLVMBuildFCmp(builder, pred, + emit_data->args[0], 
emit_data->args[1],""); + + v = LLVMBuildSExtOrBitCast(builder, v, + LLVMInt32TypeInContext(context), ""); + + emit_data->output[emit_data->chan] = v; +} + static void emit_not( const struct lp_build_tgsi_action * action, struct lp_build_tgsi_context * bld_base, @@ -1161,6 +1160,40 @@ static void emit_ineg( emit_data->args[0], ""); } +static void emit_dneg( + const struct lp_build_tgsi_action * action, + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + LLVMBuilderRef builder = bld_base->base.gallivm->builder; + emit_data->output[emit_data->chan] = LLVMBuildFNeg(builder, + emit_data->args[0], ""); +} + +static void emit_frac( + const struct lp_build_tgsi_action * action, + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + LLVMBuilderRef builder = bld_base->base.gallivm->builder; + char *intr; + + if (emit_data->info->opcode == TGSI_OPCODE_FRC) + intr = "llvm.floor.f32"; + else if (emit_data->info->opcode == TGSI_OPCODE_DFRAC) + intr = "llvm.floor.f64"; + else { + assert(0); + return; + } + + LLVMValueRef floor = lp_build_intrinsic(builder, intr, emit_data->dst_type, + &emit_data->args[0], 1, + LLVMReadNoneAttribute); + emit_data->output[emit_data->chan] = LLVMBuildFSub(builder, + emit_data->args[0], floor, ""); +} + static void emit_f2i( const struct lp_build_tgsi_action * action, struct lp_build_tgsi_context * bld_base, @@ -1215,58 +1248,16 @@ static void emit_immediate(struct lp_build_tgsi_context * bld_base, ctx->soa.num_immediates++; } -LLVMValueRef -build_intrinsic(LLVMBuilderRef builder, - const char *name, - LLVMTypeRef ret_type, - LLVMValueRef *args, - unsigned num_args, - LLVMAttribute attr) -{ - LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(builder))); - LLVMValueRef function; - - function = LLVMGetNamedFunction(module, name); - if(!function) { - LLVMTypeRef arg_types[LP_MAX_FUNC_ARGS]; - unsigned i; - - assert(num_args <= LP_MAX_FUNC_ARGS); - - for(i = 0; i < num_args; ++i) { - assert(args[i]); - arg_types[i] = LLVMTypeOf(args[i]); - } - - function = lp_declare_intrinsic(module, name, ret_type, arg_types, num_args); - - if (attr) - LLVMAddFunctionAttr(function, attr); - } - - return LLVMBuildCall(builder, function, args, num_args, ""); -} - -static void build_tgsi_intrinsic( - const struct lp_build_tgsi_action * action, - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data, - LLVMAttribute attr) -{ - struct lp_build_context * base = &bld_base->base; - emit_data->output[emit_data->chan] = build_intrinsic( - base->gallivm->builder, action->intr_name, - emit_data->dst_type, emit_data->args, - emit_data->arg_count, attr); -} - void -build_tgsi_intrinsic_nomem( - const struct lp_build_tgsi_action * action, - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data) +build_tgsi_intrinsic_nomem(const struct lp_build_tgsi_action *action, + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) { - build_tgsi_intrinsic(action, bld_base, emit_data, LLVMReadNoneAttribute); + struct lp_build_context * base = &bld_base->base; + emit_data->output[emit_data->chan] = + lp_build_intrinsic(base->gallivm->builder, action->intr_name, + emit_data->dst_type, emit_data->args, + emit_data->arg_count, LLVMReadNoneAttribute); } static void emit_bfi(const struct lp_build_tgsi_action * action, @@ -1322,7 +1313,7 @@ static void emit_lsb(const struct lp_build_tgsi_action * action, }; emit_data->output[emit_data->chan] = 
- build_intrinsic(gallivm->builder, "llvm.cttz.i32", + lp_build_intrinsic(gallivm->builder, "llvm.cttz.i32", emit_data->dst_type, args, Elements(args), LLVMReadNoneAttribute); } @@ -1341,7 +1332,7 @@ static void emit_umsb(const struct lp_build_tgsi_action * action, }; LLVMValueRef msb = - build_intrinsic(builder, "llvm.ctlz.i32", + lp_build_intrinsic(builder, "llvm.ctlz.i32", emit_data->dst_type, args, Elements(args), LLVMReadNoneAttribute); @@ -1368,7 +1359,7 @@ static void emit_imsb(const struct lp_build_tgsi_action * action, LLVMValueRef arg = emit_data->args[0]; LLVMValueRef msb = - build_intrinsic(builder, "llvm.AMDGPU.flbit.i32", + lp_build_intrinsic(builder, "llvm.AMDGPU.flbit.i32", emit_data->dst_type, &arg, 1, LLVMReadNoneAttribute); @@ -1407,12 +1398,8 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx) ctx->gallivm.context); ctx->gallivm.builder = LLVMCreateBuilderInContext(ctx->gallivm.context); - ctx->store_output_intr = "llvm.AMDGPU.store.output."; - ctx->swizzle_intr = "llvm.AMDGPU.swizzle"; struct lp_build_tgsi_context * bld_base = &ctx->soa.bld_base; - /* XXX: We need to revisit this.I think the correct way to do this is - * to use length = 4 here and use the elem_bld for everything. */ type.floating = TRUE; type.fixed = FALSE; type.sign = TRUE; @@ -1423,28 +1410,32 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx) lp_build_context_init(&bld_base->base, &ctx->gallivm, type); lp_build_context_init(&ctx->soa.bld_base.uint_bld, &ctx->gallivm, lp_uint_type(type)); lp_build_context_init(&ctx->soa.bld_base.int_bld, &ctx->gallivm, lp_int_type(type)); + { + struct lp_type dbl_type; + dbl_type = type; + dbl_type.width *= 2; + lp_build_context_init(&ctx->soa.bld_base.dbl_bld, &ctx->gallivm, dbl_type); + } bld_base->soa = 1; - bld_base->emit_store = emit_store; + bld_base->emit_store = radeon_llvm_emit_store; bld_base->emit_swizzle = emit_swizzle; bld_base->emit_declaration = emit_declaration; bld_base->emit_immediate = emit_immediate; - bld_base->emit_fetch_funcs[TGSI_FILE_IMMEDIATE] = emit_fetch; - bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = emit_fetch; - bld_base->emit_fetch_funcs[TGSI_FILE_TEMPORARY] = emit_fetch; - bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = emit_fetch; + bld_base->emit_fetch_funcs[TGSI_FILE_IMMEDIATE] = radeon_llvm_emit_fetch; + bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = radeon_llvm_emit_fetch; + bld_base->emit_fetch_funcs[TGSI_FILE_TEMPORARY] = radeon_llvm_emit_fetch; + bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = radeon_llvm_emit_fetch; bld_base->emit_fetch_funcs[TGSI_FILE_SYSTEM_VALUE] = fetch_system_value; /* Allocate outputs */ ctx->soa.outputs = ctx->outputs; - /* XXX: Is there a better way to initialize all this ? 
*/ - lp_set_default_actions(bld_base); bld_base->op_actions[TGSI_OPCODE_ABS].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_ABS].intr_name = "fabs"; + bld_base->op_actions[TGSI_OPCODE_ABS].intr_name = "llvm.fabs.f32"; bld_base->op_actions[TGSI_OPCODE_AND].emit = emit_and; bld_base->op_actions[TGSI_OPCODE_ARL].emit = emit_arl; bld_base->op_actions[TGSI_OPCODE_BFI].emit = emit_bfi; @@ -1453,7 +1444,7 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx) bld_base->op_actions[TGSI_OPCODE_BREV].intr_name = "llvm.AMDGPU.brev"; bld_base->op_actions[TGSI_OPCODE_BRK].emit = brk_emit; bld_base->op_actions[TGSI_OPCODE_CEIL].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_CEIL].intr_name = "ceil"; + bld_base->op_actions[TGSI_OPCODE_CEIL].intr_name = "llvm.ceil.f32"; bld_base->op_actions[TGSI_OPCODE_CLAMP].emit = build_tgsi_intrinsic_nomem; bld_base->op_actions[TGSI_OPCODE_CLAMP].intr_name = "llvm.AMDIL.clamp."; bld_base->op_actions[TGSI_OPCODE_CMP].emit = build_tgsi_intrinsic_nomem; @@ -1461,21 +1452,30 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx) bld_base->op_actions[TGSI_OPCODE_CONT].emit = cont_emit; bld_base->op_actions[TGSI_OPCODE_COS].emit = build_tgsi_intrinsic_nomem; bld_base->op_actions[TGSI_OPCODE_COS].intr_name = "llvm.cos.f32"; - bld_base->op_actions[TGSI_OPCODE_DDX].intr_name = "llvm.AMDGPU.ddx"; - bld_base->op_actions[TGSI_OPCODE_DDX].fetch_args = tex_fetch_args; - bld_base->op_actions[TGSI_OPCODE_DDY].intr_name = "llvm.AMDGPU.ddy"; - bld_base->op_actions[TGSI_OPCODE_DDY].fetch_args = tex_fetch_args; + bld_base->op_actions[TGSI_OPCODE_DABS].emit = build_tgsi_intrinsic_nomem; + bld_base->op_actions[TGSI_OPCODE_DABS].intr_name = "llvm.fabs.f64"; + bld_base->op_actions[TGSI_OPCODE_DFMA].emit = build_tgsi_intrinsic_nomem; + bld_base->op_actions[TGSI_OPCODE_DFMA].intr_name = "llvm.fma.f64"; + bld_base->op_actions[TGSI_OPCODE_DFRAC].emit = emit_frac; + bld_base->op_actions[TGSI_OPCODE_DNEG].emit = emit_dneg; + bld_base->op_actions[TGSI_OPCODE_DSEQ].emit = emit_dcmp; + bld_base->op_actions[TGSI_OPCODE_DSGE].emit = emit_dcmp; + bld_base->op_actions[TGSI_OPCODE_DSLT].emit = emit_dcmp; + bld_base->op_actions[TGSI_OPCODE_DSNE].emit = emit_dcmp; + bld_base->op_actions[TGSI_OPCODE_DRSQ].emit = build_tgsi_intrinsic_nomem; + bld_base->op_actions[TGSI_OPCODE_DRSQ].intr_name = "llvm.AMDGPU.rsq.f64"; + bld_base->op_actions[TGSI_OPCODE_DSQRT].emit = build_tgsi_intrinsic_nomem; + bld_base->op_actions[TGSI_OPCODE_DSQRT].intr_name = "llvm.sqrt.f64"; bld_base->op_actions[TGSI_OPCODE_ELSE].emit = else_emit; bld_base->op_actions[TGSI_OPCODE_ENDIF].emit = endif_emit; bld_base->op_actions[TGSI_OPCODE_ENDLOOP].emit = endloop_emit; bld_base->op_actions[TGSI_OPCODE_EX2].emit = build_tgsi_intrinsic_nomem; bld_base->op_actions[TGSI_OPCODE_EX2].intr_name = "llvm.AMDIL.exp."; bld_base->op_actions[TGSI_OPCODE_FLR].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_FLR].intr_name = "floor"; + bld_base->op_actions[TGSI_OPCODE_FLR].intr_name = "llvm.floor.f32"; bld_base->op_actions[TGSI_OPCODE_FMA].emit = build_tgsi_intrinsic_nomem; bld_base->op_actions[TGSI_OPCODE_FMA].intr_name = "llvm.fma.f32"; - bld_base->op_actions[TGSI_OPCODE_FRC].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_FRC].intr_name = "llvm.AMDIL.fraction."; + bld_base->op_actions[TGSI_OPCODE_FRC].emit = emit_frac; bld_base->op_actions[TGSI_OPCODE_F2I].emit = emit_f2i; bld_base->op_actions[TGSI_OPCODE_F2U].emit = emit_f2u; 
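TGSI_OPCODE_FRC and TGSI_OPCODE_DFRAC are now both routed through emit_frac, which lowers the fractional part as x - floor(x) on top of llvm.floor.f32/f64 instead of the old AMDIL fraction intrinsic. The identity it relies on, checked on the CPU (link with -lm):

#include <math.h>
#include <stdio.h>

static double example_frac(double x)
{
	return x - floor(x);
}

int main(void)
{
	printf("%f\n", example_frac(2.75));  /* 0.750000 */
	printf("%f\n", example_frac(-1.25)); /* 0.750000, floor(-1.25) = -2 */
	return 0;
}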
bld_base->op_actions[TGSI_OPCODE_FSEQ].emit = emit_fcmp; @@ -1520,6 +1520,9 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx) bld_base->op_actions[TGSI_OPCODE_POW].intr_name = "llvm.pow.f32"; bld_base->op_actions[TGSI_OPCODE_ROUND].emit = build_tgsi_intrinsic_nomem; bld_base->op_actions[TGSI_OPCODE_ROUND].intr_name = "llvm.AMDIL.round.nearest."; + bld_base->op_actions[TGSI_OPCODE_RSQ].intr_name = + HAVE_LLVM >= 0x0305 ? "llvm.AMDGPU.rsq.clamped.f32" : "llvm.AMDGPU.rsq"; + bld_base->op_actions[TGSI_OPCODE_RSQ].emit = build_tgsi_intrinsic_nomem; bld_base->op_actions[TGSI_OPCODE_SGE].emit = emit_cmp; bld_base->op_actions[TGSI_OPCODE_SEQ].emit = emit_cmp; bld_base->op_actions[TGSI_OPCODE_SHL].emit = emit_shl; @@ -1532,26 +1535,6 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx) bld_base->op_actions[TGSI_OPCODE_SQRT].emit = build_tgsi_intrinsic_nomem; bld_base->op_actions[TGSI_OPCODE_SQRT].intr_name = "llvm.sqrt.f32"; bld_base->op_actions[TGSI_OPCODE_SSG].emit = emit_ssg; - bld_base->op_actions[TGSI_OPCODE_TEX].fetch_args = tex_fetch_args; - bld_base->op_actions[TGSI_OPCODE_TEX].intr_name = "llvm.AMDGPU.tex"; - bld_base->op_actions[TGSI_OPCODE_TEX2].fetch_args = tex_fetch_args; - bld_base->op_actions[TGSI_OPCODE_TEX2].intr_name = "llvm.AMDGPU.tex"; - bld_base->op_actions[TGSI_OPCODE_TXB].fetch_args = tex_fetch_args; - bld_base->op_actions[TGSI_OPCODE_TXB].intr_name = "llvm.AMDGPU.txb"; - bld_base->op_actions[TGSI_OPCODE_TXB2].fetch_args = tex_fetch_args; - bld_base->op_actions[TGSI_OPCODE_TXB2].intr_name = "llvm.AMDGPU.txb"; - bld_base->op_actions[TGSI_OPCODE_TXD].fetch_args = txd_fetch_args; - bld_base->op_actions[TGSI_OPCODE_TXD].intr_name = "llvm.AMDGPU.txd"; - bld_base->op_actions[TGSI_OPCODE_TXF].fetch_args = txf_fetch_args; - bld_base->op_actions[TGSI_OPCODE_TXF].intr_name = "llvm.AMDGPU.txf"; - bld_base->op_actions[TGSI_OPCODE_TXL].fetch_args = tex_fetch_args; - bld_base->op_actions[TGSI_OPCODE_TXL].intr_name = "llvm.AMDGPU.txl"; - bld_base->op_actions[TGSI_OPCODE_TXL2].fetch_args = tex_fetch_args; - bld_base->op_actions[TGSI_OPCODE_TXL2].intr_name = "llvm.AMDGPU.txl"; - bld_base->op_actions[TGSI_OPCODE_TXP].fetch_args = txp_fetch_args; - bld_base->op_actions[TGSI_OPCODE_TXP].intr_name = "llvm.AMDGPU.tex"; - bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = tex_fetch_args; - bld_base->op_actions[TGSI_OPCODE_TXQ].intr_name = "llvm.AMDGPU.txq"; bld_base->op_actions[TGSI_OPCODE_TRUNC].emit = build_tgsi_intrinsic_nomem; bld_base->op_actions[TGSI_OPCODE_TRUNC].intr_name = "llvm.AMDGPU.trunc"; bld_base->op_actions[TGSI_OPCODE_UADD].emit = emit_uadd; @@ -1571,13 +1554,6 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx) bld_base->op_actions[TGSI_OPCODE_U2F].emit = emit_u2f; bld_base->op_actions[TGSI_OPCODE_XOR].emit = emit_xor; bld_base->op_actions[TGSI_OPCODE_UCMP].emit = emit_ucmp; - - bld_base->rsq_action.emit = build_tgsi_intrinsic_nomem; -#if HAVE_LLVM >= 0x0305 - bld_base->rsq_action.intr_name = "llvm.AMDGPU.rsq.clamped.f32"; -#else - bld_base->rsq_action.intr_name = "llvm.AMDGPU.rsq"; -#endif } void radeon_llvm_create_func(struct radeon_llvm_context * ctx, diff --git a/src/gallium/drivers/radeon/radeon_uvd.c b/src/gallium/drivers/radeon/radeon_uvd.c index be58d0b9ce3..16ee5410273 100644 --- a/src/gallium/drivers/radeon/radeon_uvd.c +++ b/src/gallium/drivers/radeon/radeon_uvd.c @@ -57,6 +57,7 @@ #define FB_BUFFER_OFFSET 0x1000 #define FB_BUFFER_SIZE 2048 +#define IT_SCALING_TABLE_SIZE 992 /* UVD decoder representation */ struct 
ruvd_decoder { @@ -65,6 +66,7 @@ struct ruvd_decoder { ruvd_set_dtb set_dtb; unsigned stream_handle; + unsigned stream_type; unsigned frame_number; struct pipe_screen *screen; @@ -73,15 +75,18 @@ struct ruvd_decoder { unsigned cur_buffer; - struct rvid_buffer msg_fb_buffers[NUM_BUFFERS]; + struct rvid_buffer msg_fb_it_buffers[NUM_BUFFERS]; struct ruvd_msg *msg; uint32_t *fb; + uint8_t *it; struct rvid_buffer bs_buffers[NUM_BUFFERS]; void* bs_ptr; unsigned bs_size; struct rvid_buffer dpb; + bool use_legacy; + struct rvid_buffer ctx; }; /* flush IB to the hardware */ @@ -107,19 +112,34 @@ static void send_cmd(struct ruvd_decoder *dec, unsigned cmd, reloc_idx = dec->ws->cs_add_reloc(dec->cs, cs_buf, usage, domain, RADEON_PRIO_MIN); - set_reg(dec, RUVD_GPCOM_VCPU_DATA0, off); - set_reg(dec, RUVD_GPCOM_VCPU_DATA1, reloc_idx * 4); + if (!dec->use_legacy) { + uint64_t addr; + addr = dec->ws->buffer_get_virtual_address(cs_buf); + addr = addr + off; + set_reg(dec, RUVD_GPCOM_VCPU_DATA0, addr); + set_reg(dec, RUVD_GPCOM_VCPU_DATA1, addr >> 32); + } else { + set_reg(dec, RUVD_GPCOM_VCPU_DATA0, off); + set_reg(dec, RUVD_GPCOM_VCPU_DATA1, reloc_idx * 4); + } set_reg(dec, RUVD_GPCOM_VCPU_CMD, cmd << 1); } -/* map the next available message/feedback buffer */ -static void map_msg_fb_buf(struct ruvd_decoder *dec) +/* does the codec need an IT buffer? */ +static bool have_it(struct ruvd_decoder *dec) +{ + return dec->stream_type == RUVD_CODEC_H264_PERF || + dec->stream_type == RUVD_CODEC_H265; +} + +/* map the next available message/feedback/IT scaling buffer */ +static void map_msg_fb_it_buf(struct ruvd_decoder *dec) { struct rvid_buffer* buf; uint8_t *ptr; /* grab the current message/feedback buffer */ - buf = &dec->msg_fb_buffers[dec->cur_buffer]; + buf = &dec->msg_fb_it_buffers[dec->cur_buffer]; /* and map it for CPU access */ ptr = dec->ws->buffer_map(buf->res->cs_buf, dec->cs, PIPE_TRANSFER_WRITE); @@ -127,6 +147,8 @@ static void map_msg_fb_buf(struct ruvd_decoder *dec) /* calc buffer offsets */ dec->msg = (struct ruvd_msg *)ptr; dec->fb = (uint32_t *)(ptr + FB_BUFFER_OFFSET); + if (have_it(dec)) + dec->it = (uint8_t *)(ptr + FB_BUFFER_OFFSET + FB_BUFFER_SIZE); } /* unmap and send a message command to the VCPU */ @@ -139,12 +161,13 @@ static void send_msg_buf(struct ruvd_decoder *dec) return; /* grab the current message buffer */ - buf = &dec->msg_fb_buffers[dec->cur_buffer]; + buf = &dec->msg_fb_it_buffers[dec->cur_buffer]; /* unmap the buffer */ dec->ws->buffer_unmap(buf->res->cs_buf); dec->msg = NULL; dec->fb = NULL; + dec->it = NULL; /* and send it to the hardware */ send_cmd(dec, RUVD_CMD_MSG_BUFFER, buf->res->cs_buf, 0, @@ -159,11 +182,12 @@ static void next_buffer(struct ruvd_decoder *dec) } /* convert the profile into something UVD understands */ -static uint32_t profile2stream_type(enum pipe_video_profile profile) +static uint32_t profile2stream_type(struct ruvd_decoder *dec, unsigned family) { - switch (u_reduce_video_profile(profile)) { + switch (u_reduce_video_profile(dec->base.profile)) { case PIPE_VIDEO_FORMAT_MPEG4_AVC: - return RUVD_CODEC_H264; + return (family >= CHIP_TONGA) ?
+ RUVD_CODEC_H264_PERF : RUVD_CODEC_H264; case PIPE_VIDEO_FORMAT_VC1: return RUVD_CODEC_VC1; @@ -174,23 +198,46 @@ static uint32_t profile2stream_type(enum pipe_video_profile profile) case PIPE_VIDEO_FORMAT_MPEG4: return RUVD_CODEC_MPEG4; + case PIPE_VIDEO_FORMAT_HEVC: + return RUVD_CODEC_H265; + default: assert(0); return 0; } } +static unsigned calc_ctx_size(struct ruvd_decoder *dec) +{ + unsigned width_in_mb, height_in_mb, ctx_size; + + unsigned width = align(dec->base.width, VL_MACROBLOCK_WIDTH); + unsigned height = align(dec->base.height, VL_MACROBLOCK_HEIGHT); + + unsigned max_references = dec->base.max_references + 1; + + if (dec->base.width * dec->base.height >= 4096*2000) + max_references = MAX2(max_references, 8); + else + max_references = MAX2(max_references, 17); + + width = align (width, 16); + height = align (height, 16); + ctx_size = ((width + 255) / 16)*((height + 255) / 16) * 16 * max_references + 52 * 1024; + return ctx_size; +} + /* calculate size of reference picture buffer */ -static unsigned calc_dpb_size(const struct pipe_video_codec *templ) +static unsigned calc_dpb_size(struct ruvd_decoder *dec) { unsigned width_in_mb, height_in_mb, image_size, dpb_size; // always align them to MB size for dpb calculation - unsigned width = align(templ->width, VL_MACROBLOCK_WIDTH); - unsigned height = align(templ->height, VL_MACROBLOCK_HEIGHT); + unsigned width = align(dec->base.width, VL_MACROBLOCK_WIDTH); + unsigned height = align(dec->base.height, VL_MACROBLOCK_HEIGHT); // always one more for currently decoded picture - unsigned max_references = templ->max_references + 1; + unsigned max_references = dec->base.max_references + 1; // aligned size of a single frame image_size = width * height; @@ -201,19 +248,67 @@ static unsigned calc_dpb_size(const struct pipe_video_codec *templ) width_in_mb = width / VL_MACROBLOCK_WIDTH; height_in_mb = align(height / VL_MACROBLOCK_HEIGHT, 2); - switch (u_reduce_video_profile(templ->profile)) { - case PIPE_VIDEO_FORMAT_MPEG4_AVC: - // the firmware seems to allways assume a minimum of ref frames - max_references = MAX2(NUM_H264_REFS, max_references); - - // reference picture buffer - dpb_size = image_size * max_references; + switch (u_reduce_video_profile(dec->base.profile)) { + case PIPE_VIDEO_FORMAT_MPEG4_AVC: { + if (!dec->use_legacy) { + unsigned fs_in_mb = width_in_mb * height_in_mb; + unsigned alignment = 64, num_dpb_buffer; + + if (dec->stream_type == RUVD_CODEC_H264_PERF) + alignment = 256; + switch(dec->base.level) { + case 30: + num_dpb_buffer = 8100 / fs_in_mb; + break; + case 31: + num_dpb_buffer = 18000 / fs_in_mb; + break; + case 32: + num_dpb_buffer = 20480 / fs_in_mb; + break; + case 41: + num_dpb_buffer = 32768 / fs_in_mb; + break; + case 42: + num_dpb_buffer = 34816 / fs_in_mb; + break; + case 50: + num_dpb_buffer = 110400 / fs_in_mb; + break; + case 51: + num_dpb_buffer = 184320 / fs_in_mb; + break; + default: + num_dpb_buffer = 184320 / fs_in_mb; + break; + } + num_dpb_buffer++; + max_references = MAX2(MIN2(NUM_H264_REFS, num_dpb_buffer), max_references); + dpb_size = image_size * max_references; + dpb_size += max_references * align(width_in_mb * height_in_mb * 192, alignment); + dpb_size += align(width_in_mb * height_in_mb * 32, alignment); + } else { + // the firmware seems to always assume a minimum of ref frames + max_references = MAX2(NUM_H264_REFS, max_references); + // reference picture buffer + dpb_size = image_size * max_references; + // macroblock context buffer + dpb_size += width_in_mb * height_in_mb *
max_references * 192; + // IT surface buffer + dpb_size += width_in_mb * height_in_mb * 32; + } + break; + } - // macroblock context buffer - dpb_size += width_in_mb * height_in_mb * max_references * 192; + case PIPE_VIDEO_FORMAT_HEVC: + if (dec->base.width * dec->base.height >= 4096*2000) + max_references = MAX2(max_references, 8); + else + max_references = MAX2(max_references, 17); - // IT surface buffer - dpb_size += width_in_mb * height_in_mb * 32; + width = align (width, 16); + height = align (height, 16); + dpb_size = align((width * height * 3) / 2, 256) * max_references; break; case PIPE_VIDEO_FORMAT_VC1: @@ -250,6 +345,8 @@ static unsigned calc_dpb_size(const struct pipe_video_codec *templ) // IT surface buffer dpb_size += align(width_in_mb * height_in_mb * 32, 64); + + dpb_size = MAX2(dpb_size, 30 * 1024 * 1024); break; default: @@ -263,6 +360,12 @@ static unsigned calc_dpb_size(const struct pipe_video_codec *templ) return dpb_size; } +/* free associated data in the video buffer callback */ +static void ruvd_destroy_associated_data(void *data) +{ + /* NOOP, since we only use an intptr */ +} + /* get h264 specific message bits */ static struct ruvd_h264 get_h264_msg(struct ruvd_decoder *dec, struct pipe_h264_picture_desc *pic) { @@ -286,10 +389,8 @@ static struct ruvd_h264 get_h264_msg(struct ruvd_decoder *dec, struct pipe_h264_ assert(0); break; } - if (((dec->base.width * dec->base.height) >> 8) <= 1620) - result.level = 30; - else - result.level = 41; + + result.level = dec->base.level; result.sps_info_flags = 0; result.sps_info_flags |= pic->pps->sps->direct_8x8_inference_flag << 0; @@ -338,6 +439,11 @@ static struct ruvd_h264 get_h264_msg(struct ruvd_decoder *dec, struct pipe_h264_ memcpy(result.scaling_list_4x4, pic->pps->ScalingList4x4, 6*16); memcpy(result.scaling_list_8x8, pic->pps->ScalingList8x8, 2*64); + if (dec->stream_type == RUVD_CODEC_H264_PERF) { + memcpy(dec->it, result.scaling_list_4x4, 6*16); + memcpy((dec->it + 96), result.scaling_list_8x8, 2*64); + } + result.num_ref_frames = pic->num_ref_frames; result.num_ref_idx_l0_active_minus1 = pic->num_ref_idx_l0_active_minus1; @@ -354,6 +460,151 @@ static struct ruvd_h264 get_h264_msg(struct ruvd_decoder *dec, struct pipe_h264_ return result; } +/* get h265 specific message bits */ +static struct ruvd_h265 get_h265_msg(struct ruvd_decoder *dec, struct pipe_video_buffer *target, + struct pipe_h265_picture_desc *pic) +{ + struct ruvd_h265 result; + unsigned i; + + memset(&result, 0, sizeof(result)); + + result.sps_info_flags = 0; + result.sps_info_flags |= pic->pps->sps->scaling_list_enabled_flag << 0; + result.sps_info_flags |= pic->pps->sps->amp_enabled_flag << 1; + result.sps_info_flags |= pic->pps->sps->sample_adaptive_offset_enabled_flag << 2; + result.sps_info_flags |= pic->pps->sps->pcm_enabled_flag << 3; + result.sps_info_flags |= pic->pps->sps->pcm_loop_filter_disabled_flag << 4; + result.sps_info_flags |= pic->pps->sps->long_term_ref_pics_present_flag << 5; + result.sps_info_flags |= pic->pps->sps->sps_temporal_mvp_enabled_flag << 6; + result.sps_info_flags |= pic->pps->sps->strong_intra_smoothing_enabled_flag << 7; + result.sps_info_flags |= pic->pps->sps->separate_colour_plane_flag << 8; + if (((struct r600_common_screen*)dec->screen)->family == CHIP_CARRIZO) + result.sps_info_flags |= 1 << 9; + + result.chroma_format = pic->pps->sps->chroma_format_idc; + result.bit_depth_luma_minus8 = pic->pps->sps->bit_depth_luma_minus8; + result.bit_depth_chroma_minus8 = pic->pps->sps->bit_depth_chroma_minus8; + 
result.log2_max_pic_order_cnt_lsb_minus4 = pic->pps->sps->log2_max_pic_order_cnt_lsb_minus4; + result.sps_max_dec_pic_buffering_minus1 = pic->pps->sps->sps_max_dec_pic_buffering_minus1; + result.log2_min_luma_coding_block_size_minus3 = pic->pps->sps->log2_min_luma_coding_block_size_minus3; + result.log2_diff_max_min_luma_coding_block_size = pic->pps->sps->log2_diff_max_min_luma_coding_block_size; + result.log2_min_transform_block_size_minus2 = pic->pps->sps->log2_min_transform_block_size_minus2; + result.log2_diff_max_min_transform_block_size = pic->pps->sps->log2_diff_max_min_transform_block_size; + result.max_transform_hierarchy_depth_inter = pic->pps->sps->max_transform_hierarchy_depth_inter; + result.max_transform_hierarchy_depth_intra = pic->pps->sps->max_transform_hierarchy_depth_intra; + result.pcm_sample_bit_depth_luma_minus1 = pic->pps->sps->pcm_sample_bit_depth_luma_minus1; + result.pcm_sample_bit_depth_chroma_minus1 = pic->pps->sps->pcm_sample_bit_depth_chroma_minus1; + result.log2_min_pcm_luma_coding_block_size_minus3 = pic->pps->sps->log2_min_pcm_luma_coding_block_size_minus3; + result.log2_diff_max_min_pcm_luma_coding_block_size = pic->pps->sps->log2_diff_max_min_pcm_luma_coding_block_size; + result.num_short_term_ref_pic_sets = pic->pps->sps->num_short_term_ref_pic_sets; + + result.pps_info_flags = 0; + result.pps_info_flags |= pic->pps->dependent_slice_segments_enabled_flag << 0; + result.pps_info_flags |= pic->pps->output_flag_present_flag << 1; + result.pps_info_flags |= pic->pps->sign_data_hiding_enabled_flag << 2; + result.pps_info_flags |= pic->pps->cabac_init_present_flag << 3; + result.pps_info_flags |= pic->pps->constrained_intra_pred_flag << 4; + result.pps_info_flags |= pic->pps->transform_skip_enabled_flag << 5; + result.pps_info_flags |= pic->pps->cu_qp_delta_enabled_flag << 6; + result.pps_info_flags |= pic->pps->pps_slice_chroma_qp_offsets_present_flag << 7; + result.pps_info_flags |= pic->pps->weighted_pred_flag << 8; + result.pps_info_flags |= pic->pps->weighted_bipred_flag << 9; + result.pps_info_flags |= pic->pps->transquant_bypass_enabled_flag << 10; + result.pps_info_flags |= pic->pps->tiles_enabled_flag << 11; + result.pps_info_flags |= pic->pps->entropy_coding_sync_enabled_flag << 12; + result.pps_info_flags |= pic->pps->uniform_spacing_flag << 13; + result.pps_info_flags |= pic->pps->loop_filter_across_tiles_enabled_flag << 14; + result.pps_info_flags |= pic->pps->pps_loop_filter_across_slices_enabled_flag << 15; + result.pps_info_flags |= pic->pps->deblocking_filter_override_enabled_flag << 16; + result.pps_info_flags |= pic->pps->pps_deblocking_filter_disabled_flag << 17; + result.pps_info_flags |= pic->pps->lists_modification_present_flag << 18; + result.pps_info_flags |= pic->pps->slice_segment_header_extension_present_flag << 19; + //result.pps_info_flags |= pic->pps->deblocking_filter_control_present_flag; ??? 
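/* Editor's note: the sps_info_flags/pps_info_flags words built above all follow one pattern: each one-bit bitstream flag is shifted to the fixed position the UVD firmware expects and OR'd into a 32-bit word. A hypothetical helper capturing the idiom (the driver deliberately writes it out longhand, one flag per line): static inline uint32_t ruvd_pack_flag(uint32_t word, uint32_t flag, unsigned bit) { return word | ((flag & 1) << bit); } e.g. result.pps_info_flags = ruvd_pack_flag(result.pps_info_flags, pic->pps->tiles_enabled_flag, 11); */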
+ + result.num_extra_slice_header_bits = pic->pps->num_extra_slice_header_bits; + result.num_long_term_ref_pic_sps = pic->pps->sps->num_long_term_ref_pics_sps; + result.num_ref_idx_l0_default_active_minus1 = pic->pps->num_ref_idx_l0_default_active_minus1; + result.num_ref_idx_l1_default_active_minus1 = pic->pps->num_ref_idx_l1_default_active_minus1; + result.pps_cb_qp_offset = pic->pps->pps_cb_qp_offset; + result.pps_cr_qp_offset = pic->pps->pps_cr_qp_offset; + result.pps_beta_offset_div2 = pic->pps->pps_beta_offset_div2; + result.pps_tc_offset_div2 = pic->pps->pps_tc_offset_div2; + result.diff_cu_qp_delta_depth = pic->pps->diff_cu_qp_delta_depth; + result.num_tile_columns_minus1 = pic->pps->num_tile_columns_minus1; + result.num_tile_rows_minus1 = pic->pps->num_tile_rows_minus1; + result.log2_parallel_merge_level_minus2 = pic->pps->log2_parallel_merge_level_minus2; + result.init_qp_minus26 = pic->pps->init_qp_minus26; + + for (i = 0; i < 19; ++i) + result.column_width_minus1[i] = pic->pps->column_width_minus1[i]; + + for (i = 0; i < 21; ++i) + result.row_height_minus1[i] = pic->pps->row_height_minus1[i]; + + result.num_delta_pocs_ref_rps_idx = pic->NumDeltaPocsOfRefRpsIdx; + result.curr_idx = pic->CurrPicOrderCntVal; + result.curr_poc = pic->CurrPicOrderCntVal; + + vl_video_buffer_set_associated_data(target, &dec->base, + (void *)(uintptr_t)pic->CurrPicOrderCntVal, + &ruvd_destroy_associated_data); + + for (i = 0; i < 16; ++i) { + struct pipe_video_buffer *ref = pic->ref[i]; + uintptr_t ref_pic = 0; + + result.poc_list[i] = pic->PicOrderCntVal[i]; + + if (ref) + ref_pic = (uintptr_t)vl_video_buffer_get_associated_data(ref, &dec->base); + else + ref_pic = 0x7F; + result.ref_pic_list[i] = ref_pic; + } + + for (i = 0; i < 8; ++i) { + result.ref_pic_set_st_curr_before[i] = 0xFF; + result.ref_pic_set_st_curr_after[i] = 0xFF; + result.ref_pic_set_lt_curr[i] = 0xFF; + } + + for (i = 0; i < pic->NumPocStCurrBefore; ++i) + result.ref_pic_set_st_curr_before[i] = pic->RefPicSetStCurrBefore[i]; + + for (i = 0; i < pic->NumPocStCurrAfter; ++i) + result.ref_pic_set_st_curr_after[i] = pic->RefPicSetStCurrAfter[i]; + + for (i = 0; i < pic->NumPocLtCurr; ++i) + result.ref_pic_set_lt_curr[i] = pic->RefPicSetLtCurr[i]; + + for (i = 0; i < 6; ++i) + result.ucScalingListDCCoefSizeID2[i] = pic->pps->sps->ScalingListDCCoeff16x16[i]; + + for (i = 0; i < 2; ++i) + result.ucScalingListDCCoefSizeID3[i] = pic->pps->sps->ScalingListDCCoeff32x32[i]; + + memcpy(dec->it, pic->pps->sps->ScalingList4x4, 6 * 16); + memcpy(dec->it + 96, pic->pps->sps->ScalingList8x8, 6 * 64); + memcpy(dec->it + 480, pic->pps->sps->ScalingList16x16, 6 * 64); + memcpy(dec->it + 864, pic->pps->sps->ScalingList32x32, 2 * 64); + + /* TODO + result.highestTid; + result.isNonRef; + + IDRPicFlag; + RAPPicFlag; + NumPocTotalCurr; + NumShortTermPictureSliceHeaderBits; + NumLongTermPictureSliceHeaderBits; + + IsLongTerm[16]; + */ + + return result; +} + /* get vc1 specific message bits */ static struct ruvd_vc1 get_vc1_msg(struct pipe_vc1_picture_desc *pic) { @@ -556,7 +807,7 @@ static void ruvd_destroy(struct pipe_video_codec *decoder) assert(decoder); - map_msg_fb_buf(dec); + map_msg_fb_it_buf(dec); memset(dec->msg, 0, sizeof(*dec->msg)); dec->msg->size = sizeof(*dec->msg); dec->msg->msg_type = RUVD_MSG_DESTROY; @@ -568,21 +819,17 @@ static void ruvd_destroy(struct pipe_video_codec *decoder) dec->ws->cs_destroy(dec->cs); for (i = 0; i < NUM_BUFFERS; ++i) { - rvid_destroy_buffer(&dec->msg_fb_buffers[i]); + 
rvid_destroy_buffer(&dec->msg_fb_it_buffers[i]); rvid_destroy_buffer(&dec->bs_buffers[i]); } rvid_destroy_buffer(&dec->dpb); + if (u_reduce_video_profile(dec->base.profile) == PIPE_VIDEO_FORMAT_HEVC) + rvid_destroy_buffer(&dec->ctx); FREE(dec); } -/* free associated data in the video buffer callback */ -static void ruvd_destroy_associated_data(void *data) -{ - /* NOOP, since we only use an intptr */ -} - /** * start decoding of a new frame */ @@ -670,7 +917,7 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder, { struct ruvd_decoder *dec = (struct ruvd_decoder*)decoder; struct radeon_winsys_cs_handle *dt; - struct rvid_buffer *msg_fb_buf, *bs_buf; + struct rvid_buffer *msg_fb_it_buf, *bs_buf; unsigned bs_size; assert(decoder); @@ -678,26 +925,27 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder, if (!dec->bs_ptr) return; - msg_fb_buf = &dec->msg_fb_buffers[dec->cur_buffer]; + msg_fb_it_buf = &dec->msg_fb_it_buffers[dec->cur_buffer]; bs_buf = &dec->bs_buffers[dec->cur_buffer]; bs_size = align(dec->bs_size, 128); memset(dec->bs_ptr, 0, bs_size - dec->bs_size); dec->ws->buffer_unmap(bs_buf->res->cs_buf); - map_msg_fb_buf(dec); + map_msg_fb_it_buf(dec); dec->msg->size = sizeof(*dec->msg); dec->msg->msg_type = RUVD_MSG_DECODE; dec->msg->stream_handle = dec->stream_handle; dec->msg->status_report_feedback_number = dec->frame_number; - dec->msg->body.decode.stream_type = profile2stream_type(dec->base.profile); + dec->msg->body.decode.stream_type = dec->stream_type; dec->msg->body.decode.decode_flags = 0x1; dec->msg->body.decode.width_in_samples = dec->base.width; dec->msg->body.decode.height_in_samples = dec->base.height; dec->msg->body.decode.dpb_size = dec->dpb.res->buf->size; dec->msg->body.decode.bsd_size = bs_size; + dec->msg->body.decode.db_pitch = dec->base.width; dt = dec->set_dtb(dec->msg, (struct vl_video_buffer *)target); @@ -706,6 +954,10 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder, dec->msg->body.decode.codec.h264 = get_h264_msg(dec, (struct pipe_h264_picture_desc*)picture); break; + case PIPE_VIDEO_FORMAT_HEVC: + dec->msg->body.decode.codec.h265 = get_h265_msg(dec, target, (struct pipe_h265_picture_desc*)picture); + break; + case PIPE_VIDEO_FORMAT_VC1: dec->msg->body.decode.codec.vc1 = get_vc1_msg((struct pipe_vc1_picture_desc*)picture); break; @@ -733,12 +985,19 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder, send_cmd(dec, RUVD_CMD_DPB_BUFFER, dec->dpb.res->cs_buf, 0, RADEON_USAGE_READWRITE, RADEON_DOMAIN_VRAM); + if (u_reduce_video_profile(picture->profile) == PIPE_VIDEO_FORMAT_HEVC) { + send_cmd(dec, RUVD_CMD_CONTEXT_BUFFER, dec->ctx.res->cs_buf, 0, + RADEON_USAGE_READWRITE, RADEON_DOMAIN_VRAM); + } send_cmd(dec, RUVD_CMD_BITSTREAM_BUFFER, bs_buf->res->cs_buf, 0, RADEON_USAGE_READ, RADEON_DOMAIN_GTT); send_cmd(dec, RUVD_CMD_DECODING_TARGET_BUFFER, dt, 0, RADEON_USAGE_WRITE, RADEON_DOMAIN_VRAM); - send_cmd(dec, RUVD_CMD_FEEDBACK_BUFFER, msg_fb_buf->res->cs_buf, + send_cmd(dec, RUVD_CMD_FEEDBACK_BUFFER, msg_fb_it_buf->res->cs_buf, FB_BUFFER_OFFSET, RADEON_USAGE_WRITE, RADEON_DOMAIN_GTT); + if (have_it(dec)) + send_cmd(dec, RUVD_CMD_ITSCALING_TABLE_BUFFER, msg_fb_it_buf->res->cs_buf, + FB_BUFFER_OFFSET + FB_BUFFER_SIZE, RADEON_USAGE_READ, RADEON_DOMAIN_GTT); set_reg(dec, RUVD_ENGINE_CNTL, 1); flush(dec); @@ -760,7 +1019,8 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context, ruvd_set_dtb set_dtb) { struct radeon_winsys* ws = ((struct r600_common_context *)context)->ws; - unsigned dpb_size = 
calc_dpb_size(templ); + struct r600_common_context *rctx = (struct r600_common_context*)context; + unsigned dpb_size; unsigned width = templ->width, height = templ->height; unsigned bs_buf_size; struct radeon_info info; @@ -791,6 +1051,9 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context, if (!dec) return NULL; + if (info.drm_major < 3) + dec->use_legacy = TRUE; + dec->base = *templ; dec->base.context = context; dec->base.width = width; @@ -803,11 +1066,12 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context, dec->base.end_frame = ruvd_end_frame; dec->base.flush = ruvd_flush; + dec->stream_type = profile2stream_type(dec, info.family); dec->set_dtb = set_dtb; dec->stream_handle = rvid_alloc_stream_handle(); dec->screen = context->screen; dec->ws = ws; - dec->cs = ws->cs_create(ws, RING_UVD, NULL, NULL, NULL); + dec->cs = ws->cs_create(rctx->ctx, RING_UVD, NULL, NULL, NULL); if (!dec->cs) { RVID_ERR("Can't get command submission context.\n"); goto error; @@ -815,10 +1079,12 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context, bs_buf_size = width * height * 512 / (16 * 16); for (i = 0; i < NUM_BUFFERS; ++i) { - unsigned msg_fb_size = FB_BUFFER_OFFSET + FB_BUFFER_SIZE; + unsigned msg_fb_it_size = FB_BUFFER_OFFSET + FB_BUFFER_SIZE; STATIC_ASSERT(sizeof(struct ruvd_msg) <= FB_BUFFER_OFFSET); - if (!rvid_create_buffer(dec->screen, &dec->msg_fb_buffers[i], - msg_fb_size, PIPE_USAGE_STAGING)) { + if (have_it(dec)) + msg_fb_it_size += IT_SCALING_TABLE_SIZE; + if (!rvid_create_buffer(dec->screen, &dec->msg_fb_it_buffers[i], + msg_fb_it_size, PIPE_USAGE_STAGING)) { RVID_ERR("Can't allocate message buffers.\n"); goto error; } @@ -829,10 +1095,12 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context, goto error; } - rvid_clear_buffer(context, &dec->msg_fb_buffers[i]); + rvid_clear_buffer(context, &dec->msg_fb_it_buffers[i]); rvid_clear_buffer(context, &dec->bs_buffers[i]); } + dpb_size = calc_dpb_size(dec); + if (!rvid_create_buffer(dec->screen, &dec->dpb, dpb_size, PIPE_USAGE_DEFAULT)) { RVID_ERR("Can't allocate dpb.\n"); goto error; @@ -840,14 +1108,23 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context, rvid_clear_buffer(context, &dec->dpb); - map_msg_fb_buf(dec); + if (u_reduce_video_profile(dec->base.profile) == PIPE_VIDEO_FORMAT_HEVC) { + unsigned ctx_size = calc_ctx_size(dec); + if (!rvid_create_buffer(dec->screen, &dec->ctx, ctx_size, PIPE_USAGE_DEFAULT)) { + RVID_ERR("Can't allocate context buffer.\n"); + goto error; + } + rvid_clear_buffer(context, &dec->ctx); + } + + map_msg_fb_it_buf(dec); dec->msg->size = sizeof(*dec->msg); dec->msg->msg_type = RUVD_MSG_CREATE; dec->msg->stream_handle = dec->stream_handle; - dec->msg->body.create.stream_type = profile2stream_type(dec->base.profile); + dec->msg->body.create.stream_type = dec->stream_type; dec->msg->body.create.width_in_samples = dec->base.width; dec->msg->body.create.height_in_samples = dec->base.height; - dec->msg->body.create.dpb_size = dec->dpb.res->buf->size; + dec->msg->body.create.dpb_size = dpb_size; send_msg_buf(dec); flush(dec); next_buffer(dec); @@ -858,11 +1135,13 @@ error: if (dec->cs) dec->ws->cs_destroy(dec->cs); for (i = 0; i < NUM_BUFFERS; ++i) { - rvid_destroy_buffer(&dec->msg_fb_buffers[i]); + rvid_destroy_buffer(&dec->msg_fb_it_buffers[i]); rvid_destroy_buffer(&dec->bs_buffers[i]); } rvid_destroy_buffer(&dec->dpb); + if (u_reduce_video_profile(dec->base.profile) == PIPE_VIDEO_FORMAT_HEVC) +
rvid_destroy_buffer(&dec->ctx); FREE(dec); diff --git a/src/gallium/drivers/radeon/radeon_uvd.h b/src/gallium/drivers/radeon/radeon_uvd.h index 7442865c9ec..452fbd60880 100644 --- a/src/gallium/drivers/radeon/radeon_uvd.h +++ b/src/gallium/drivers/radeon/radeon_uvd.h @@ -62,6 +62,8 @@ #define RUVD_CMD_DECODING_TARGET_BUFFER 0x00000002 #define RUVD_CMD_FEEDBACK_BUFFER 0x00000003 #define RUVD_CMD_BITSTREAM_BUFFER 0x00000100 +#define RUVD_CMD_ITSCALING_TABLE_BUFFER 0x00000204 +#define RUVD_CMD_CONTEXT_BUFFER 0x00000206 /* UVD message types */ #define RUVD_MSG_CREATE 0 @@ -73,6 +75,8 @@ #define RUVD_CODEC_VC1 0x00000001 #define RUVD_CODEC_MPEG2 0x00000003 #define RUVD_CODEC_MPEG4 0x00000004 +#define RUVD_CODEC_H264_PERF 0x00000007 +#define RUVD_CODEC_H265 0x00000010 /* UVD decode target buffer tiling mode */ #define RUVD_TILE_LINEAR 0x00000000 @@ -171,6 +175,66 @@ struct ruvd_h264 { } mvc; }; +struct ruvd_h265 { + uint32_t sps_info_flags; + uint32_t pps_info_flags; + + uint8_t chroma_format; + uint8_t bit_depth_luma_minus8; + uint8_t bit_depth_chroma_minus8; + uint8_t log2_max_pic_order_cnt_lsb_minus4; + + uint8_t sps_max_dec_pic_buffering_minus1; + uint8_t log2_min_luma_coding_block_size_minus3; + uint8_t log2_diff_max_min_luma_coding_block_size; + uint8_t log2_min_transform_block_size_minus2; + + uint8_t log2_diff_max_min_transform_block_size; + uint8_t max_transform_hierarchy_depth_inter; + uint8_t max_transform_hierarchy_depth_intra; + uint8_t pcm_sample_bit_depth_luma_minus1; + + uint8_t pcm_sample_bit_depth_chroma_minus1; + uint8_t log2_min_pcm_luma_coding_block_size_minus3; + uint8_t log2_diff_max_min_pcm_luma_coding_block_size; + uint8_t num_extra_slice_header_bits; + + uint8_t num_short_term_ref_pic_sets; + uint8_t num_long_term_ref_pic_sps; + uint8_t num_ref_idx_l0_default_active_minus1; + uint8_t num_ref_idx_l1_default_active_minus1; + + int8_t pps_cb_qp_offset; + int8_t pps_cr_qp_offset; + int8_t pps_beta_offset_div2; + int8_t pps_tc_offset_div2; + + uint8_t diff_cu_qp_delta_depth; + uint8_t num_tile_columns_minus1; + uint8_t num_tile_rows_minus1; + uint8_t log2_parallel_merge_level_minus2; + + uint16_t column_width_minus1[19]; + uint16_t row_height_minus1[21]; + + int8_t init_qp_minus26; + uint8_t num_delta_pocs_ref_rps_idx; + uint8_t curr_idx; + uint8_t reserved1; + int32_t curr_poc; + uint8_t ref_pic_list[16]; + int32_t poc_list[16]; + uint8_t ref_pic_set_st_curr_before[8]; + uint8_t ref_pic_set_st_curr_after[8]; + uint8_t ref_pic_set_lt_curr[8]; + + uint8_t ucScalingListDCCoefSizeID2[6]; + uint8_t ucScalingListDCCoefSizeID3[2]; + + uint8_t highestTid; + uint8_t isNonRef; +}; + struct ruvd_vc1 { uint32_t profile; uint32_t level; @@ -327,6 +391,7 @@ struct ruvd_msg { union { struct ruvd_h264 h264; + struct ruvd_h265 h265; struct ruvd_vc1 vc1; struct ruvd_mpeg2 mpeg2; struct ruvd_mpeg4 mpeg4; diff --git a/src/gallium/drivers/radeon/radeon_vce.c b/src/gallium/drivers/radeon/radeon_vce.c index a6567379fe3..7eab974a3df 100644 --- a/src/gallium/drivers/radeon/radeon_vce.c +++ b/src/gallium/drivers/radeon/radeon_vce.c @@ -47,6 +47,8 @@ #define FW_40_2_2 ((40 << 24) | (2 << 16) | (2 << 8)) #define FW_50_0_1 ((50 << 24) | (0 << 16) | (1 << 8)) #define FW_50_1_2 ((50 << 24) | (1 << 16) | (2 << 8)) +#define FW_50_10_2 ((50 << 24) | (10 << 16) | (2 << 8)) +#define FW_50_17_3 ((50 << 24) | (17 << 16) | (3 << 8)) /** * flush commands to the hardware @@ -54,6 +56,8 @@ static void flush(struct rvce_encoder *enc) { enc->ws->cs_flush(enc->cs, RADEON_FLUSH_ASYNC, NULL, 0); + enc->task_info_idx = 
0; + enc->bs_idx = 0; } #if 0 @@ -214,7 +218,7 @@ struct rvce_cpb_slot *l1_slot(struct rvce_encoder *enc) * Calculate the offsets into the CPB */ void rvce_frame_offset(struct rvce_encoder *enc, struct rvce_cpb_slot *slot, - unsigned *luma_offset, unsigned *chroma_offset) + signed *luma_offset, signed *chroma_offset) { unsigned pitch = align(enc->luma->level[0].pitch_bytes, 128); unsigned vpitch = align(enc->luma->npix_y, 16); @@ -278,24 +282,19 @@ static void rvce_begin_frame(struct pipe_video_codec *encoder, enc->fb = &fb; enc->session(enc); enc->create(enc); - enc->rate_control(enc); - need_rate_control = false; - enc->config_extension(enc); - enc->motion_estimation(enc); - enc->rdo(enc); - if (enc->use_vui) - enc->vui(enc); - enc->pic_control(enc); + enc->config(enc); enc->feedback(enc); flush(enc); //dump_feedback(enc, &fb); rvid_destroy_buffer(&fb); + need_rate_control = false; } - enc->session(enc); - - if (need_rate_control) - enc->rate_control(enc); + if (need_rate_control) { + enc->session(enc); + enc->config(enc); + flush(enc); + } } static void rvce_encode_bitstream(struct pipe_video_codec *encoder, @@ -312,6 +311,8 @@ static void rvce_encode_bitstream(struct pipe_video_codec *encoder, RVID_ERR("Can't create feedback buffer.\n"); return; } + if (!enc->cs->cdw) + enc->session(enc); enc->encode(enc); enc->feedback(enc); } @@ -324,7 +325,8 @@ static void rvce_end_frame(struct pipe_video_codec *encoder, struct rvce_cpb_slot *slot = LIST_ENTRY( struct rvce_cpb_slot, enc->cpb_slots.prev, list); - flush(enc); + if (!enc->dual_inst || enc->bs_idx > 1) + flush(enc); /* update the CPB backtrack with the just encoded frame */ slot->picture_type = enc->pic.picture_type; @@ -363,6 +365,9 @@ static void rvce_get_feedback(struct pipe_video_codec *encoder, */ static void rvce_flush(struct pipe_video_codec *encoder) { + struct rvce_encoder *enc = (struct rvce_encoder*)encoder; + + flush(enc); } static void rvce_cs_flush(void *ctx, unsigned flags, @@ -377,6 +382,7 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context, rvce_get_buffer get_buffer) { struct r600_common_screen *rscreen = (struct r600_common_screen *)context->screen; + struct r600_common_context *rctx = (struct r600_common_context*)context; struct rvce_encoder *enc; struct pipe_video_buffer *tmp_buf, templat = {}; struct radeon_surf *tmp_surf; @@ -395,8 +401,17 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context, if (!enc) return NULL; + if (rscreen->info.drm_major == 3) + enc->use_vm = true; if ((rscreen->info.drm_major > 2) || (rscreen->info.drm_minor >= 42)) enc->use_vui = true; + if (rscreen->info.family >= CHIP_TONGA) + enc->dual_pipe = true; + /* TODO enable B frame with dual instance */ + if ((rscreen->info.family >= CHIP_TONGA) && + (templ->max_references == 1) && + (rscreen->info.vce_harvest_config == 0)) + enc->dual_inst = true; enc->base = *templ; enc->base.context = context; @@ -411,7 +426,7 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context, enc->screen = context->screen; enc->ws = ws; - enc->cs = ws->cs_create(ws, RING_VCE, rvce_cs_flush, enc, NULL); + enc->cs = ws->cs_create(rctx->ctx, RING_VCE, rvce_cs_flush, enc, NULL); if (!enc->cs) { RVID_ERR("Can't get command submission context.\n"); goto error; @@ -436,6 +451,9 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context, cpb_size = cpb_size * align(tmp_surf->npix_y, 16); cpb_size = cpb_size * 3 / 2; cpb_size = cpb_size * enc->cpb_num; + if (enc->dual_pipe) + cpb_size += 
RVCE_MAX_AUX_BUFFER_NUM * + RVCE_MAX_BITSTREAM_OUTPUT_ROW_SIZE * 2; tmp_buf->destroy(tmp_buf); if (!rvid_create_buffer(enc->screen, &enc->cpb, cpb_size, PIPE_USAGE_DEFAULT)) { RVID_ERR("Can't create CPB buffer.\n"); @@ -455,6 +473,8 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context, case FW_50_0_1: case FW_50_1_2: + case FW_50_10_2: + case FW_50_17_3: radeon_vce_50_init(enc); break; @@ -482,5 +502,29 @@ bool rvce_is_fw_version_supported(struct r600_common_screen *rscreen) { return rscreen->info.vce_fw_version == FW_40_2_2 || rscreen->info.vce_fw_version == FW_50_0_1 || - rscreen->info.vce_fw_version == FW_50_1_2; + rscreen->info.vce_fw_version == FW_50_1_2 || + rscreen->info.vce_fw_version == FW_50_10_2 || + rscreen->info.vce_fw_version == FW_50_17_3; +} + +/** + * Add the buffer as relocation to the current command submission + */ +void rvce_add_buffer(struct rvce_encoder *enc, struct radeon_winsys_cs_handle *buf, + enum radeon_bo_usage usage, enum radeon_bo_domain domain, + signed offset) +{ + int reloc_idx; + + reloc_idx = enc->ws->cs_add_reloc(enc->cs, buf, usage, domain, RADEON_PRIO_MIN); + if (enc->use_vm) { + uint64_t addr; + addr = enc->ws->buffer_get_virtual_address(buf); + addr = addr + offset; + RVCE_CS(addr >> 32); + RVCE_CS(addr); + } else { + RVCE_CS(reloc_idx * 4); + RVCE_CS(offset); + } } diff --git a/src/gallium/drivers/radeon/radeon_vce.h b/src/gallium/drivers/radeon/radeon_vce.h index 8319ef48cd5..624bda479f8 100644 --- a/src/gallium/drivers/radeon/radeon_vce.h +++ b/src/gallium/drivers/radeon/radeon_vce.h @@ -36,15 +36,16 @@ #include "util/list.h" -#define RVCE_RELOC(buf, usage, domain) (enc->ws->cs_add_reloc(enc->cs, (buf), (usage), domain, RADEON_PRIO_MIN)) - #define RVCE_CS(value) (enc->cs->buf[enc->cs->cdw++] = (value)) #define RVCE_BEGIN(cmd) { uint32_t *begin = &enc->cs->buf[enc->cs->cdw++]; RVCE_CS(cmd) -#define RVCE_READ(buf, domain) RVCE_CS(RVCE_RELOC(buf, RADEON_USAGE_READ, domain) * 4) -#define RVCE_WRITE(buf, domain) RVCE_CS(RVCE_RELOC(buf, RADEON_USAGE_WRITE, domain) * 4) -#define RVCE_READWRITE(buf, domain) RVCE_CS(RVCE_RELOC(buf, RADEON_USAGE_READWRITE, domain) * 4) +#define RVCE_READ(buf, domain, off) rvce_add_buffer(enc, (buf), RADEON_USAGE_READ, (domain), (off)) +#define RVCE_WRITE(buf, domain, off) rvce_add_buffer(enc, (buf), RADEON_USAGE_WRITE, (domain), (off)) +#define RVCE_READWRITE(buf, domain, off) rvce_add_buffer(enc, (buf), RADEON_USAGE_READWRITE, (domain), (off)) #define RVCE_END() *begin = (&enc->cs->buf[enc->cs->cdw] - begin) * 4; } +#define RVCE_MAX_BITSTREAM_OUTPUT_ROW_SIZE (4096 * 16 * 2.5) +#define RVCE_MAX_AUX_BUFFER_NUM 4 + struct r600_common_screen; /* driver dependent callback */ @@ -76,8 +77,12 @@ struct rvce_encoder { void (*motion_estimation)(struct rvce_encoder *enc); void (*rdo)(struct rvce_encoder *enc); void (*vui)(struct rvce_encoder *enc); + void (*config)(struct rvce_encoder *enc); void (*encode)(struct rvce_encoder *enc); void (*destroy)(struct rvce_encoder *enc); + void (*task_info)(struct rvce_encoder *enc, uint32_t op, + uint32_t dep, uint32_t fb_idx, + uint32_t ring_idx); unsigned stream_handle; @@ -101,7 +106,14 @@ struct rvce_encoder { struct rvid_buffer *fb; struct rvid_buffer cpb; struct pipe_h264_enc_picture_desc pic; - bool use_vui; + + unsigned task_info_idx; + unsigned bs_idx; + + bool use_vm; + bool use_vui; + bool dual_pipe; + bool dual_inst; }; /* CPB handling functions */ @@ -109,7 +121,7 @@ struct rvce_cpb_slot *current_slot(struct rvce_encoder *enc); struct rvce_cpb_slot 
*l0_slot(struct rvce_encoder *enc); struct rvce_cpb_slot *l1_slot(struct rvce_encoder *enc); void rvce_frame_offset(struct rvce_encoder *enc, struct rvce_cpb_slot *slot, - unsigned *luma_offset, unsigned *chroma_offset); + signed *luma_offset, signed *chroma_offset); struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context, const struct pipe_video_codec *templat, @@ -118,6 +130,10 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context, bool rvce_is_fw_version_supported(struct r600_common_screen *rscreen); +void rvce_add_buffer(struct rvce_encoder *enc, struct radeon_winsys_cs_handle *buf, + enum radeon_bo_usage usage, enum radeon_bo_domain domain, + signed offset); + /* init vce fw 40.2.2 specific callbacks */ void radeon_vce_40_2_2_init(struct rvce_encoder *enc); diff --git a/src/gallium/drivers/radeon/radeon_vce_40_2_2.c b/src/gallium/drivers/radeon/radeon_vce_40_2_2.c index 51b17b5f6a8..e64fbc7afb0 100644 --- a/src/gallium/drivers/radeon/radeon_vce_40_2_2.c +++ b/src/gallium/drivers/radeon/radeon_vce_40_2_2.c @@ -53,30 +53,38 @@ static void session(struct rvce_encoder *enc) RVCE_END(); } -static void task_info(struct rvce_encoder *enc, uint32_t taskOperation) +static void task_info(struct rvce_encoder *enc, uint32_t op, + uint32_t dep, uint32_t fb_idx, uint32_t ring_idx) { RVCE_BEGIN(0x00000002); // task info + if (op == 0x3) { + if (enc->task_info_idx) { + uint32_t offs = enc->cs->cdw - enc->task_info_idx + 3; + // Update offsetOfNextTaskInfo + enc->cs->buf[enc->task_info_idx] = offs; + } + enc->task_info_idx = enc->cs->cdw; + } RVCE_CS(0xffffffff); // offsetOfNextTaskInfo - RVCE_CS(taskOperation); // taskOperation - RVCE_CS(0x00000000); // referencePictureDependency + RVCE_CS(op); // taskOperation + RVCE_CS(dep); // referencePictureDependency RVCE_CS(0x00000000); // collocateFlagDependency - RVCE_CS(0x00000000); // feedbackIndex - RVCE_CS(0x00000000); // videoBitstreamRingIndex + RVCE_CS(fb_idx); // feedbackIndex + RVCE_CS(ring_idx); // videoBitstreamRingIndex RVCE_END(); } static void feedback(struct rvce_encoder *enc) { RVCE_BEGIN(0x05000005); // feedback buffer - RVCE_WRITE(enc->fb->res->cs_buf, enc->fb->res->domains); // feedbackRingAddressHi - RVCE_CS(0x00000000); // feedbackRingAddressLo + RVCE_WRITE(enc->fb->res->cs_buf, enc->fb->res->domains, 0x0); // feedbackRingAddressHi/Lo RVCE_CS(0x00000001); // feedbackRingSize RVCE_END(); } static void create(struct rvce_encoder *enc) { - task_info(enc, 0x00000000); + enc->task_info(enc, 0x00000000, 0, 0, 0); RVCE_BEGIN(0x01000001); // create cmd RVCE_CS(0x00000000); // encUseCircularBuffer @@ -272,21 +280,31 @@ static void vui(struct rvce_encoder *enc) RVCE_END(); } +static void config(struct rvce_encoder *enc) +{ + enc->task_info(enc, 0x00000002, 0, 0xffffffff, 0); + enc->rate_control(enc); + enc->config_extension(enc); + enc->motion_estimation(enc); + enc->rdo(enc); + if (enc->use_vui) + enc->vui(enc); + enc->pic_control(enc); +} + static void encode(struct rvce_encoder *enc) { + signed luma_offset, chroma_offset; int i; - unsigned luma_offset, chroma_offset; - task_info(enc, 0x00000003); + enc->task_info(enc, 0x00000003, 0, 0, 0); RVCE_BEGIN(0x05000001); // context buffer - RVCE_READWRITE(enc->cpb.res->cs_buf, enc->cpb.res->domains); // encodeContextAddressHi - RVCE_CS(0x00000000); // encodeContextAddressLo + RVCE_READWRITE(enc->cpb.res->cs_buf, enc->cpb.res->domains, 0x0); // encodeContextAddressHi/Lo RVCE_END(); RVCE_BEGIN(0x05000004); // video bitstream buffer - RVCE_WRITE(enc->bs_handle, 
RADEON_DOMAIN_GTT); // videoBitstreamRingAddressHi - RVCE_CS(0x00000000); // videoBitstreamRingAddressLo + RVCE_WRITE(enc->bs_handle, RADEON_DOMAIN_GTT, 0x0); // videoBitstreamRingAddressHi/Lo RVCE_CS(enc->bs_size); // videoBitstreamRingSize RVCE_END(); @@ -298,10 +316,10 @@ static void encode(struct rvce_encoder *enc) RVCE_CS(0x00000000); // insertAUD RVCE_CS(0x00000000); // endOfSequence RVCE_CS(0x00000000); // endOfStream - RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM); // inputPictureLumaAddressHi - RVCE_CS(enc->luma->level[0].offset); // inputPictureLumaAddressLo - RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM); // inputPictureChromaAddressHi - RVCE_CS(enc->chroma->level[0].offset); // inputPictureChromaAddressLo + RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM, + enc->luma->level[0].offset); // inputPictureLumaAddressHi/Lo + RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM, + enc->chroma->level[0].offset); // inputPictureChromaAddressHi/Lo RVCE_CS(align(enc->luma->npix_y, 16)); // encInputFrameYPitch RVCE_CS(enc->luma->level[0].pitch_bytes); // encInputPicLumaPitch RVCE_CS(enc->chroma->level[0].pitch_bytes); // encInputPicChromaPitch @@ -404,7 +422,7 @@ static void encode(struct rvce_encoder *enc) static void destroy(struct rvce_encoder *enc) { - task_info(enc, 0x00000001); + enc->task_info(enc, 0x00000001, 0, 0, 0); RVCE_BEGIN(0x02000001); // destroy RVCE_END(); @@ -413,6 +431,7 @@ static void destroy(struct rvce_encoder *enc) void radeon_vce_40_2_2_init(struct rvce_encoder *enc) { enc->session = session; + enc->task_info = task_info; enc->create = create; enc->feedback = feedback; enc->rate_control = rate_control; @@ -421,6 +440,7 @@ void radeon_vce_40_2_2_init(struct rvce_encoder *enc) enc->motion_estimation = motion_estimation; enc->rdo = rdo; enc->vui = vui; + enc->config = config; enc->encode = encode; enc->destroy = destroy; } diff --git a/src/gallium/drivers/radeon/radeon_vce_50.c b/src/gallium/drivers/radeon/radeon_vce_50.c index 84a2bfb117e..afdab18c0d3 100644 --- a/src/gallium/drivers/radeon/radeon_vce_50.c +++ b/src/gallium/drivers/radeon/radeon_vce_50.c @@ -44,18 +44,6 @@ #include "radeon_video.h" #include "radeon_vce.h" -static void task_info(struct rvce_encoder *enc, uint32_t taskOperation) -{ - RVCE_BEGIN(0x00000002); // task info - RVCE_CS(0xffffffff); // offsetOfNextTaskInfo - RVCE_CS(taskOperation); // taskOperation - RVCE_CS(0x00000000); // referencePictureDependency - RVCE_CS(0x00000000); // collocateFlagDependency - RVCE_CS(0x00000000); // feedbackIndex - RVCE_CS(0x00000000); // videoBitstreamRingIndex - RVCE_END(); -} - static void rate_control(struct rvce_encoder *enc) { RVCE_BEGIN(0x04000005); // rate control @@ -90,22 +78,46 @@ static void rate_control(struct rvce_encoder *enc) static void encode(struct rvce_encoder *enc) { + signed luma_offset, chroma_offset, bs_offset; + unsigned dep, bs_idx = enc->bs_idx++; int i; - unsigned luma_offset, chroma_offset; - task_info(enc, 0x00000003); + if (enc->dual_inst) { + if (bs_idx == 0) + dep = 1; + else if (enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_IDR) + dep = 0; + else + dep = 2; + } else + dep = 0; + + enc->task_info(enc, 0x00000003, dep, 0, bs_idx); RVCE_BEGIN(0x05000001); // context buffer - RVCE_READWRITE(enc->cpb.res->cs_buf, enc->cpb.res->domains); // encodeContextAddressHi - RVCE_CS(0x00000000); // encodeContextAddressLo + RVCE_READWRITE(enc->cpb.res->cs_buf, enc->cpb.res->domains, 0); // encodeContextAddressHi/Lo RVCE_END(); + bs_offset = -(signed)(bs_idx * enc->bs_size); + RVCE_BEGIN(0x05000004); // video bitstream 
buffer - RVCE_WRITE(enc->bs_handle, RADEON_DOMAIN_GTT); // videoBitstreamRingAddressHi - RVCE_CS(0x00000000); // videoBitstreamRingAddressLo + RVCE_WRITE(enc->bs_handle, RADEON_DOMAIN_GTT, bs_offset); // videoBitstreamRingAddressHi/Lo RVCE_CS(enc->bs_size); // videoBitstreamRingSize RVCE_END(); + if (enc->dual_pipe) { + unsigned aux_offset = enc->cpb.res->buf->size - + RVCE_MAX_AUX_BUFFER_NUM * RVCE_MAX_BITSTREAM_OUTPUT_ROW_SIZE * 2; + RVCE_BEGIN(0x05000002); // auxiliary buffer + for (i = 0; i < 8; ++i) { + RVCE_CS(aux_offset); + aux_offset += RVCE_MAX_BITSTREAM_OUTPUT_ROW_SIZE; + } + for (i = 0; i < 8; ++i) + RVCE_CS(RVCE_MAX_BITSTREAM_OUTPUT_ROW_SIZE); + RVCE_END(); + } + RVCE_BEGIN(0x03000001); // encode RVCE_CS(enc->pic.frame_num ? 0x0 : 0x11); // insertHeaders RVCE_CS(0x00000000); // pictureStructure @@ -114,14 +126,17 @@ static void encode(struct rvce_encoder *enc) RVCE_CS(0x00000000); // insertAUD RVCE_CS(0x00000000); // endOfSequence RVCE_CS(0x00000000); // endOfStream - RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM); // inputPictureLumaAddressHi - RVCE_CS(enc->luma->level[0].offset); // inputPictureLumaAddressLo - RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM); // inputPictureChromaAddressHi - RVCE_CS(enc->chroma->level[0].offset); // inputPictureChromaAddressLo + RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM, + enc->luma->level[0].offset); // inputPictureLumaAddressHi/Lo + RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM, + enc->chroma->level[0].offset); // inputPictureChromaAddressHi/Lo RVCE_CS(align(enc->luma->npix_y, 16)); // encInputFrameYPitch RVCE_CS(enc->luma->level[0].pitch_bytes); // encInputPicLumaPitch RVCE_CS(enc->chroma->level[0].pitch_bytes); // encInputPicChromaPitch - RVCE_CS(0x00010000); // encInputPic(Addr|Array)Mode,encDisable(TwoPipeMode|MBOffloading) + if (enc->dual_pipe) + RVCE_CS(0x00000000); // encInputPic(Addr|Array)Mode,encDisable(TwoPipeMode|MBOffloading) + else + RVCE_CS(0x00010000); // encInputPic(Addr|Array)Mode,encDisable(TwoPipeMode|MBOffloading) RVCE_CS(0x00000000); // encInputPicTileConfig RVCE_CS(enc->pic.picture_type); // encPicType RVCE_CS(enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_IDR); // encIdrFlag diff --git a/src/gallium/drivers/radeon/radeon_video.c b/src/gallium/drivers/radeon/radeon_video.c index 826e0763c08..3a1834b948f 100644 --- a/src/gallium/drivers/radeon/radeon_video.c +++ b/src/gallium/drivers/radeon/radeon_video.c @@ -214,9 +214,9 @@ int rvid_get_video_param(struct pipe_screen *screen, case PIPE_VIDEO_CAP_NPOT_TEXTURES: return 1; case PIPE_VIDEO_CAP_MAX_WIDTH: - return 2048; + return (rscreen->family < CHIP_TONGA) ? 2048 : 4096; case PIPE_VIDEO_CAP_MAX_HEIGHT: - return 1152; + return (rscreen->family < CHIP_TONGA) ? 1152 : 2304; case PIPE_VIDEO_CAP_PREFERED_FORMAT: return PIPE_FORMAT_NV12; case PIPE_VIDEO_CAP_PREFERS_INTERLACED: @@ -225,6 +225,8 @@ int rvid_get_video_param(struct pipe_screen *screen, return false; case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE: return true; + case PIPE_VIDEO_CAP_STACKED_FRAMES: + return (rscreen->family < CHIP_TONGA) ? 
1 : 2; default: return 0; } @@ -262,20 +264,28 @@ int rvid_get_video_param(struct pipe_screen *screen, /* FIXME: VC-1 simple/main profile is broken */ return profile == PIPE_VIDEO_PROFILE_VC1_ADVANCED && entrypoint != PIPE_VIDEO_ENTRYPOINT_ENCODE; + case PIPE_VIDEO_FORMAT_HEVC: + /* Carrizo only supports HEVC Main */ + return rscreen->family >= CHIP_CARRIZO && + profile == PIPE_VIDEO_PROFILE_HEVC_MAIN; default: return false; } case PIPE_VIDEO_CAP_NPOT_TEXTURES: return 1; case PIPE_VIDEO_CAP_MAX_WIDTH: - return 2048; + return (rscreen->family < CHIP_TONGA) ? 2048 : 4096; case PIPE_VIDEO_CAP_MAX_HEIGHT: - return 1152; + return (rscreen->family < CHIP_TONGA) ? 1152 : 2304; case PIPE_VIDEO_CAP_PREFERED_FORMAT: return PIPE_FORMAT_NV12; case PIPE_VIDEO_CAP_PREFERS_INTERLACED: + if (u_reduce_video_profile(profile) == PIPE_VIDEO_FORMAT_HEVC) + return false; //The hardware doesn't support interlaced HEVC. return true; case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED: + if (u_reduce_video_profile(profile) == PIPE_VIDEO_FORMAT_HEVC) + return false; //The hardware doesn't support interlaced HEVC. return true; case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE: return true; @@ -300,6 +310,8 @@ int rvid_get_video_param(struct pipe_screen *screen, case PIPE_VIDEO_PROFILE_MPEG4_AVC_MAIN: case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH: return 41; + case PIPE_VIDEO_PROFILE_HEVC_MAIN: + return 186; default: return 0; } diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h index 3bfbb6d75b7..7ab6e56e099 100644 --- a/src/gallium/drivers/radeon/radeon_winsys.h +++ b/src/gallium/drivers/radeon/radeon_winsys.h @@ -42,12 +42,9 @@ #include "pipebuffer/pb_buffer.h" -#define RADEON_MAX_CMDBUF_DWORDS (16 * 1024) - #define RADEON_FLUSH_ASYNC (1 << 0) #define RADEON_FLUSH_KEEP_TILING_FLAGS (1 << 1) /* needs DRM 2.12.0 */ -#define RADEON_FLUSH_COMPUTE (1 << 2) -#define RADEON_FLUSH_END_OF_FRAME (1 << 3) +#define RADEON_FLUSH_END_OF_FRAME (1 << 2) /* Tiling flags. */ enum radeon_bo_layout { @@ -136,6 +133,10 @@ enum radeon_family { CHIP_KABINI, CHIP_HAWAII, CHIP_MULLINS, + CHIP_TONGA, + CHIP_ICELAND, + CHIP_CARRIZO, + CHIP_FIJI, CHIP_LAST, }; @@ -150,10 +151,12 @@ enum chip_class { CAYMAN, SI, CIK, + VI, }; enum ring_type { RING_GFX = 0, + RING_COMPUTE, RING_DMA, RING_UVD, RING_VCE, @@ -169,9 +172,10 @@ enum radeon_value_id { RADEON_NUM_BYTES_MOVED, RADEON_VRAM_USAGE, RADEON_GTT_USAGE, - RADEON_GPU_TEMPERATURE, + RADEON_GPU_TEMPERATURE, /* DRM 2.42.0 */ RADEON_CURRENT_SCLK, - RADEON_CURRENT_MCLK + RADEON_CURRENT_MCLK, + RADEON_GPU_RESET_COUNTER, /* DRM 2.43.0 */ }; enum radeon_bo_priority { @@ -192,9 +196,11 @@ enum radeon_bo_priority { struct winsys_handle; struct radeon_winsys_cs_handle; +struct radeon_winsys_ctx; struct radeon_winsys_cs { unsigned cdw; /* Number of used dwords. */ + unsigned max_dw; /* Maximum number of dwords. */ uint32_t *buf; /* The command buffer. 
*/ enum ring_type ring_type; }; @@ -238,6 +244,7 @@ struct radeon_info { boolean cik_macrotile_mode_array_valid; uint32_t cik_macrotile_mode_array[16]; + uint32_t vce_harvest_config; }; enum radeon_feature_id { @@ -317,6 +324,8 @@ struct radeon_surf { struct radeon_surf_level stencil_level[RADEON_SURF_MAX_LEVEL]; uint32_t tiling_index[RADEON_SURF_MAX_LEVEL]; uint32_t stencil_tiling_index[RADEON_SURF_MAX_LEVEL]; + uint32_t pipe_config; + uint32_t num_banks; }; struct radeon_winsys { @@ -398,24 +407,15 @@ struct radeon_winsys { void (*buffer_unmap)(struct radeon_winsys_cs_handle *buf); /** - * Return TRUE if a buffer object is being used by the GPU. - * - * \param buf A winsys buffer object. - * \param usage Only check whether the buffer is busy for the given usage. - */ - boolean (*buffer_is_busy)(struct pb_buffer *buf, - enum radeon_bo_usage usage); - - /** - * Wait for a buffer object until it is not used by a GPU. This is - * equivalent to a fence placed after the last command using the buffer, - * and synchronizing to the fence. + * Wait for the buffer and return true if the buffer is not used + * by the device. * - * \param buf A winsys buffer object to wait for. - * \param usage Only wait until the buffer is idle for the given usage, - * but may still be busy for some other usage. + * The timeout of 0 will only return the status. + * The timeout of PIPE_TIMEOUT_INFINITE will always wait until the buffer + * is idle. */ - void (*buffer_wait)(struct pb_buffer *buf, enum radeon_bo_usage usage); + bool (*buffer_wait)(struct pb_buffer *buf, uint64_t timeout, + enum radeon_bo_usage usage); /** * Return tiling flags describing a memory layout of a buffer object. @@ -450,10 +450,11 @@ struct radeon_winsys { struct radeon_winsys_cs *rcs, enum radeon_bo_layout microtile, enum radeon_bo_layout macrotile, + unsigned pipe_config, unsigned bankw, unsigned bankh, unsigned tile_split, unsigned stencil_tile_split, - unsigned mtilea, + unsigned mtilea, unsigned num_banks, unsigned stride, bool scanout); @@ -515,15 +516,31 @@ struct radeon_winsys { *************************************************************************/ /** + * Create a command submission context. + * Various command streams can be submitted to the same context. + */ + struct radeon_winsys_ctx *(*ctx_create)(struct radeon_winsys *ws); + + /** + * Destroy a context. + */ + void (*ctx_destroy)(struct radeon_winsys_ctx *ctx); + + /** + * Query a GPU reset status. + */ + enum pipe_reset_status (*ctx_query_reset_status)(struct radeon_winsys_ctx *ctx); + + /** * Create a command stream. * - * \param ws The winsys this function is called from. + * \param ctx The submission context * \param ring_type The ring type (GFX, DMA, UVD) * \param flush Flush callback function associated with the command stream. * \param user User pointer that will be passed to the flush callback. 
* \param trace_buf Trace buffer when tracing is enabled */ - struct radeon_winsys_cs *(*cs_create)(struct radeon_winsys *ws, + struct radeon_winsys_cs *(*cs_create)(struct radeon_winsys_ctx *ctx, enum ring_type ring_type, void (*flush)(void *ctx, unsigned flags, struct pipe_fence_handle **fence), @@ -668,12 +685,12 @@ struct radeon_winsys { }; -static INLINE void radeon_emit(struct radeon_winsys_cs *cs, uint32_t value) +static inline void radeon_emit(struct radeon_winsys_cs *cs, uint32_t value) { cs->buf[cs->cdw++] = value; } -static INLINE void radeon_emit_array(struct radeon_winsys_cs *cs, +static inline void radeon_emit_array(struct radeon_winsys_cs *cs, const uint32_t *values, unsigned count) { memcpy(cs->buf+cs->cdw, values, count * 4); diff --git a/src/gallium/drivers/radeonsi/Automake.inc b/src/gallium/drivers/radeonsi/Automake.inc index 8686fffd71c..5a9dcfd9fd6 100644 --- a/src/gallium/drivers/radeonsi/Automake.inc +++ b/src/gallium/drivers/radeonsi/Automake.inc @@ -5,10 +5,12 @@ TARGET_CPPFLAGS += -DGALLIUM_RADEONSI TARGET_LIB_DEPS += \ $(top_builddir)/src/gallium/drivers/radeonsi/libradeonsi.la \ $(RADEON_LIBS) \ - $(LIBDRM_LIBS) + $(LIBDRM_LIBS) \ + $(AMDGPU_LIBS) TARGET_RADEON_WINSYS = \ - $(top_builddir)/src/gallium/winsys/radeon/drm/libradeonwinsys.la + $(top_builddir)/src/gallium/winsys/radeon/drm/libradeonwinsys.la \ + $(top_builddir)/src/gallium/winsys/amdgpu/drm/libamdgpuwinsys.la TARGET_RADEON_COMMON = \ $(top_builddir)/src/gallium/drivers/radeon/libradeon.la diff --git a/src/gallium/drivers/radeonsi/Makefile.sources b/src/gallium/drivers/radeonsi/Makefile.sources index 2876c0ae735..a0b1414f4bb 100644 --- a/src/gallium/drivers/radeonsi/Makefile.sources +++ b/src/gallium/drivers/radeonsi/Makefile.sources @@ -3,6 +3,7 @@ C_SOURCES := \ si_blit.c \ si_commands.c \ si_compute.c \ + si_cp_dma.c \ si_descriptors.c \ sid.h \ si_dma.c \ diff --git a/src/gallium/drivers/radeonsi/cik_sdma.c b/src/gallium/drivers/radeonsi/cik_sdma.c index 86111cb86e8..47b586f171e 100644 --- a/src/gallium/drivers/radeonsi/cik_sdma.c +++ b/src/gallium/drivers/radeonsi/cik_sdma.c @@ -27,7 +27,7 @@ #include "sid.h" #include "si_pipe.h" -#include "../radeon/r600_cs.h" +#include "radeon/r600_cs.h" #include "util/u_format.h" diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c index 1f2c4082dbc..48972bd170c 100644 --- a/src/gallium/drivers/radeonsi/si_blit.c +++ b/src/gallium/drivers/radeonsi/si_blit.c @@ -57,17 +57,19 @@ static void si_blitter_begin(struct pipe_context *ctx, enum si_blitter_op op) util_blitter_save_rasterizer(sctx->blitter, sctx->queued.named.rasterizer); util_blitter_save_fragment_shader(sctx->blitter, sctx->ps_shader); util_blitter_save_geometry_shader(sctx->blitter, sctx->gs_shader); + util_blitter_save_tessctrl_shader(sctx->blitter, sctx->tcs_shader); + util_blitter_save_tesseval_shader(sctx->blitter, sctx->tes_shader); util_blitter_save_vertex_shader(sctx->blitter, sctx->vs_shader); util_blitter_save_vertex_elements(sctx->blitter, sctx->vertex_elements); if (sctx->queued.named.sample_mask) { util_blitter_save_sample_mask(sctx->blitter, sctx->queued.named.sample_mask->sample_mask); } - if (sctx->queued.named.viewport) { - util_blitter_save_viewport(sctx->blitter, &sctx->queued.named.viewport->viewport); + if (sctx->queued.named.viewport[0]) { + util_blitter_save_viewport(sctx->blitter, &sctx->queued.named.viewport[0]->viewport); } - if (sctx->queued.named.scissor) { - util_blitter_save_scissor(sctx->blitter, 
&sctx->queued.named.scissor->scissor); + if (sctx->queued.named.scissor[0]) { + util_blitter_save_scissor(sctx->blitter, &sctx->queued.named.scissor[0]->scissor); } util_blitter_save_vertex_buffer_slot(sctx->blitter, sctx->vertex_buffer); util_blitter_save_so_targets(sctx->blitter, sctx->b.streamout.num_targets, @@ -146,7 +148,7 @@ static void si_blit_decompress_depth(struct pipe_context *ctx, struct pipe_surface *zsurf, *cbsurf, surf_tmpl; sctx->dbcb_copy_sample = sample; - sctx->db_render_state.dirty = true; + si_mark_atom_dirty(sctx, &sctx->db_render_state); surf_tmpl.format = texture->resource.b.b.format; surf_tmpl.u.tex.level = level; @@ -180,7 +182,7 @@ static void si_blit_decompress_depth(struct pipe_context *ctx, sctx->dbcb_depth_copy_enabled = false; sctx->dbcb_stencil_copy_enabled = false; - sctx->db_render_state.dirty = true; + si_mark_atom_dirty(sctx, &sctx->db_render_state); } static void si_blit_decompress_depth_in_place(struct si_context *sctx, @@ -192,7 +194,7 @@ static void si_blit_decompress_depth_in_place(struct si_context *sctx, unsigned layer, max_layer, checked_last_layer, level; sctx->db_inplace_flush_enabled = true; - sctx->db_render_state.dirty = true; + si_mark_atom_dirty(sctx, &sctx->db_render_state); surf_tmpl.format = texture->resource.b.b.format; @@ -230,7 +232,7 @@ static void si_blit_decompress_depth_in_place(struct si_context *sctx, } sctx->db_inplace_flush_enabled = false; - sctx->db_render_state.dirty = true; + si_mark_atom_dirty(sctx, &sctx->db_render_state); } void si_flush_depth_textures(struct si_context *sctx, @@ -340,6 +342,8 @@ static void si_clear(struct pipe_context *ctx, unsigned buffers, if (buffers & PIPE_CLEAR_COLOR) { evergreen_do_fast_color_clear(&sctx->b, fb, &sctx->framebuffer.atom, &buffers, color); + if (!buffers) + return; /* all buffers have been fast cleared */ } if (buffers & PIPE_CLEAR_COLOR) { @@ -374,9 +378,9 @@ static void si_clear(struct pipe_context *ctx, unsigned buffers, } zstex->depth_clear_value = depth; - sctx->framebuffer.atom.dirty = true; /* updates DB_DEPTH_CLEAR */ + si_mark_atom_dirty(sctx, &sctx->framebuffer.atom); /* updates DB_DEPTH_CLEAR */ sctx->db_depth_clear = true; - sctx->db_render_state.dirty = true; + si_mark_atom_dirty(sctx, &sctx->db_render_state); } si_blitter_begin(ctx, SI_CLEAR); @@ -389,7 +393,7 @@ static void si_clear(struct pipe_context *ctx, unsigned buffers, sctx->db_depth_clear = false; sctx->db_depth_disable_expclear = false; zstex->depth_cleared = true; - sctx->db_render_state.dirty = true; + si_mark_atom_dirty(sctx, &sctx->db_render_state); } } @@ -455,89 +459,6 @@ struct texture_orig_info { unsigned npix0_y; }; -static void si_compressed_to_blittable(struct pipe_resource *tex, - unsigned level, - struct texture_orig_info *orig) -{ - struct r600_texture *rtex = (struct r600_texture*)tex; - unsigned pixsize = util_format_get_blocksize(rtex->resource.b.b.format); - int new_format; - int new_height, new_width; - - orig->format = tex->format; - orig->width0 = tex->width0; - orig->height0 = tex->height0; - orig->npix0_x = rtex->surface.level[0].npix_x; - orig->npix0_y = rtex->surface.level[0].npix_y; - orig->npix_x = rtex->surface.level[level].npix_x; - orig->npix_y = rtex->surface.level[level].npix_y; - - if (pixsize == 8) - new_format = PIPE_FORMAT_R16G16B16A16_UINT; /* 64-bit block */ - else - new_format = PIPE_FORMAT_R32G32B32A32_UINT; /* 128-bit block */ - - new_width = util_format_get_nblocksx(tex->format, orig->width0); - new_height = util_format_get_nblocksy(tex->format, orig->height0); - 
- tex->width0 = new_width; - tex->height0 = new_height; - tex->format = new_format; - rtex->surface.level[0].npix_x = util_format_get_nblocksx(orig->format, orig->npix0_x); - rtex->surface.level[0].npix_y = util_format_get_nblocksy(orig->format, orig->npix0_y); - rtex->surface.level[level].npix_x = util_format_get_nblocksx(orig->format, orig->npix_x); - rtex->surface.level[level].npix_y = util_format_get_nblocksy(orig->format, orig->npix_y); - - /* By dividing the dimensions by 4, we effectively decrement - * last_level by 2, therefore the last 2 mipmap levels disappear and - * aren't blittable. Note that the last 3 mipmap levels (4x4, 2x2, - * 1x1) have equal slice sizes, which is an important assumption - * for this to work. - * - * In order to make the last 2 mipmap levels blittable, we have to - * add the slice size of the last mipmap level to the texture - * address, so that even though the hw thinks it reads last_level-2, - * it will actually read last_level-1, and if we add the slice size*2, - * it will read last_level. That's how this workaround works. - */ - if (level > rtex->resource.b.b.last_level-2) - rtex->mipmap_shift = level - (rtex->resource.b.b.last_level-2); -} - -static void si_change_format(struct pipe_resource *tex, - unsigned level, - struct texture_orig_info *orig, - enum pipe_format format) -{ - struct r600_texture *rtex = (struct r600_texture*)tex; - - orig->format = tex->format; - orig->width0 = tex->width0; - orig->height0 = tex->height0; - orig->npix0_x = rtex->surface.level[0].npix_x; - orig->npix0_y = rtex->surface.level[0].npix_y; - orig->npix_x = rtex->surface.level[level].npix_x; - orig->npix_y = rtex->surface.level[level].npix_y; - - tex->format = format; -} - -static void si_reset_blittable_to_orig(struct pipe_resource *tex, - unsigned level, - struct texture_orig_info *orig) -{ - struct r600_texture *rtex = (struct r600_texture*)tex; - - tex->format = orig->format; - tex->width0 = orig->width0; - tex->height0 = orig->height0; - rtex->surface.level[0].npix_x = orig->npix0_x; - rtex->surface.level[0].npix_y = orig->npix0_y; - rtex->surface.level[level].npix_x = orig->npix_x; - rtex->surface.level[level].npix_y = orig->npix_y; - rtex->mipmap_shift = 0; -} - void si_resource_copy_region(struct pipe_context *ctx, struct pipe_resource *dst, unsigned dst_level, @@ -547,114 +468,116 @@ void si_resource_copy_region(struct pipe_context *ctx, const struct pipe_box *src_box) { struct si_context *sctx = (struct si_context *)ctx; - struct r600_texture *rdst = (struct r600_texture*)dst; struct pipe_surface *dst_view, dst_templ; struct pipe_sampler_view src_templ, *src_view; - struct texture_orig_info orig_info[2]; + unsigned dst_width, dst_height, src_width0, src_height0; + unsigned src_force_level = 0; struct pipe_box sbox, dstbox; - boolean restore_orig[2]; - /* Fallback for buffers. */ + /* Handle buffers first. */ if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) { si_copy_buffer(sctx, dst, src, dstx, src_box->x, src_box->width, false); return; } - memset(orig_info, 0, sizeof(orig_info)); + assert(u_max_sample(dst) == u_max_sample(src)); /* The driver doesn't decompress resources automatically while * u_blitter is rendering. 
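 *
 * For the compressed path below, a worked example of the reinterpretation:
 * a 255x255 DXT5 level (4x4 blocks, 128 bits each) is copied as a 64x64
 * R32G32B32A32_UINT image, because util_format_get_nblocksx/y round the
 * pixel dimensions up to whole blocks:
 *
 *   nblocksx = (255 + 3) / 4 = 64
 *
 * The same block scaling is applied to the source box and to dstx/dsty,
 * so the source and destination views stay consistent.
 *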
*/ si_decompress_subresource(ctx, src, src_level, src_box->z, src_box->z + src_box->depth - 1); - restore_orig[0] = restore_orig[1] = FALSE; + dst_width = u_minify(dst->width0, dst_level); + dst_height = u_minify(dst->height0, dst_level); + src_width0 = src->width0; + src_height0 = src->height0; + + util_blitter_default_dst_texture(&dst_templ, dst, dst_level, dstz); + util_blitter_default_src_texture(&src_templ, src, src_level); if (util_format_is_compressed(src->format) && util_format_is_compressed(dst->format)) { - si_compressed_to_blittable(src, src_level, &orig_info[0]); - restore_orig[0] = TRUE; - sbox.x = util_format_get_nblocksx(orig_info[0].format, src_box->x); - sbox.y = util_format_get_nblocksy(orig_info[0].format, src_box->y); + unsigned blocksize = util_format_get_blocksize(src->format); + + if (blocksize == 8) + src_templ.format = PIPE_FORMAT_R16G16B16A16_UINT; /* 64-bit block */ + else + src_templ.format = PIPE_FORMAT_R32G32B32A32_UINT; /* 128-bit block */ + dst_templ.format = src_templ.format; + + dst_width = util_format_get_nblocksx(dst->format, dst_width); + dst_height = util_format_get_nblocksy(dst->format, dst_height); + src_width0 = util_format_get_nblocksx(src->format, src_width0); + src_height0 = util_format_get_nblocksy(src->format, src_height0); + + dstx = util_format_get_nblocksx(dst->format, dstx); + dsty = util_format_get_nblocksy(dst->format, dsty); + + sbox.x = util_format_get_nblocksx(src->format, src_box->x); + sbox.y = util_format_get_nblocksy(src->format, src_box->y); sbox.z = src_box->z; - sbox.width = util_format_get_nblocksx(orig_info[0].format, src_box->width); - sbox.height = util_format_get_nblocksy(orig_info[0].format, src_box->height); + sbox.width = util_format_get_nblocksx(src->format, src_box->width); + sbox.height = util_format_get_nblocksy(src->format, src_box->height); sbox.depth = src_box->depth; src_box = &sbox; - si_compressed_to_blittable(dst, dst_level, &orig_info[1]); - restore_orig[1] = TRUE; - /* translate the dst box as well */ - dstx = util_format_get_nblocksx(orig_info[1].format, dstx); - dsty = util_format_get_nblocksy(orig_info[1].format, dsty); - } else if (!util_blitter_is_copy_supported(sctx->blitter, dst, src)) { + src_force_level = src_level; + } else if (!util_blitter_is_copy_supported(sctx->blitter, dst, src) || + /* also *8_SNORM has precision issues, use UNORM instead */ + util_format_is_snorm(src->format)) { if (util_format_is_subsampled_422(src->format)) { - /* XXX untested */ - si_change_format(src, src_level, &orig_info[0], - PIPE_FORMAT_R8G8B8A8_UINT); - si_change_format(dst, dst_level, &orig_info[1], - PIPE_FORMAT_R8G8B8A8_UINT); + src_templ.format = PIPE_FORMAT_R8G8B8A8_UINT; + dst_templ.format = PIPE_FORMAT_R8G8B8A8_UINT; + + dst_width = util_format_get_nblocksx(dst->format, dst_width); + src_width0 = util_format_get_nblocksx(src->format, src_width0); + + dstx = util_format_get_nblocksx(dst->format, dstx); sbox = *src_box; - sbox.x = util_format_get_nblocksx(orig_info[0].format, src_box->x); - sbox.width = util_format_get_nblocksx(orig_info[0].format, src_box->width); + sbox.x = util_format_get_nblocksx(src->format, src_box->x); + sbox.width = util_format_get_nblocksx(src->format, src_box->width); src_box = &sbox; - dstx = util_format_get_nblocksx(orig_info[1].format, dstx); - - restore_orig[0] = TRUE; - restore_orig[1] = TRUE; } else { unsigned blocksize = util_format_get_blocksize(src->format); switch (blocksize) { case 1: - si_change_format(src, src_level, &orig_info[0], - PIPE_FORMAT_R8_UNORM); - 
si_change_format(dst, dst_level, &orig_info[1], - PIPE_FORMAT_R8_UNORM); + dst_templ.format = PIPE_FORMAT_R8_UNORM; + src_templ.format = PIPE_FORMAT_R8_UNORM; break; case 2: - si_change_format(src, src_level, &orig_info[0], - PIPE_FORMAT_R8G8_UNORM); - si_change_format(dst, dst_level, &orig_info[1], - PIPE_FORMAT_R8G8_UNORM); + dst_templ.format = PIPE_FORMAT_R8G8_UNORM; + src_templ.format = PIPE_FORMAT_R8G8_UNORM; break; case 4: - si_change_format(src, src_level, &orig_info[0], - PIPE_FORMAT_R8G8B8A8_UNORM); - si_change_format(dst, dst_level, &orig_info[1], - PIPE_FORMAT_R8G8B8A8_UNORM); + dst_templ.format = PIPE_FORMAT_R8G8B8A8_UNORM; + src_templ.format = PIPE_FORMAT_R8G8B8A8_UNORM; break; case 8: - si_change_format(src, src_level, &orig_info[0], - PIPE_FORMAT_R16G16B16A16_UINT); - si_change_format(dst, dst_level, &orig_info[1], - PIPE_FORMAT_R16G16B16A16_UINT); + dst_templ.format = PIPE_FORMAT_R16G16B16A16_UINT; + src_templ.format = PIPE_FORMAT_R16G16B16A16_UINT; break; case 16: - si_change_format(src, src_level, &orig_info[0], - PIPE_FORMAT_R32G32B32A32_UINT); - si_change_format(dst, dst_level, &orig_info[1], - PIPE_FORMAT_R32G32B32A32_UINT); + dst_templ.format = PIPE_FORMAT_R32G32B32A32_UINT; + src_templ.format = PIPE_FORMAT_R32G32B32A32_UINT; break; default: fprintf(stderr, "Unhandled format %s with blocksize %u\n", util_format_short_name(src->format), blocksize); assert(0); } - restore_orig[0] = TRUE; - restore_orig[1] = TRUE; } } /* Initialize the surface. */ - util_blitter_default_dst_texture(&dst_templ, dst, dst_level, dstz); dst_view = r600_create_surface_custom(ctx, dst, &dst_templ, - rdst->surface.level[dst_level].npix_x, - rdst->surface.level[dst_level].npix_y); + dst_width, dst_height); /* Initialize the sampler view. */ - util_blitter_default_src_texture(&src_templ, src, src_level); - src_view = ctx->create_sampler_view(ctx, src, &src_templ); + src_view = si_create_sampler_view_custom(ctx, src, &src_templ, + src_width0, src_height0, + src_force_level); u_box_3d(dstx, dsty, dstz, abs(src_box->width), abs(src_box->height), abs(src_box->depth), &dstbox); @@ -662,18 +585,12 @@ void si_resource_copy_region(struct pipe_context *ctx, /* Copy. */ si_blitter_begin(ctx, SI_COPY); util_blitter_blit_generic(sctx->blitter, dst_view, &dstbox, - src_view, src_box, src->width0, src->height0, + src_view, src_box, src_width0, src_height0, PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL); si_blitter_end(ctx); pipe_surface_reference(&dst_view, NULL); pipe_sampler_view_reference(&src_view, NULL); - - if (restore_orig[0]) - si_reset_blittable_to_orig(src, src_level, &orig_info[0]); - - if (restore_orig[1]) - si_reset_blittable_to_orig(dst, dst_level, &orig_info[1]); } /* For MSAA integer resolving to work, we change the format to NORM using this function. */ diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 89bef2e7afd..d4fe5653687 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -137,14 +137,14 @@ static void *si_create_compute_state( } #else - radeon_elf_read(code, header->num_bytes, &program->shader.binary, true); + radeon_elf_read(code, header->num_bytes, &program->shader.binary); /* init_scratch_buffer patches the shader code with the scratch address, * so we need to call it before si_shader_binary_read() which uploads * the shader code to the GPU. 
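 *
 * I.e. the required order, restating the calls used here, is:
 *
 *   radeon_elf_read(code, header->num_bytes, &program->shader.binary);
 *   init_scratch_buffer(sctx, program);      (patch the scratch address)
 *   si_shader_binary_read(sctx->screen, &program->shader);      (upload)
 *
 * Swapping the last two steps would upload unpatched code.
 *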
*/ init_scratch_buffer(sctx, program); - si_shader_binary_read(sctx->screen, &program->shader, &program->shader.binary); + si_shader_binary_read(sctx->screen, &program->shader); #endif program->input_buffer = si_resource_create_custom(sctx->b.b.screen, @@ -309,8 +309,6 @@ static void si_launch_grid( kernel_args[i]); } - sctx->b.ws->buffer_unmap(input_buffer->cs_buf); - kernel_args_va = input_buffer->gpu_address; kernel_args_va += kernel_args_offset; diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c new file mode 100644 index 00000000000..f8a9da45a10 --- /dev/null +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -0,0 +1,265 @@ +/* + * Copyright 2013 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Marek Olšák <[email protected]> + */ + +#include "si_pipe.h" +#include "sid.h" +#include "radeon/r600_cs.h" + + +/* Set this if you want the 3D engine to wait until CP DMA is done. + * It should be set on the last CP DMA packet. */ +#define R600_CP_DMA_SYNC (1 << 0) /* R600+ */ + +/* Set this if the source data was used as a destination in a previous CP DMA + * packet. It's for preventing a read-after-write (RAW) hazard between two + * CP DMA packets. */ +#define SI_CP_DMA_RAW_WAIT (1 << 1) /* SI+ */ +#define CIK_CP_DMA_USE_L2 (1 << 2) + +/* Emit a CP DMA packet to do a copy from one buffer to another. + * The size must fit in bits [20:0]. + */ +static void si_emit_cp_dma_copy_buffer(struct si_context *sctx, + uint64_t dst_va, uint64_t src_va, + unsigned size, unsigned flags) +{ + struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? PKT3_CP_DMA_CP_SYNC : 0; + uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? PKT3_CP_DMA_CMD_RAW_WAIT : 0; + uint32_t sel = flags & CIK_CP_DMA_USE_L2 ? 
+ PKT3_CP_DMA_SRC_SEL(3) | PKT3_CP_DMA_DST_SEL(3) : 0; + + assert(size); + assert((size & ((1<<21)-1)) == size); + + if (sctx->b.chip_class >= CIK) { + radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); + radeon_emit(cs, sync_flag | sel); /* CP_SYNC [31] */ + radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ + radeon_emit(cs, src_va >> 32); /* SRC_ADDR_HI [31:0] */ + radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ + radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [31:0] */ + radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ + } else { + radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0)); + radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ + radeon_emit(cs, sync_flag | ((src_va >> 32) & 0xffff)); /* CP_SYNC [31] | SRC_ADDR_HI [15:0] */ + radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ + radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */ + radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ + } +} + +/* Emit a CP DMA packet to clear a buffer. The size must fit in bits [20:0]. */ +static void si_emit_cp_dma_clear_buffer(struct si_context *sctx, + uint64_t dst_va, unsigned size, + uint32_t clear_value, unsigned flags) +{ + struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? PKT3_CP_DMA_CP_SYNC : 0; + uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? PKT3_CP_DMA_CMD_RAW_WAIT : 0; + uint32_t dst_sel = flags & CIK_CP_DMA_USE_L2 ? PKT3_CP_DMA_DST_SEL(3) : 0; + + assert(size); + assert((size & ((1<<21)-1)) == size); + + if (sctx->b.chip_class >= CIK) { + radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); + radeon_emit(cs, sync_flag | dst_sel | PKT3_CP_DMA_SRC_SEL(2)); /* CP_SYNC [31] | SRC_SEL[30:29] */ + radeon_emit(cs, clear_value); /* DATA [31:0] */ + radeon_emit(cs, 0); + radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ + radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [15:0] */ + radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ + } else { + radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0)); + radeon_emit(cs, clear_value); /* DATA [31:0] */ + radeon_emit(cs, sync_flag | PKT3_CP_DMA_SRC_SEL(2)); /* CP_SYNC [31] | SRC_SEL[30:29] */ + radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ + radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */ + radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ + } +} + +/* The max number of bytes to copy per packet. */ +#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8) + +static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst, + unsigned offset, unsigned size, unsigned value, + bool is_framebuffer) +{ + struct si_context *sctx = (struct si_context*)ctx; + unsigned flush_flags, tc_l2_flag; + + if (!size) + return; + + /* Mark the buffer range of destination as valid (initialized), + * so that transfer_map knows it should wait for the GPU when mapping + * that range. */ + util_range_add(&r600_resource(dst)->valid_buffer_range, offset, + offset + size); + + /* Fallback for unaligned clears. */ + if (offset % 4 != 0 || size % 4 != 0) { + uint32_t *map = sctx->b.ws->buffer_map(r600_resource(dst)->cs_buf, + sctx->b.rings.gfx.cs, + PIPE_TRANSFER_WRITE); + size /= 4; + for (unsigned i = 0; i < size; i++) + *map++ = value; + return; + } + + uint64_t va = r600_resource(dst)->gpu_address + offset; + + /* Flush the caches where the resource is bound. 
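+ *
+ * Only the framebuffer caches are flushed when the destination is a
+ * framebuffer; everything else invalidates TC L1 and KCACHE, plus TC L2
+ * on SI only, since SI's CP DMA is uncached while CIK+ can write through
+ * L2 instead:
+ *
+ *   tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
+ *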
*/ + if (is_framebuffer) { + flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER; + tc_l2_flag = 0; + } else { + flush_flags = SI_CONTEXT_INV_TC_L1 | + (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) | + SI_CONTEXT_INV_KCACHE; + tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2; + } + + sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | + flush_flags; + + while (size) { + unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT); + unsigned dma_flags = tc_l2_flag; + + si_need_cs_space(sctx, 7 + (sctx->b.flags ? sctx->cache_flush.num_dw : 0), + FALSE); + + /* This must be done after need_cs_space. */ + r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, + (struct r600_resource*)dst, RADEON_USAGE_WRITE, + RADEON_PRIO_MIN); + + /* Flush the caches for the first copy only. + * Also wait for the previous CP DMA operations. */ + if (sctx->b.flags) { + si_emit_cache_flush(&sctx->b, NULL); + dma_flags |= SI_CP_DMA_RAW_WAIT; /* same as WAIT_UNTIL=CP_DMA_IDLE */ + } + + /* Do the synchronization after the last copy, so that all data is written to memory. */ + if (size == byte_count) + dma_flags |= R600_CP_DMA_SYNC; + + /* Emit the clear packet. */ + si_emit_cp_dma_clear_buffer(sctx, va, byte_count, value, dma_flags); + + size -= byte_count; + va += byte_count; + } + + /* Flush the caches again in case the 3D engine has been prefetching + * the resource. */ + sctx->b.flags |= flush_flags; + + if (tc_l2_flag) + r600_resource(dst)->TC_L2_dirty = true; +} + +void si_copy_buffer(struct si_context *sctx, + struct pipe_resource *dst, struct pipe_resource *src, + uint64_t dst_offset, uint64_t src_offset, unsigned size, + bool is_framebuffer) +{ + unsigned flush_flags, tc_l2_flag; + + if (!size) + return; + + /* Mark the buffer range of destination as valid (initialized), + * so that transfer_map knows it should wait for the GPU when mapping + * that range. */ + util_range_add(&r600_resource(dst)->valid_buffer_range, dst_offset, + dst_offset + size); + + dst_offset += r600_resource(dst)->gpu_address; + src_offset += r600_resource(src)->gpu_address; + + /* Flush the caches where the resource is bound. */ + if (is_framebuffer) { + flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER; + tc_l2_flag = 0; + } else { + flush_flags = SI_CONTEXT_INV_TC_L1 | + (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) | + SI_CONTEXT_INV_KCACHE; + tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2; + } + + sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | + flush_flags; + + while (size) { + unsigned sync_flags = tc_l2_flag; + unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT); + + si_need_cs_space(sctx, 7 + (sctx->b.flags ? sctx->cache_flush.num_dw : 0), FALSE); + + /* Flush the caches for the first copy only. Also wait for old CP DMA packets to complete. */ + if (sctx->b.flags) { + si_emit_cache_flush(&sctx->b, NULL); + sync_flags |= SI_CP_DMA_RAW_WAIT; + } + + /* Do the synchronization after the last copy, so that all data is written to memory. */ + if (size == byte_count) { + sync_flags |= R600_CP_DMA_SYNC; + } + + /* This must be done after r600_need_cs_space. 
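+ *
+ * (If si_need_cs_space() had to flush the IB to make room, a relocation
+ * added earlier would have gone out with the previous IB, and the new IB
+ * would miss it.) For the surrounding loop, the chunking in numbers:
+ * CP_DMA_MAX_BYTE_COUNT is (1 << 21) - 8 = 2097144, so a 5 MiB (5242880
+ * byte) copy is emitted as
+ *
+ *   2097144 + 2097144 + 1048592 bytes,
+ *
+ * with SI_CP_DMA_RAW_WAIT on the first packet (right after the cache
+ * flush) and R600_CP_DMA_SYNC only on the last one.
+ *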
*/ + r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)src, + RADEON_USAGE_READ, RADEON_PRIO_MIN); + r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)dst, + RADEON_USAGE_WRITE, RADEON_PRIO_MIN); + + si_emit_cp_dma_copy_buffer(sctx, dst_offset, src_offset, byte_count, sync_flags); + + size -= byte_count; + src_offset += byte_count; + dst_offset += byte_count; + } + + /* Flush the caches again in case the 3D engine has been prefetching + * the resource. */ + sctx->b.flags |= flush_flags; + + if (tc_l2_flag) + r600_resource(dst)->TC_L2_dirty = true; +} + +void si_init_cp_dma_functions(struct si_context *sctx) +{ + sctx->b.clear_buffer = si_clear_buffer; +} diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index bbfd36dcbeb..890be071596 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -24,14 +24,23 @@ * Marek Olšák <[email protected]> */ -/* Resource binding slots and sampler states (each described with 8 or 4 dwords) - * live in memory on SI. +/* Resource binding slots and sampler states (each described with 8 or + * 4 dwords) are stored in lists in memory which is accessed by shaders + * using scalar load instructions. * - * This file is responsible for managing lists of resources and sampler states - * in memory and binding them, which means updating those structures in memory. + * This file is responsible for managing such lists. It keeps a copy of all + * descriptors in CPU memory and re-uploads a whole list if some slots have + * been changed. * - * There is also code for updating shader pointers to resources and sampler - * states. CP DMA functions are here too. + * This code is also reponsible for updating shader pointers to those lists. + * + * Note that CP DMA can't be used for updating the lists, because a GPU hang + * could leave the list in a mid-IB state and the next IB would get wrong + * descriptors and the whole context would be unusable at that point. + * (Note: The register shadowing can't be used due to the same reason) + * + * Also, uploading descriptors to newly allocated memory doesn't require + * a KCACHE flush. */ #include "radeon/r600_cs.h" @@ -42,7 +51,6 @@ #include "util/u_memory.h" #include "util/u_upload_mgr.h" -#define SI_NUM_CONTEXTS 16 /* NULL image and buffer descriptor. * @@ -64,284 +72,62 @@ static uint32_t null_descriptor[8] = { * descriptor */ }; -/* Set this if you want the 3D engine to wait until CP DMA is done. - * It should be set on the last CP DMA packet. */ -#define R600_CP_DMA_SYNC (1 << 0) /* R600+ */ - -/* Set this if the source data was used as a destination in a previous CP DMA - * packet. It's for preventing a read-after-write (RAW) hazard between two - * CP DMA packets. */ -#define SI_CP_DMA_RAW_WAIT (1 << 1) /* SI+ */ -#define CIK_CP_DMA_USE_L2 (1 << 2) - -/* Emit a CP DMA packet to do a copy from one buffer to another. - * The size must fit in bits [20:0]. - */ -static void si_emit_cp_dma_copy_buffer(struct si_context *sctx, - uint64_t dst_va, uint64_t src_va, - unsigned size, unsigned flags) -{ - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; - uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? PKT3_CP_DMA_CP_SYNC : 0; - uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? PKT3_CP_DMA_CMD_RAW_WAIT : 0; - uint32_t sel = flags & CIK_CP_DMA_USE_L2 ? 
- PKT3_CP_DMA_SRC_SEL(3) | PKT3_CP_DMA_DST_SEL(3) : 0; - - assert(size); - assert((size & ((1<<21)-1)) == size); - - if (sctx->b.chip_class >= CIK) { - radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); - radeon_emit(cs, sync_flag | sel); /* CP_SYNC [31] */ - radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ - radeon_emit(cs, src_va >> 32); /* SRC_ADDR_HI [31:0] */ - radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ - radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [31:0] */ - radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ - } else { - radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0)); - radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ - radeon_emit(cs, sync_flag | ((src_va >> 32) & 0xffff)); /* CP_SYNC [31] | SRC_ADDR_HI [15:0] */ - radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ - radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */ - radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ - } -} - -/* Emit a CP DMA packet to clear a buffer. The size must fit in bits [20:0]. */ -static void si_emit_cp_dma_clear_buffer(struct si_context *sctx, - uint64_t dst_va, unsigned size, - uint32_t clear_value, unsigned flags) -{ - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; - uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? PKT3_CP_DMA_CP_SYNC : 0; - uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? PKT3_CP_DMA_CMD_RAW_WAIT : 0; - uint32_t dst_sel = flags & CIK_CP_DMA_USE_L2 ? PKT3_CP_DMA_DST_SEL(3) : 0; - - assert(size); - assert((size & ((1<<21)-1)) == size); - - if (sctx->b.chip_class >= CIK) { - radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); - radeon_emit(cs, sync_flag | dst_sel | PKT3_CP_DMA_SRC_SEL(2)); /* CP_SYNC [31] | SRC_SEL[30:29] */ - radeon_emit(cs, clear_value); /* DATA [31:0] */ - radeon_emit(cs, 0); - radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ - radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [15:0] */ - radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ - } else { - radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0)); - radeon_emit(cs, clear_value); /* DATA [31:0] */ - radeon_emit(cs, sync_flag | PKT3_CP_DMA_SRC_SEL(2)); /* CP_SYNC [31] | SRC_SEL[30:29] */ - radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ - radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */ - radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ - } -} - -static void si_init_descriptors(struct si_context *sctx, - struct si_descriptors *desc, - unsigned shader_userdata_reg, +static void si_init_descriptors(struct si_descriptors *desc, + unsigned shader_userdata_index, unsigned element_dw_size, - unsigned num_elements, - void (*emit_func)(struct si_context *ctx, struct r600_atom *state)) + unsigned num_elements) { + int i; + assert(num_elements <= sizeof(desc->enabled_mask)*8); - assert(num_elements <= sizeof(desc->dirty_mask)*8); - desc->atom.emit = (void*)emit_func; - desc->shader_userdata_reg = shader_userdata_reg; + desc->list = CALLOC(num_elements, element_dw_size * 4); desc->element_dw_size = element_dw_size; desc->num_elements = num_elements; - desc->context_size = num_elements * element_dw_size * 4; - - desc->buffer = (struct r600_resource*) - pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM, - PIPE_USAGE_DEFAULT, - SI_NUM_CONTEXTS * desc->context_size); - - r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, desc->buffer, - RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA); - - /* We don't check for CS space here, because this should be called - * only once at context initialization. 
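 *
 * (In the replacement below there is no GPU-side arena to initialize at
 * all: si_init_descriptors() only CALLOCs a CPU copy of the list, and
 * si_upload_descriptors() re-uploads the whole list into freshly
 * allocated memory whenever list_dirty is set:
 *
 *   u_upload_alloc(sctx->b.uploader, 0, list_size, &desc->buffer_offset,
 *                  (struct pipe_resource**)&desc->buffer, &ptr);
 *   util_memcpy_cpu_to_le32(ptr, desc->list, list_size);
 *
 * which is also why descriptor updates no longer need a KCACHE flush.)
 *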
*/ - si_emit_cp_dma_clear_buffer(sctx, desc->buffer->gpu_address, - desc->buffer->b.b.width0, 0, - R600_CP_DMA_SYNC | CIK_CP_DMA_USE_L2); + desc->list_dirty = true; /* upload the list before the next draw */ + desc->shader_userdata_offset = shader_userdata_index * 4; + + /* Initialize the array to NULL descriptors if the element size is 8. */ + if (element_dw_size == 8) + for (i = 0; i < num_elements; i++) + memcpy(desc->list + i*element_dw_size, null_descriptor, + sizeof(null_descriptor)); } static void si_release_descriptors(struct si_descriptors *desc) { pipe_resource_reference((struct pipe_resource**)&desc->buffer, NULL); + FREE(desc->list); } -static void si_update_descriptors(struct si_context *sctx, +static bool si_upload_descriptors(struct si_context *sctx, struct si_descriptors *desc) { - if (desc->dirty_mask) { - desc->atom.num_dw = - 7 + /* copy */ - (4 + desc->element_dw_size) * util_bitcount64(desc->dirty_mask) + /* update */ - 4; /* pointer update */ - - if (desc->shader_userdata_reg >= R_00B130_SPI_SHADER_USER_DATA_VS_0 && - desc->shader_userdata_reg < R_00B230_SPI_SHADER_USER_DATA_GS_0) - desc->atom.num_dw += 4; /* second pointer update */ - - desc->atom.dirty = true; - - /* TODO: Investigate if these flushes can be removed after - * adding CE support. */ - - /* The descriptors are read with the K cache. */ - sctx->b.flags |= SI_CONTEXT_INV_KCACHE; - - /* Since SI uses uncached CP DMA to update descriptors, - * we have to flush TC L2, which is used to fetch constants - * along with KCACHE. */ - if (sctx->b.chip_class == SI) - sctx->b.flags |= SI_CONTEXT_INV_TC_L2; - } else { - desc->atom.dirty = false; - } -} + unsigned list_size = desc->num_elements * desc->element_dw_size * 4; + void *ptr; -static void si_emit_shader_pointer(struct si_context *sctx, - struct r600_atom *atom) -{ - struct si_descriptors *desc = (struct si_descriptors*)atom; - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; - uint64_t va = desc->buffer->gpu_address + - desc->current_context_id * desc->context_size + - desc->buffer_offset; + if (!desc->list_dirty) + return true; - radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0)); - radeon_emit(cs, (desc->shader_userdata_reg - SI_SH_REG_OFFSET) >> 2); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); + u_upload_alloc(sctx->b.uploader, 0, list_size, + &desc->buffer_offset, + (struct pipe_resource**)&desc->buffer, &ptr); + if (!desc->buffer) + return false; /* skip the draw call */ - if (desc->shader_userdata_reg >= R_00B130_SPI_SHADER_USER_DATA_VS_0 && - desc->shader_userdata_reg < R_00B230_SPI_SHADER_USER_DATA_GS_0) { - radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0)); - radeon_emit(cs, (desc->shader_userdata_reg + - (R_00B330_SPI_SHADER_USER_DATA_ES_0 - - R_00B130_SPI_SHADER_USER_DATA_VS_0) - - SI_SH_REG_OFFSET) >> 2); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - } -} + util_memcpy_cpu_to_le32(ptr, desc->list, list_size); -static void si_emit_descriptors(struct si_context *sctx, - struct si_descriptors *desc, - uint32_t **descriptors) -{ - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; - uint64_t va_base; - int packet_start = 0; - int packet_size = 0; - int last_index = desc->num_elements; /* point to a non-existing element */ - uint64_t dirty_mask = desc->dirty_mask; - unsigned new_context_id = (desc->current_context_id + 1) % SI_NUM_CONTEXTS; - - assert(dirty_mask); - - va_base = desc->buffer->gpu_address; - - /* Copy the descriptors to a new context slot. 
*/ - si_emit_cp_dma_copy_buffer(sctx, - va_base + new_context_id * desc->context_size, - va_base + desc->current_context_id * desc->context_size, - desc->context_size, R600_CP_DMA_SYNC | CIK_CP_DMA_USE_L2); - - va_base += new_context_id * desc->context_size; - - /* Update the descriptors. - * Updates of consecutive descriptors are merged to one WRITE_DATA packet. - * - * XXX When unbinding lots of resources, consider clearing the memory - * with CP DMA instead of emitting zeros. - */ - while (dirty_mask) { - int i = u_bit_scan64(&dirty_mask); - - assert(i < desc->num_elements); - - if (last_index+1 == i && packet_size) { - /* Append new data at the end of the last packet. */ - packet_size += desc->element_dw_size; - cs->buf[packet_start] = PKT3(PKT3_WRITE_DATA, packet_size, 0); - } else { - /* Start a new packet. */ - uint64_t va = va_base + i * desc->element_dw_size * 4; - - packet_start = cs->cdw; - packet_size = 2 + desc->element_dw_size; - - radeon_emit(cs, PKT3(PKT3_WRITE_DATA, packet_size, 0)); - radeon_emit(cs, PKT3_WRITE_DATA_DST_SEL(sctx->b.chip_class == SI ? - PKT3_WRITE_DATA_DST_SEL_MEM_SYNC : - PKT3_WRITE_DATA_DST_SEL_TC_L2) | - PKT3_WRITE_DATA_WR_CONFIRM | - PKT3_WRITE_DATA_ENGINE_SEL(PKT3_WRITE_DATA_ENGINE_SEL_ME)); - radeon_emit(cs, va & 0xFFFFFFFFUL); - radeon_emit(cs, (va >> 32UL) & 0xFFFFFFFFUL); - } - - radeon_emit_array(cs, descriptors[i], desc->element_dw_size); - - last_index = i; - } - - desc->dirty_mask = 0; - desc->current_context_id = new_context_id; - - /* Now update the shader userdata pointer. */ - si_emit_shader_pointer(sctx, &desc->atom); -} + r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, desc->buffer, + RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA); -static unsigned si_get_shader_user_data_base(unsigned shader) -{ - switch (shader) { - case PIPE_SHADER_VERTEX: - return R_00B130_SPI_SHADER_USER_DATA_VS_0; - case PIPE_SHADER_GEOMETRY: - return R_00B230_SPI_SHADER_USER_DATA_GS_0; - case PIPE_SHADER_FRAGMENT: - return R_00B030_SPI_SHADER_USER_DATA_PS_0; - default: - assert(0); - return 0; - } + desc->list_dirty = false; + desc->pointer_dirty = true; + si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom); + return true; } /* SAMPLER VIEWS */ -static void si_emit_sampler_views(struct si_context *sctx, struct r600_atom *atom) -{ - struct si_sampler_views *views = (struct si_sampler_views*)atom; - - si_emit_descriptors(sctx, &views->desc, views->desc_data); -} - -static void si_init_sampler_views(struct si_context *sctx, - struct si_sampler_views *views, - unsigned shader) -{ - int i; - - si_init_descriptors(sctx, &views->desc, - si_get_shader_user_data_base(shader) + - SI_SGPR_RESOURCE * 4, - 8, SI_NUM_SAMPLER_VIEWS, si_emit_sampler_views); - - for (i = 0; i < views->desc.num_elements; i++) { - views->desc_data[i] = null_descriptor; - views->desc.dirty_mask |= 1llu << i; - } - si_update_descriptors(sctx, &views->desc); -} - static void si_release_sampler_views(struct si_sampler_views *views) { int i; @@ -382,10 +168,10 @@ static void si_sampler_views_begin_new_cs(struct si_context *sctx, si_get_resource_ro_priority(rview->resource)); } + if (!views->desc.buffer) + return; r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, views->desc.buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA); - - si_emit_shader_pointer(sctx, &views->desc.atom); } static void si_set_sampler_view(struct si_context *sctx, unsigned shader, @@ -406,17 +192,16 @@ static void si_set_sampler_view(struct si_context *sctx, unsigned shader, rview->resource, RADEON_USAGE_READ, 
si_get_resource_ro_priority(rview->resource)); - pipe_sampler_view_reference(&views->views[slot], view); - views->desc_data[slot] = view_desc; + memcpy(views->desc.list + slot*8, view_desc, 8*4); views->desc.enabled_mask |= 1llu << slot; } else { pipe_sampler_view_reference(&views->views[slot], NULL); - views->desc_data[slot] = null_descriptor; + memcpy(views->desc.list + slot*8, null_descriptor, 8*4); views->desc.enabled_mask &= ~(1llu << slot); } - views->desc.dirty_mask |= 1llu << slot; + views->desc.list_dirty = true; } static void si_set_sampler_views(struct pipe_context *ctx, @@ -475,25 +260,17 @@ static void si_set_sampler_views(struct pipe_context *ctx, NULL, NULL); } } - - si_update_descriptors(sctx, &samplers->views.desc); } /* SAMPLER STATES */ -static void si_emit_sampler_states(struct si_context *sctx, struct r600_atom *atom) -{ - struct si_sampler_states *states = (struct si_sampler_states*)atom; - - si_emit_descriptors(sctx, &states->desc, states->desc_data); -} - static void si_sampler_states_begin_new_cs(struct si_context *sctx, struct si_sampler_states *states) { + if (!states->desc.buffer) + return; r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, states->desc.buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA); - si_emit_shader_pointer(sctx, &states->desc.atom); } void si_set_sampler_descriptors(struct si_context *sctx, unsigned shader, @@ -513,66 +290,39 @@ void si_set_sampler_descriptors(struct si_context *sctx, unsigned shader, for (i = 0; i < count; i++) { unsigned slot = start + i; - if (!sstates[i]) { - samplers->desc.dirty_mask &= ~(1llu << slot); + if (!sstates[i]) continue; - } - samplers->desc_data[slot] = sstates[i]->val; - samplers->desc.dirty_mask |= 1llu << slot; + memcpy(samplers->desc.list + slot*4, sstates[i]->val, 4*4); + samplers->desc.list_dirty = true; } - - si_update_descriptors(sctx, &samplers->desc); } /* BUFFER RESOURCES */ -static void si_emit_buffer_resources(struct si_context *sctx, struct r600_atom *atom) -{ - struct si_buffer_resources *buffers = (struct si_buffer_resources*)atom; - - si_emit_descriptors(sctx, &buffers->desc, buffers->desc_data); -} - -static void si_init_buffer_resources(struct si_context *sctx, - struct si_buffer_resources *buffers, - unsigned num_buffers, unsigned shader, +static void si_init_buffer_resources(struct si_buffer_resources *buffers, + unsigned num_buffers, unsigned shader_userdata_index, enum radeon_bo_usage shader_usage, enum radeon_bo_priority priority) { - int i; - - buffers->num_buffers = num_buffers; buffers->shader_usage = shader_usage; buffers->priority = priority; buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*)); - buffers->desc_storage = CALLOC(num_buffers, sizeof(uint32_t) * 4); - - /* si_emit_descriptors only accepts an array of arrays. - * This adds such an array. 
*/ - buffers->desc_data = CALLOC(num_buffers, sizeof(uint32_t*)); - for (i = 0; i < num_buffers; i++) { - buffers->desc_data[i] = &buffers->desc_storage[i*4]; - } - si_init_descriptors(sctx, &buffers->desc, - si_get_shader_user_data_base(shader) + - shader_userdata_index*4, 4, num_buffers, - si_emit_buffer_resources); + si_init_descriptors(&buffers->desc, shader_userdata_index, 4, + num_buffers); } static void si_release_buffer_resources(struct si_buffer_resources *buffers) { int i; - for (i = 0; i < buffers->num_buffers; i++) { + for (i = 0; i < buffers->desc.num_elements; i++) { pipe_resource_reference(&buffers->buffers[i], NULL); } FREE(buffers->buffers); - FREE(buffers->desc_storage); - FREE(buffers->desc_data); si_release_descriptors(&buffers->desc); } @@ -590,11 +340,11 @@ static void si_buffer_resources_begin_new_cs(struct si_context *sctx, buffers->shader_usage, buffers->priority); } + if (!buffers->desc.buffer) + return; r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, buffers->desc.buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA); - - si_emit_shader_pointer(sctx, &buffers->desc.atom); } /* VERTEX BUFFERS */ @@ -617,14 +367,15 @@ static void si_vertex_buffers_begin_new_cs(struct si_context *sctx) (struct r600_resource*)sctx->vertex_buffer[vb].buffer, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO); } + + if (!desc->buffer) + return; r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, desc->buffer, RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA); - - si_emit_shader_pointer(sctx, &desc->atom); } -void si_update_vertex_buffers(struct si_context *sctx) +static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx) { struct si_descriptors *desc = &sctx->vertex_buffers; bool bound[SI_NUM_VERTEX_BUFFERS] = {}; @@ -632,8 +383,10 @@ void si_update_vertex_buffers(struct si_context *sctx) uint64_t va; uint32_t *ptr; + if (!sctx->vertex_buffers_dirty) + return true; if (!count || !sctx->vertex_elements) - return; + return true; /* Vertex buffer descriptors are the only ones which are uploaded * directly through a staging buffer and don't go through @@ -641,13 +394,14 @@ void si_update_vertex_buffers(struct si_context *sctx) */ u_upload_alloc(sctx->b.uploader, 0, count * 16, &desc->buffer_offset, (struct pipe_resource**)&desc->buffer, (void**)&ptr); + if (!desc->buffer) + return false; r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, desc->buffer, RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA); assert(count <= SI_NUM_VERTEX_BUFFERS); - assert(desc->current_context_id == 0); for (i = 0; i < count; i++) { struct pipe_vertex_element *ve = &sctx->vertex_elements->elements[i]; @@ -675,7 +429,8 @@ void si_update_vertex_buffers(struct si_context *sctx) desc[0] = va & 0xFFFFFFFF; desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(vb->stride); - if (vb->stride) + + if (sctx->b.chip_class <= CIK && vb->stride) /* Round up by rounding down and adding 1 */ desc[2] = (vb->buffer->width0 - offset - sctx->vertex_elements->format_size[i]) / @@ -693,13 +448,14 @@ void si_update_vertex_buffers(struct si_context *sctx) } } - desc->atom.num_dw = 8; /* update 2 shader pointers (VS+ES) */ - desc->atom.dirty = true; - /* Don't flush the const cache. It would have a very negative effect * on performance (confirmed by testing). New descriptors are always * uploaded to a fresh new buffer, so I don't think flushing the const * cache is needed. 
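 *
 * (For the NUM_RECORDS computation above, "round up by rounding down and
 * adding 1" in numbers: with width0 = 1000, offset = 4, stride = 16 and
 * format_size = 12, element i is fetchable iff
 * offset + i*stride + format_size <= width0, so
 *
 *   desc[2] = (1000 - 4 - 12) / 16 + 1 = 61 + 1 = 62 records,
 *
 * the last of which ends at byte 4 + 61*16 + 12 = 992 <= 1000.)
 *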
*/ + desc->pointer_dirty = true; + si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom); + sctx->vertex_buffers_dirty = false; + return true; } @@ -724,7 +480,7 @@ static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s if (shader >= SI_NUM_SHADERS) return; - assert(slot < buffers->num_buffers); + assert(slot < buffers->desc.num_elements); pipe_resource_reference(&buffers->buffers[slot], NULL); /* CIK cannot unbind a constant buffer (S_BUFFER_LOAD is buggy @@ -751,7 +507,7 @@ static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s } /* Set the descriptor. */ - uint32_t *desc = buffers->desc_data[slot]; + uint32_t *desc = buffers->desc.list + slot*4; desc[0] = va; desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(0); @@ -770,12 +526,11 @@ static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s buffers->desc.enabled_mask |= 1llu << slot; } else { /* Clear the descriptor. */ - memset(buffers->desc_data[slot], 0, sizeof(uint32_t) * 4); + memset(buffers->desc.list + slot*4, 0, sizeof(uint32_t) * 4); buffers->desc.enabled_mask &= ~(1llu << slot); } - buffers->desc.dirty_mask |= 1llu << slot; - si_update_descriptors(sctx, &buffers->desc); + buffers->desc.list_dirty = true; } /* RING BUFFERS */ @@ -784,7 +539,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot, struct pipe_resource *buffer, unsigned stride, unsigned num_records, bool add_tid, bool swizzle, - unsigned element_size, unsigned index_stride) + unsigned element_size, unsigned index_stride, uint64_t offset) { struct si_context *sctx = (struct si_context *)ctx; struct si_buffer_resources *buffers = &sctx->rw_buffers[shader]; @@ -795,13 +550,13 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot, /* The stride field in the resource descriptor has 14 bits */ assert(stride < (1 << 14)); - assert(slot < buffers->num_buffers); + assert(slot < buffers->desc.num_elements); pipe_resource_reference(&buffers->buffers[slot], NULL); if (buffer) { uint64_t va; - va = r600_resource(buffer)->gpu_address; + va = r600_resource(buffer)->gpu_address + offset; switch (element_size) { default: @@ -839,8 +594,11 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot, break; } + if (sctx->b.chip_class >= VI && stride) + num_records *= stride; + /* Set the descriptor. */ - uint32_t *desc = buffers->desc_data[slot]; + uint32_t *desc = buffers->desc.list + slot*4; desc[0] = va; desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride) | @@ -863,12 +621,11 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot, buffers->desc.enabled_mask |= 1llu << slot; } else { /* Clear the descriptor. */ - memset(buffers->desc_data[slot], 0, sizeof(uint32_t) * 4); + memset(buffers->desc.list + slot*4, 0, sizeof(uint32_t) * 4); buffers->desc.enabled_mask &= ~(1llu << slot); } - buffers->desc.dirty_mask |= 1llu << slot; - si_update_descriptors(sctx, &buffers->desc); + buffers->desc.list_dirty = true; } /* STREAMOUT BUFFERS */ @@ -929,15 +686,21 @@ static void si_set_streamout_targets(struct pipe_context *ctx, struct pipe_resource *buffer = targets[i]->buffer; uint64_t va = r600_resource(buffer)->gpu_address; - /* Set the descriptor. */ - uint32_t *desc = buffers->desc_data[bufidx]; + /* Set the descriptor. + * + * On VI, the format must be non-INVALID, otherwise + * the buffer will be considered not bound and store + * instructions will be no-ops. 
+ */ + uint32_t *desc = buffers->desc.list + bufidx*4; desc[0] = va; desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32); desc[2] = 0xffffffff; desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | - S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); /* Set the resource. */ pipe_resource_reference(&buffers->buffers[bufidx], @@ -948,24 +711,22 @@ static void si_set_streamout_targets(struct pipe_context *ctx, buffers->desc.enabled_mask |= 1llu << bufidx; } else { /* Clear the descriptor and unset the resource. */ - memset(buffers->desc_data[bufidx], 0, + memset(buffers->desc.list + bufidx*4, 0, sizeof(uint32_t) * 4); pipe_resource_reference(&buffers->buffers[bufidx], NULL); buffers->desc.enabled_mask &= ~(1llu << bufidx); } - buffers->desc.dirty_mask |= 1llu << bufidx; } for (; i < old_num_targets; i++) { bufidx = SI_SO_BUF_OFFSET + i; /* Clear the descriptor and unset the resource. */ - memset(buffers->desc_data[bufidx], 0, sizeof(uint32_t) * 4); + memset(buffers->desc.list + bufidx*4, 0, sizeof(uint32_t) * 4); pipe_resource_reference(&buffers->buffers[bufidx], NULL); buffers->desc.enabled_mask &= ~(1llu << bufidx); - buffers->desc.dirty_mask |= 1llu << bufidx; } - si_update_descriptors(sctx, &buffers->desc); + buffers->desc.list_dirty = true; } static void si_desc_reset_buffer_offset(struct pipe_context *ctx, @@ -1034,22 +795,19 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource /* Read/Write buffers. */ for (shader = 0; shader < SI_NUM_SHADERS; shader++) { struct si_buffer_resources *buffers = &sctx->rw_buffers[shader]; - bool found = false; uint64_t mask = buffers->desc.enabled_mask; while (mask) { i = u_bit_scan64(&mask); if (buffers->buffers[i] == buf) { - si_desc_reset_buffer_offset(ctx, buffers->desc_data[i], + si_desc_reset_buffer_offset(ctx, buffers->desc.list + i*4, old_va, buf); + buffers->desc.list_dirty = true; r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, rbuffer, buffers->shader_usage, buffers->priority); - buffers->desc.dirty_mask |= 1llu << i; - found = true; - if (i >= SI_SO_BUF_OFFSET && shader == PIPE_SHADER_VERTEX) { /* Update the streamout state. */ if (sctx->b.streamout.begin_emitted) { @@ -1061,34 +819,25 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource } } } - if (found) { - si_update_descriptors(sctx, &buffers->desc); - } } /* Constant buffers. */ for (shader = 0; shader < SI_NUM_SHADERS; shader++) { struct si_buffer_resources *buffers = &sctx->const_buffers[shader]; - bool found = false; uint64_t mask = buffers->desc.enabled_mask; while (mask) { unsigned i = u_bit_scan64(&mask); if (buffers->buffers[i] == buf) { - si_desc_reset_buffer_offset(ctx, buffers->desc_data[i], + si_desc_reset_buffer_offset(ctx, buffers->desc.list + i*4, old_va, buf); + buffers->desc.list_dirty = true; r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, rbuffer, buffers->shader_usage, buffers->priority); - - buffers->desc.dirty_mask |= 1llu << i; - found = true; } } - if (found) { - si_update_descriptors(sctx, &buffers->desc); - } } /* Texture buffers - update virtual addresses in sampler view descriptors. */ @@ -1100,223 +849,211 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource /* Texture buffers - update bindings. 
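 *
 * As in the other invalidation loops above, only bound slots are visited,
 * by peeling bits off enabled_mask:
 *
 *   uint64_t mask = views->desc.enabled_mask;
 *   while (mask) {
 *           unsigned i = u_bit_scan64(&mask);
 *           ...
 *   }
 *
 * where u_bit_scan64() returns the index of the lowest set bit and clears
 * it, so the loop ends once every bound slot has been handled.
 *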
*/ for (shader = 0; shader < SI_NUM_SHADERS; shader++) { struct si_sampler_views *views = &sctx->samplers[shader].views; - bool found = false; uint64_t mask = views->desc.enabled_mask; while (mask) { unsigned i = u_bit_scan64(&mask); if (views->views[i]->texture == buf) { + si_desc_reset_buffer_offset(ctx, views->desc.list + i*8+4, + old_va, buf); + views->desc.list_dirty = true; + r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, rbuffer, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO); - - views->desc.dirty_mask |= 1llu << i; - found = true; } } - if (found) { - si_update_descriptors(sctx, &views->desc); - } } } -/* CP DMA */ - -/* The max number of bytes to copy per packet. */ -#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8) +/* SHADER USER DATA */ -static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst, - unsigned offset, unsigned size, unsigned value, - bool is_framebuffer) +static void si_mark_shader_pointers_dirty(struct si_context *sctx, + unsigned shader) { - struct si_context *sctx = (struct si_context*)ctx; - unsigned flush_flags, tc_l2_flag; + sctx->const_buffers[shader].desc.pointer_dirty = true; + sctx->rw_buffers[shader].desc.pointer_dirty = true; + sctx->samplers[shader].views.desc.pointer_dirty = true; + sctx->samplers[shader].states.desc.pointer_dirty = true; - if (!size) - return; + if (shader == PIPE_SHADER_VERTEX) + sctx->vertex_buffers.pointer_dirty = true; - /* Mark the buffer range of destination as valid (initialized), - * so that transfer_map knows it should wait for the GPU when mapping - * that range. */ - util_range_add(&r600_resource(dst)->valid_buffer_range, offset, - offset + size); - - /* Fallback for unaligned clears. */ - if (offset % 4 != 0 || size % 4 != 0) { - uint32_t *map = sctx->b.ws->buffer_map(r600_resource(dst)->cs_buf, - sctx->b.rings.gfx.cs, - PIPE_TRANSFER_WRITE); - size /= 4; - for (unsigned i = 0; i < size; i++) - *map++ = value; - return; - } + si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom); +} - uint64_t va = r600_resource(dst)->gpu_address + offset; +static void si_shader_userdata_begin_new_cs(struct si_context *sctx) +{ + int i; - /* Flush the caches where the resource is bound. */ - if (is_framebuffer) { - flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER; - tc_l2_flag = 0; - } else { - flush_flags = SI_CONTEXT_INV_TC_L1 | - (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) | - SI_CONTEXT_INV_KCACHE; - tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2; + for (i = 0; i < SI_NUM_SHADERS; i++) { + si_mark_shader_pointers_dirty(sctx, i); } +} - sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | - flush_flags; - - while (size) { - unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT); - unsigned dma_flags = tc_l2_flag; - - si_need_cs_space(sctx, 7 + (sctx->b.flags ? sctx->cache_flush.num_dw : 0), - FALSE); - - /* This must be done after need_cs_space. */ - r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, - (struct r600_resource*)dst, RADEON_USAGE_WRITE, - RADEON_PRIO_MIN); - - /* Flush the caches for the first copy only. - * Also wait for the previous CP DMA operations. */ - if (sctx->b.flags) { - si_emit_cache_flush(&sctx->b, NULL); - dma_flags |= SI_CP_DMA_RAW_WAIT; /* same as WAIT_UNTIL=CP_DMA_IDLE */ - } - - /* Do the synchronization after the last copy, so that all data is written to memory. */ - if (size == byte_count) - dma_flags |= R600_CP_DMA_SYNC; +/* Set a base register address for user data constants in the given shader. 
+ * This assigns a mapping from PIPE_SHADER_* to SPI_SHADER_USER_DATA_*. + */ +static void si_set_user_data_base(struct si_context *sctx, + unsigned shader, uint32_t new_base) +{ + uint32_t *base = &sctx->shader_userdata.sh_base[shader]; - /* Emit the clear packet. */ - si_emit_cp_dma_clear_buffer(sctx, va, byte_count, value, dma_flags); + if (*base != new_base) { + *base = new_base; - size -= byte_count; - va += byte_count; + if (new_base) + si_mark_shader_pointers_dirty(sctx, shader); } - - /* Flush the caches again in case the 3D engine has been prefetching - * the resource. */ - sctx->b.flags |= flush_flags; - - if (tc_l2_flag) - r600_resource(dst)->TC_L2_dirty = true; } -void si_copy_buffer(struct si_context *sctx, - struct pipe_resource *dst, struct pipe_resource *src, - uint64_t dst_offset, uint64_t src_offset, unsigned size, - bool is_framebuffer) +/* This must be called when these shaders are changed from non-NULL to NULL + * and vice versa: + * - geometry shader + * - tessellation control shader + * - tessellation evaluation shader + */ +void si_shader_change_notify(struct si_context *sctx) { - unsigned flush_flags, tc_l2_flag; - - if (!size) - return; - - /* Mark the buffer range of destination as valid (initialized), - * so that transfer_map knows it should wait for the GPU when mapping - * that range. */ - util_range_add(&r600_resource(dst)->valid_buffer_range, dst_offset, - dst_offset + size); - - dst_offset += r600_resource(dst)->gpu_address; - src_offset += r600_resource(src)->gpu_address; - - /* Flush the caches where the resource is bound. */ - if (is_framebuffer) { - flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER; - tc_l2_flag = 0; + /* VS can be bound as VS, ES, or LS. */ + if (sctx->tes_shader) + si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, + R_00B530_SPI_SHADER_USER_DATA_LS_0); + else if (sctx->gs_shader) + si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, + R_00B330_SPI_SHADER_USER_DATA_ES_0); + else + si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, + R_00B130_SPI_SHADER_USER_DATA_VS_0); + + /* TES can be bound as ES, VS, or not bound. */ + if (sctx->tes_shader) { + if (sctx->gs_shader) + si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, + R_00B330_SPI_SHADER_USER_DATA_ES_0); + else + si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, + R_00B130_SPI_SHADER_USER_DATA_VS_0); } else { - flush_flags = SI_CONTEXT_INV_TC_L1 | - (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) | - SI_CONTEXT_INV_KCACHE; - tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2; + si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, 0); } +} - sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | - flush_flags; +static void si_emit_shader_pointer(struct si_context *sctx, + struct si_descriptors *desc, + unsigned sh_base, bool keep_dirty) +{ + struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + uint64_t va; - while (size) { - unsigned sync_flags = tc_l2_flag; - unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT); + if (!desc->pointer_dirty || !desc->buffer) + return; - si_need_cs_space(sctx, 7 + (sctx->b.flags ? sctx->cache_flush.num_dw : 0), FALSE); + va = desc->buffer->gpu_address + + desc->buffer_offset; - /* Flush the caches for the first copy only. Also wait for old CP DMA packets to complete. 
*/ - if (sctx->b.flags) { - si_emit_cache_flush(&sctx->b, NULL); - sync_flags |= SI_CP_DMA_RAW_WAIT; - } + radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0)); + radeon_emit(cs, (sh_base + desc->shader_userdata_offset - SI_SH_REG_OFFSET) >> 2); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); - /* Do the synchronization after the last copy, so that all data is written to memory. */ - if (size == byte_count) { - sync_flags |= R600_CP_DMA_SYNC; - } + desc->pointer_dirty = keep_dirty; +} - /* This must be done after r600_need_cs_space. */ - r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)src, - RADEON_USAGE_READ, RADEON_PRIO_MIN); - r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)dst, - RADEON_USAGE_WRITE, RADEON_PRIO_MIN); +static void si_emit_shader_userdata(struct si_context *sctx, + struct r600_atom *atom) +{ + unsigned i; + uint32_t *sh_base = sctx->shader_userdata.sh_base; + + if (sctx->gs_shader) { + /* The VS copy shader needs these for clipping, streamout, and rings. */ + unsigned vs_base = R_00B130_SPI_SHADER_USER_DATA_VS_0; + unsigned i = PIPE_SHADER_VERTEX; + + si_emit_shader_pointer(sctx, &sctx->const_buffers[i].desc, vs_base, true); + si_emit_shader_pointer(sctx, &sctx->rw_buffers[i].desc, vs_base, true); + + /* The TESSEVAL shader needs this for the ESGS ring buffer. */ + si_emit_shader_pointer(sctx, &sctx->rw_buffers[i].desc, + R_00B330_SPI_SHADER_USER_DATA_ES_0, true); + } else if (sctx->tes_shader) { + /* The TESSEVAL shader needs this for streamout. */ + si_emit_shader_pointer(sctx, &sctx->rw_buffers[PIPE_SHADER_VERTEX].desc, + R_00B130_SPI_SHADER_USER_DATA_VS_0, true); + } - si_emit_cp_dma_copy_buffer(sctx, dst_offset, src_offset, byte_count, sync_flags); + for (i = 0; i < SI_NUM_SHADERS; i++) { + unsigned base = sh_base[i]; - size -= byte_count; - src_offset += byte_count; - dst_offset += byte_count; - } + if (!base) + continue; - /* Flush the caches again in case the 3D engine has been prefetching - * the resource. */ - sctx->b.flags |= flush_flags; + if (i != PIPE_SHADER_TESS_EVAL) + si_emit_shader_pointer(sctx, &sctx->rw_buffers[i].desc, base, false); - if (tc_l2_flag) - r600_resource(dst)->TC_L2_dirty = true; + si_emit_shader_pointer(sctx, &sctx->const_buffers[i].desc, base, false); + si_emit_shader_pointer(sctx, &sctx->samplers[i].views.desc, base, false); + si_emit_shader_pointer(sctx, &sctx->samplers[i].states.desc, base, false); + } + si_emit_shader_pointer(sctx, &sctx->vertex_buffers, sh_base[PIPE_SHADER_VERTEX], false); } -/* INIT/DEINIT */ +/* INIT/DEINIT/UPLOAD */ void si_init_all_descriptors(struct si_context *sctx) { int i; for (i = 0; i < SI_NUM_SHADERS; i++) { - si_init_buffer_resources(sctx, &sctx->const_buffers[i], - SI_NUM_CONST_BUFFERS, i, SI_SGPR_CONST, + si_init_buffer_resources(&sctx->const_buffers[i], + SI_NUM_CONST_BUFFERS, SI_SGPR_CONST, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO); - si_init_buffer_resources(sctx, &sctx->rw_buffers[i], - i == PIPE_SHADER_VERTEX ? 
- SI_NUM_RW_BUFFERS : SI_NUM_RING_BUFFERS, - i, SI_SGPR_RW_BUFFERS, + si_init_buffer_resources(&sctx->rw_buffers[i], + SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS, RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RESOURCE_RW); - si_init_sampler_views(sctx, &sctx->samplers[i].views, i); - - si_init_descriptors(sctx, &sctx->samplers[i].states.desc, - si_get_shader_user_data_base(i) + SI_SGPR_SAMPLER * 4, - 4, SI_NUM_SAMPLER_STATES, si_emit_sampler_states); - - sctx->atoms.s.const_buffers[i] = &sctx->const_buffers[i].desc.atom; - sctx->atoms.s.rw_buffers[i] = &sctx->rw_buffers[i].desc.atom; - sctx->atoms.s.sampler_views[i] = &sctx->samplers[i].views.desc.atom; - sctx->atoms.s.sampler_states[i] = &sctx->samplers[i].states.desc.atom; + si_init_descriptors(&sctx->samplers[i].views.desc, + SI_SGPR_RESOURCE, 8, SI_NUM_SAMPLER_VIEWS); + si_init_descriptors(&sctx->samplers[i].states.desc, + SI_SGPR_SAMPLER, 4, SI_NUM_SAMPLER_STATES); } - si_init_descriptors(sctx, &sctx->vertex_buffers, - si_get_shader_user_data_base(PIPE_SHADER_VERTEX) + - SI_SGPR_VERTEX_BUFFER*4, 4, SI_NUM_VERTEX_BUFFERS, - si_emit_shader_pointer); - sctx->atoms.s.vertex_buffers = &sctx->vertex_buffers.atom; + si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFER, + 4, SI_NUM_VERTEX_BUFFERS); /* Set pipe_context functions. */ sctx->b.b.set_constant_buffer = si_set_constant_buffer; sctx->b.b.set_sampler_views = si_set_sampler_views; sctx->b.b.set_stream_output_targets = si_set_streamout_targets; - sctx->b.clear_buffer = si_clear_buffer; sctx->b.invalidate_buffer = si_invalidate_buffer; + + /* Shader user data. */ + sctx->atoms.s.shader_userdata = &sctx->shader_userdata.atom; + sctx->shader_userdata.atom.emit = (void*)si_emit_shader_userdata; + + /* Upper bound, 4 pointers per shader, +1 for vertex buffers, +2 for the VS copy shader. */ + sctx->shader_userdata.atom.num_dw = (SI_NUM_SHADERS * 4 + 1 + 2) * 4; + + /* Set default and immutable mappings. 
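The num_dw bound above follows from each shader pointer costing one 4-dword SET_SH_REG packet (header, register index, VA low, VA high). Below is a standalone sketch of the payload math in si_emit_shader_pointer() and the resulting atom budget; the SI_SH_REG_OFFSET value, SI_NUM_SHADERS == 5, and the concrete addresses are assumptions for the demo, not taken from sid.h.

#include <stdint.h>
#include <stdio.h>

#define SI_SH_REG_OFFSET 0xB000 /* assumed start of the SH register window */

int main(void)
{
	/* si_emit_shader_pointer(): byte register address -> dword index,
	 * then the 64-bit descriptor VA split into two dwords. */
	uint32_t sh_base = 0xB130;           /* R_00B130_SPI_SHADER_USER_DATA_VS_0 */
	uint32_t userdata_offset = 4 * 4;    /* hypothetical SGPR offset in bytes */
	uint64_t va = 0x0000001234567890ull; /* invented descriptor address */

	printf("reg index: 0x%x\n",
	       (sh_base + userdata_offset - SI_SH_REG_OFFSET) >> 2);
	printf("payload: 0x%08x 0x%08x\n",
	       (uint32_t)va, (uint32_t)(va >> 32)); /* low dword first */

	/* Atom budget: 4 descriptor pointers per shader, +1 vertex-buffer
	 * pointer, +2 for the VS copy shader; 4 dwords per pointer.
	 * SI_NUM_SHADERS is assumed to be 5 (VS, TCS, TES, GS, PS). */
	unsigned num_dw = (5 * 4 + 1 + 2) * 4;
	printf("num_dw upper bound: %u\n", num_dw); /* 92 */
	return 0;
}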
*/ + si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B130_SPI_SHADER_USER_DATA_VS_0); + si_set_user_data_base(sctx, PIPE_SHADER_TESS_CTRL, R_00B430_SPI_SHADER_USER_DATA_HS_0); + si_set_user_data_base(sctx, PIPE_SHADER_GEOMETRY, R_00B230_SPI_SHADER_USER_DATA_GS_0); + si_set_user_data_base(sctx, PIPE_SHADER_FRAGMENT, R_00B030_SPI_SHADER_USER_DATA_PS_0); +} + +bool si_upload_shader_descriptors(struct si_context *sctx) +{ + int i; + + for (i = 0; i < SI_NUM_SHADERS; i++) { + if (!si_upload_descriptors(sctx, &sctx->const_buffers[i].desc) || + !si_upload_descriptors(sctx, &sctx->rw_buffers[i].desc) || + !si_upload_descriptors(sctx, &sctx->samplers[i].views.desc) || + !si_upload_descriptors(sctx, &sctx->samplers[i].states.desc)) + return false; + } + return si_upload_vertex_buffer_descriptors(sctx); } void si_release_all_descriptors(struct si_context *sctx) @@ -1343,4 +1080,5 @@ void si_all_descriptors_begin_new_cs(struct si_context *sctx) si_sampler_states_begin_new_cs(sctx, &sctx->samplers[i].states); } si_vertex_buffers_begin_new_cs(sctx); + si_shader_userdata_begin_new_cs(sctx); } diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c index 313ced7f5d1..307dc391431 100644 --- a/src/gallium/drivers/radeonsi/si_hw_context.c +++ b/src/gallium/drivers/radeonsi/si_hw_context.c @@ -30,10 +30,32 @@ void si_need_cs_space(struct si_context *ctx, unsigned num_dw, boolean count_draw_in) { + struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs; int i; + /* If the CS is sufficiently large, don't count the space needed + * and just flush if there is less than 8192 dwords left. */ + if (cs->max_dw >= 24 * 1024) { + if (cs->cdw > cs->max_dw - 8 * 1024) + ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + return; + } + + /* There are two memory usage counters in the winsys for all buffers + * that have been added (cs_add_reloc) and two counters in the pipe + * driver for those that haven't been added yet. + */ + if (!ctx->b.ws->cs_memory_below_limit(ctx->b.rings.gfx.cs, ctx->b.vram, ctx->b.gtt)) { + ctx->b.gtt = 0; + ctx->b.vram = 0; + ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + return; + } + ctx->b.gtt = 0; + ctx->b.vram = 0; + /* The number of dwords we already used in the CS so far. */ - num_dw += ctx->b.rings.gfx.cs->cdw; + num_dw += cs->cdw; if (count_draw_in) { for (i = 0; i < SI_NUM_ATOMS(ctx); i++) { @@ -50,7 +72,8 @@ void si_need_cs_space(struct si_context *ctx, unsigned num_dw, } /* Count in queries_suspend. */ - num_dw += ctx->b.num_cs_dw_nontimer_queries_suspend; + num_dw += ctx->b.num_cs_dw_nontimer_queries_suspend + + ctx->b.num_cs_dw_timer_queries_suspend; /* Count in streamout_end at the end of CS. */ if (ctx->b.streamout.begin_emitted) { @@ -72,7 +95,7 @@ void si_need_cs_space(struct si_context *ctx, unsigned num_dw, #endif /* Flush if there's not enough space. 
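The fast path added to si_need_cs_space() above is easy to model on its own: for a large command stream, skip the exact accounting and flush once fewer than 8192 (8 * 1024) dwords remain. A self-contained sketch, with struct cs standing in for struct radeon_winsys_cs:

#include <stdbool.h>
#include <stdio.h>

struct cs { unsigned cdw, max_dw; }; /* dwords used / capacity */

static bool needs_flush(const struct cs *cs)
{
	if (cs->max_dw >= 24 * 1024)              /* CS is "sufficiently large" */
		return cs->cdw > cs->max_dw - 8 * 1024; /* < 8192 dwords left */
	return false; /* small CS: fall through to exact dword counting */
}

int main(void)
{
	struct cs cs = { .cdw = 17 * 1024, .max_dw = 24 * 1024 };
	printf("flush: %d\n", needs_flush(&cs)); /* 17K used > 16K limit -> 1 */
	return 0;
}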
*/ - if (num_dw > RADEON_MAX_CMDBUF_DWORDS) { + if (num_dw > cs->max_dw) { ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); } } @@ -82,9 +105,16 @@ void si_context_gfx_flush(void *context, unsigned flags, { struct si_context *ctx = context; struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs; - - if (cs->cdw == ctx->b.initial_gfx_cs_size && !fence) + struct radeon_winsys *ws = ctx->b.ws; + + if (cs->cdw == ctx->b.initial_gfx_cs_size && + (!fence || ctx->last_gfx_fence)) { + if (fence) + ws->fence_reference(fence, ctx->last_gfx_fence); + if (!(flags & RADEON_FLUSH_ASYNC)) + ws->cs_sync_flush(cs); return; + } ctx->b.rings.gfx.flushing = true; @@ -101,9 +131,13 @@ void si_context_gfx_flush(void *context, unsigned flags, flags |= RADEON_FLUSH_KEEP_TILING_FLAGS; /* Flush the CS. */ - ctx->b.ws->cs_flush(cs, flags, fence, ctx->screen->b.cs_count++); + ws->cs_flush(cs, flags, &ctx->last_gfx_fence, + ctx->screen->b.cs_count++); ctx->b.rings.gfx.flushing = false; + if (fence) + ws->fence_reference(fence, ctx->last_gfx_fence); + #if SI_TRACE_CS if (ctx->screen->b.trace_bo) { struct si_screen *sscreen = ctx->screen; @@ -111,7 +145,7 @@ void si_context_gfx_flush(void *context, unsigned flags, for (i = 0; i < 10; i++) { usleep(5); - if (!ctx->b.ws->buffer_is_busy(sscreen->b.trace_bo->buf, RADEON_USAGE_READWRITE)) { + if (!ws->buffer_is_busy(sscreen->b.trace_bo->buf, RADEON_USAGE_READWRITE)) { break; } } @@ -130,7 +164,8 @@ void si_context_gfx_flush(void *context, unsigned flags, void si_begin_new_cs(struct si_context *ctx) { /* Flush read caches at the beginning of CS. */ - ctx->b.flags |= SI_CONTEXT_INV_TC_L1 | + ctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER | + SI_CONTEXT_INV_TC_L1 | SI_CONTEXT_INV_TC_L2 | SI_CONTEXT_INV_KCACHE | SI_CONTEXT_INV_ICACHE; @@ -143,24 +178,32 @@ void si_begin_new_cs(struct si_context *ctx) /* The CS initialization should be emitted before everything else. */ si_pm4_emit(ctx, ctx->init_config); - ctx->clip_regs.dirty = true; - ctx->framebuffer.atom.dirty = true; - ctx->msaa_sample_locs.dirty = true; - ctx->msaa_config.dirty = true; - ctx->db_render_state.dirty = true; - ctx->b.streamout.enable_atom.dirty = true; + si_mark_atom_dirty(ctx, &ctx->clip_regs); + si_mark_atom_dirty(ctx, &ctx->framebuffer.atom); + si_mark_atom_dirty(ctx, &ctx->msaa_sample_locs); + si_mark_atom_dirty(ctx, &ctx->msaa_config); + si_mark_atom_dirty(ctx, &ctx->db_render_state); + si_mark_atom_dirty(ctx, &ctx->b.streamout.enable_atom); si_all_descriptors_begin_new_cs(ctx); r600_postflush_resume_features(&ctx->b); ctx->b.initial_gfx_cs_size = ctx->b.rings.gfx.cs->cdw; + + /* Invalidate various draw states so that they are emitted before + * the first draw call. 
*/ si_invalidate_draw_sh_constants(ctx); ctx->last_primitive_restart_en = -1; ctx->last_restart_index = SI_RESTART_INDEX_UNKNOWN; ctx->last_gs_out_prim = -1; ctx->last_prim = -1; ctx->last_multi_vgt_param = -1; + ctx->last_ls_hs_config = -1; ctx->last_rast_prim = -1; ctx->last_sc_line_stipple = ~0; ctx->emit_scratch_reloc = true; + ctx->last_ls = NULL; + ctx->last_tcs = NULL; + ctx->last_tes_sh_base = -1; + ctx->last_num_tcs_input_cp = -1; } diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 53ae71a8c92..473a2e9ad12 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -36,32 +36,42 @@ static void si_destroy_context(struct pipe_context *context) { struct si_context *sctx = (struct si_context *)context; + int i; si_release_all_descriptors(sctx); pipe_resource_reference(&sctx->esgs_ring, NULL); pipe_resource_reference(&sctx->gsvs_ring, NULL); + pipe_resource_reference(&sctx->tf_ring, NULL); pipe_resource_reference(&sctx->null_const_buf.buffer, NULL); r600_resource_reference(&sctx->border_color_table, NULL); r600_resource_reference(&sctx->scratch_buffer, NULL); + sctx->b.ws->fence_reference(&sctx->last_gfx_fence, NULL); si_pm4_free_state(sctx, sctx->init_config, ~0); si_pm4_delete_state(sctx, gs_rings, sctx->gs_rings); - si_pm4_delete_state(sctx, gs_onoff, sctx->gs_on); - si_pm4_delete_state(sctx, gs_onoff, sctx->gs_off); + si_pm4_delete_state(sctx, tf_ring, sctx->tf_state); + for (i = 0; i < Elements(sctx->vgt_shader_config); i++) + si_pm4_delete_state(sctx, vgt_shader_config, sctx->vgt_shader_config[i]); if (sctx->pstipple_sampler_state) sctx->b.b.delete_sampler_state(&sctx->b.b, sctx->pstipple_sampler_state); - if (sctx->dummy_pixel_shader) { + if (sctx->dummy_pixel_shader) sctx->b.b.delete_fs_state(&sctx->b.b, sctx->dummy_pixel_shader); - } - sctx->b.b.delete_depth_stencil_alpha_state(&sctx->b.b, sctx->custom_dsa_flush); - sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_resolve); - sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_decompress); - sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_fastclear); + if (sctx->fixed_func_tcs_shader) + sctx->b.b.delete_tcs_state(&sctx->b.b, sctx->fixed_func_tcs_shader); + if (sctx->custom_dsa_flush) + sctx->b.b.delete_depth_stencil_alpha_state(&sctx->b.b, sctx->custom_dsa_flush); + if (sctx->custom_blend_resolve) + sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_resolve); + if (sctx->custom_blend_decompress) + sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_decompress); + if (sctx->custom_blend_fastclear) + sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_fastclear); util_unreference_framebuffer_state(&sctx->framebuffer.state); - util_blitter_destroy(sctx->blitter); + if (sctx->blitter) + util_blitter_destroy(sctx->blitter); si_pm4_cleanup(sctx); @@ -74,6 +84,14 @@ static void si_destroy_context(struct pipe_context *context) FREE(sctx); } +static enum pipe_reset_status +si_amdgpu_get_reset_status(struct pipe_context *ctx) +{ + struct si_context *sctx = (struct si_context *)ctx; + + return sctx->b.ws->ctx_query_reset_status(sctx->b.ctx); +} + static struct pipe_context *si_create_context(struct pipe_screen *screen, void *priv) { struct si_context *sctx = CALLOC_STRUCT(si_context); @@ -91,13 +109,18 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, void * sctx->b.b.screen = screen; /* this must be set first */ sctx->b.b.priv = priv; sctx->b.b.destroy = 
si_destroy_context; + sctx->b.set_atom_dirty = (void *)si_set_atom_dirty; sctx->screen = sscreen; /* Easy accessing of screen/winsys. */ if (!r600_common_context_init(&sctx->b, &sscreen->b)) goto fail; + if (sscreen->b.info.drm_major == 3) + sctx->b.b.get_device_reset_status = si_amdgpu_get_reset_status; + si_init_blit_functions(sctx); si_init_compute_functions(sctx); + si_init_cp_dma_functions(sctx); if (sscreen->b.info.has_uvd) { sctx->b.b.create_video_codec = si_uvd_create_decoder; @@ -107,7 +130,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, void * sctx->b.b.create_video_buffer = vl_video_buffer_create; } - sctx->b.rings.gfx.cs = ws->cs_create(ws, RING_GFX, si_context_gfx_flush, + sctx->b.rings.gfx.cs = ws->cs_create(sctx->b.ctx, RING_GFX, si_context_gfx_flush, sctx, sscreen->b.trace_bo ? sscreen->b.trace_bo->cs_buf : NULL); sctx->b.rings.gfx.flush = si_context_gfx_flush; @@ -127,17 +150,8 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, void * sctx->atoms.s.streamout_begin = &sctx->b.streamout.begin_atom; sctx->atoms.s.streamout_enable = &sctx->b.streamout.enable_atom; - switch (sctx->b.chip_class) { - case SI: - case CIK: - si_init_state_functions(sctx); - si_init_shader_functions(sctx); - si_init_config(sctx); - break; - default: - R600_ERR("Unsupported chip class %d.\n", sctx->b.chip_class); - goto fail; - } + si_init_state_functions(sctx); + si_init_shader_functions(sctx); if (sscreen->b.debug_flags & DBG_FORCE_DMA) sctx->b.b.resource_copy_region = sctx->b.dma_copy; @@ -181,7 +195,9 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, void * r600_target = radeon_llvm_get_r600_target(triple); sctx->tm = LLVMCreateTargetMachine(r600_target, triple, r600_get_llvm_processor_name(sscreen->b.family), - "+DumpCode,+vgpr-spilling", + sctx->b.chip_class >= VI ? + "+DumpCode" : + "+DumpCode,+vgpr-spilling", LLVMCodeGenLevelDefault, LLVMRelocDefault, LLVMCodeModelDefault); @@ -252,15 +268,27 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION: case PIPE_CAP_TGSI_TEXCOORD: + case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE: + case PIPE_CAP_CONDITIONAL_RENDER_INVERTED: + case PIPE_CAP_TEXTURE_FLOAT_LINEAR: + case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: + case PIPE_CAP_DEPTH_BOUNDS_TEST: return 1; case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: return !SI_BIG_ENDIAN && sscreen->b.info.has_userptr; + case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: + return (sscreen->b.info.drm_major == 2 && + sscreen->b.info.drm_minor >= 43) || + sscreen->b.info.drm_major == 3; + case PIPE_CAP_TEXTURE_MULTISAMPLE: /* 2D tiling on CIK is supported since DRM 2.35.0 */ return sscreen->b.chip_class < CIK || - sscreen->b.info.drm_minor >= 35; + (sscreen->b.info.drm_major == 2 && + sscreen->b.info.drm_minor >= 35) || + sscreen->b.info.drm_major == 3; case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT: return R600_MAP_BUFFER_ALIGNMENT; @@ -270,7 +298,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param) return 4; case PIPE_CAP_GLSL_FEATURE_LEVEL: - return 330; + return HAVE_LLVM >= 0x0307 ? 
410 : 330; case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE: return MIN2(sscreen->b.info.vram_size, 0xFFFFFFFF); @@ -289,13 +317,13 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_USER_VERTEX_BUFFERS: case PIPE_CAP_FAKE_SW_MSAA: case PIPE_CAP_TEXTURE_GATHER_OFFSETS: - case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE: - case PIPE_CAP_CONDITIONAL_RENDER_INVERTED: case PIPE_CAP_SAMPLER_VIEW_TARGET: case PIPE_CAP_VERTEXID_NOBASE: - case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: return 0; + case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: + return 30; + case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK: return PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_R600; @@ -314,7 +342,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS: return 4095; case PIPE_CAP_MAX_VERTEX_STREAMS: - return 1; + return 4; case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE: return 2048; @@ -335,7 +363,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param) return 8; case PIPE_CAP_MAX_VIEWPORTS: - return 1; + return 16; /* Timer queries, present when the clock frequency is non zero. */ case PIPE_CAP_QUERY_TIMESTAMP: @@ -375,6 +403,13 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu case PIPE_SHADER_VERTEX: case PIPE_SHADER_GEOMETRY: break; + case PIPE_SHADER_TESS_CTRL: + case PIPE_SHADER_TESS_EVAL: + /* LLVM 3.6.2 is required for tessellation because of bug fixes there */ + if (HAVE_LLVM < 0x0306 || + (HAVE_LLVM == 0x0306 && MESA_LLVM_VERSION_PATCH < 2)) + return 0; + break; case PIPE_SHADER_COMPUTE: switch (param) { case PIPE_SHADER_CAP_PREFERRED_IR: @@ -401,7 +436,6 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu } break; default: - /* TODO: support tessellation */ return 0; } @@ -433,7 +467,7 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu /* Indirection of geometry shader input dimension is not * handled yet */ - return shader < PIPE_SHADER_GEOMETRY; + return shader != PIPE_SHADER_GEOMETRY; case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR: case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR: case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR: @@ -448,6 +482,7 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu case PIPE_SHADER_CAP_PREFERRED_IR: return PIPE_SHADER_IR_TGSI; case PIPE_SHADER_CAP_DOUBLES: + return HAVE_LLVM >= 0x0307; case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: return 0; diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 2d67342f160..553e1f32683 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -48,7 +48,8 @@ #define SI_MAX_DRAW_CS_DWORDS \ (/*scratch:*/ 3 + /*derived prim state:*/ 3 + \ - /*draw regs:*/ 16 + /*draw packets:*/ 31) + /*draw regs:*/ 18 + /*draw packets:*/ 31 +\ + /*derived tess state:*/ 19) /* Instruction cache. 
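The HAVE_LLVM comparisons above rely on the version being packed as (major << 8) | minor, so 0x0306 means LLVM 3.6 and 0x0307 means 3.7. A tiny demo of how the GLSL feature level and the tessellation gate read under that encoding (the 3.6.2 patch-level check is simplified here to the minor version only):

#include <stdio.h>

#define LLVM_VERSION(maj, min) (((maj) << 8) | (min))

int main(void)
{
	unsigned have_llvm = LLVM_VERSION(3, 7); /* pretend we build with 3.7 */

	printf("GLSL feature level: %d\n", have_llvm >= 0x0307 ? 410 : 330);
	printf("tessellation: %s\n",
	       have_llvm < 0x0306 ? "off" : "on"); /* upstream also wants 3.6.2 */
	return 0;
}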
*/ #define SI_CONTEXT_INV_ICACHE (R600_CONTEXT_PRIVATE_FLAG << 0) @@ -125,8 +126,6 @@ struct si_framebuffer { #define SI_NUM_ATOMS(sctx) (sizeof((sctx)->atoms)/sizeof((sctx)->atoms.array[0])) -#define SI_NUM_SHADERS (PIPE_SHADER_GEOMETRY+1) - struct si_context { struct r600_common_context b; struct blitter_context *blitter; @@ -137,17 +136,12 @@ struct si_context { void *pstipple_sampler_state; struct si_screen *screen; struct si_pm4_state *init_config; + struct pipe_fence_handle *last_gfx_fence; + struct si_shader_selector *fixed_func_tcs_shader; union { struct { /* The order matters. */ - struct r600_atom *vertex_buffers; - struct r600_atom *const_buffers[SI_NUM_SHADERS]; - struct r600_atom *rw_buffers[SI_NUM_SHADERS]; - struct r600_atom *sampler_views[SI_NUM_SHADERS]; - struct r600_atom *sampler_states[SI_NUM_SHADERS]; - /* Caches must be flushed after resource descriptors are - * updated in memory. */ struct r600_atom *cache_flush; struct r600_atom *streamout_begin; struct r600_atom *streamout_enable; /* must be after streamout_begin */ @@ -156,6 +150,7 @@ struct si_context { struct r600_atom *db_render_state; struct r600_atom *msaa_config; struct r600_atom *clip_regs; + struct r600_atom *shader_userdata; } s; struct r600_atom *array[0]; } atoms; @@ -168,7 +163,10 @@ struct si_context { struct si_shader_selector *ps_shader; struct si_shader_selector *gs_shader; struct si_shader_selector *vs_shader; + struct si_shader_selector *tcs_shader; + struct si_shader_selector *tes_shader; struct si_cs_shader_state cs_shader_state; + struct si_shader_data shader_userdata; /* shader information */ unsigned sprite_coord_enable; bool flatshade; @@ -194,13 +192,16 @@ struct si_context { /* With rasterizer discard, there doesn't have to be a pixel shader. * In that case, we bind this one: */ void *dummy_pixel_shader; - struct si_pm4_state *gs_on; - struct si_pm4_state *gs_off; - struct si_pm4_state *gs_rings; struct r600_atom cache_flush; struct pipe_constant_buffer null_const_buf; /* used for set_constant_buffer(NULL) on CIK */ + + /* VGT states. */ + struct si_pm4_state *vgt_shader_config[4]; + struct si_pm4_state *gs_rings; struct pipe_resource *esgs_ring; struct pipe_resource *gsvs_ring; + struct si_pm4_state *tf_state; + struct pipe_resource *tf_ring; LLVMTargetMachineRef tm; @@ -218,7 +219,7 @@ struct si_context { bool db_depth_disable_expclear; unsigned ps_db_shader_control; - /* Draw state. */ + /* Emitted draw state. */ int last_base_vertex; int last_start_instance; int last_sh_base_reg; @@ -227,6 +228,7 @@ struct si_context { int last_gs_out_prim; int last_prim; int last_multi_vgt_param; + int last_ls_hs_config; int last_rast_prim; unsigned last_sc_line_stipple; int current_rast_prim; /* primitive type after TES, GS */ @@ -235,6 +237,12 @@ struct si_context { boolean emit_scratch_reloc; unsigned scratch_waves; unsigned spi_tmpring_size; + + /* Emitted derived tessellation state. 
*/ + struct si_shader *last_ls; /* local shader (VS) */ + struct si_shader_selector *last_tcs; + int last_num_tcs_input_cp; + int last_tes_sh_base; }; /* cik_sdma.c */ @@ -260,6 +268,13 @@ void si_resource_copy_region(struct pipe_context *ctx, unsigned src_level, const struct pipe_box *src_box); +/* si_cp_dma.c */ +void si_copy_buffer(struct si_context *sctx, + struct pipe_resource *dst, struct pipe_resource *src, + uint64_t dst_offset, uint64_t src_offset, unsigned size, + bool is_framebuffer); +void si_init_cp_dma_functions(struct si_context *sctx); + /* si_dma.c */ void si_dma_copy(struct pipe_context *ctx, struct pipe_resource *dst, @@ -293,7 +308,7 @@ struct pipe_video_buffer *si_video_buffer_create(struct pipe_context *pipe, * common helpers */ -static INLINE struct r600_resource * +static inline struct r600_resource * si_resource_create_custom(struct pipe_screen *screen, unsigned usage, unsigned size) { @@ -302,7 +317,7 @@ si_resource_create_custom(struct pipe_screen *screen, PIPE_BIND_CUSTOM, usage, size)); } -static INLINE void +static inline void si_invalidate_draw_sh_constants(struct si_context *sctx) { sctx->last_base_vertex = SI_BASE_VERTEX_UNKNOWN; @@ -310,4 +325,18 @@ si_invalidate_draw_sh_constants(struct si_context *sctx) sctx->last_sh_base_reg = -1; /* reset to an unknown value */ } +static inline void +si_set_atom_dirty(struct si_context *sctx, + struct r600_atom *atom, bool dirty) +{ + atom->dirty = dirty; +} + +static inline void +si_mark_atom_dirty(struct si_context *sctx, + struct r600_atom *atom) +{ + si_set_atom_dirty(sctx, atom, true); +} + #endif diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 47e5f96cbed..4288e9b2ab1 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -31,6 +31,7 @@ #include "gallivm/lp_bld_intr.h" #include "gallivm/lp_bld_logic.h" #include "gallivm/lp_bld_arit.h" +#include "gallivm/lp_bld_bitarit.h" #include "gallivm/lp_bld_flow.h" #include "radeon/r600_cs.h" #include "radeon/radeon_llvm.h" @@ -71,18 +72,25 @@ struct si_shader_context int param_streamout_write_index; int param_streamout_offset[4]; int param_vertex_id; + int param_rel_auto_id; + int param_vs_prim_id; int param_instance_id; + int param_tes_u; + int param_tes_v; + int param_tes_rel_patch_id; + int param_tes_patch_id; + int param_es2gs_offset; LLVMTargetMachineRef tm; LLVMValueRef const_md; LLVMValueRef const_resource[SI_NUM_CONST_BUFFERS]; - LLVMValueRef ddxy_lds; + LLVMValueRef lds; LLVMValueRef *constants[SI_NUM_CONST_BUFFERS]; LLVMValueRef resources[SI_NUM_SAMPLER_VIEWS]; LLVMValueRef samplers[SI_NUM_SAMPLER_STATES]; LLVMValueRef so_buffers[4]; LLVMValueRef esgs_ring; - LLVMValueRef gsvs_ring; - LLVMValueRef gs_next_vertex; + LLVMValueRef gsvs_ring[4]; + LLVMValueRef gs_next_vertex[4]; }; static struct si_shader_context * si_shader_context( @@ -129,12 +137,29 @@ unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index) assert(index <= 1); return 2 + index; case TGSI_SEMANTIC_GENERIC: - assert(index <= 63-4); - return 4 + index; + if (index <= 63-4) + return 4 + index; + else + /* same explanation as in the default statement, + * the only user hitting this is st/nine. + */ + return 0; + + /* patch indices are completely separate and thus start from 0 */ + case TGSI_SEMANTIC_TESSOUTER: + return 0; + case TGSI_SEMANTIC_TESSINNER: + return 1; + case TGSI_SEMANTIC_PATCH: + return 2 + index; default: - assert(0); - return 63; + /* Don't fail here. 
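The unique-index mapping in this hunk keeps per-patch semantics in a namespace of their own. A small model of just the cases visible in the patch; the enum values are stand-ins for the TGSI_SEMANTIC_* tokens, and semantics not shown here are omitted:

#include <stdio.h>

enum sem { GENERIC, TESSOUTER, TESSINNER, PATCH, OTHER };

static unsigned unique_index(enum sem name, unsigned index)
{
	switch (name) {
	case GENERIC:   return index <= 63 - 4 ? 4 + index : 0;
	case TESSOUTER: return 0; /* patch indices start over from 0 */
	case TESSINNER: return 1;
	case PATCH:     return 2 + index;
	default:        return 0; /* don't fail; see the comment above */
	}
}

int main(void)
{
	printf("GENERIC[5] -> %u\n", unique_index(GENERIC, 5)); /* 9 */
	printf("PATCH[3]   -> %u\n", unique_index(PATCH, 3));   /* 5 */
	return 0;
}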
The result of this function is only used + * for LS, TCS, TES, and GS, where legacy GL semantics can't + * occur, but this function is called for all vertex shaders + * before it's known whether LS will be compiled or not. + */ + return 0; } } @@ -205,6 +230,136 @@ static LLVMValueRef unpack_param(struct si_shader_context *si_shader_ctx, return value; } +static LLVMValueRef get_rel_patch_id(struct si_shader_context *si_shader_ctx) +{ + switch (si_shader_ctx->type) { + case TGSI_PROCESSOR_TESS_CTRL: + return unpack_param(si_shader_ctx, SI_PARAM_REL_IDS, 0, 8); + + case TGSI_PROCESSOR_TESS_EVAL: + return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, + si_shader_ctx->param_tes_rel_patch_id); + + default: + assert(0); + return NULL; + } +} + +/* Tessellation shaders pass outputs to the next shader using LDS. + * + * LS outputs = TCS inputs + * TCS outputs = TES inputs + * + * The LDS layout is: + * - TCS inputs for patch 0 + * - TCS inputs for patch 1 + * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2) + * - ... + * - TCS outputs for patch 0 = get_tcs_out_patch0_offset + * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset + * - TCS outputs for patch 1 + * - Per-patch TCS outputs for patch 1 + * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2) + * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2) + * - ... + * + * All three shaders VS(LS), TCS, TES share the same LDS space. + */ + +static LLVMValueRef +get_tcs_in_patch_stride(struct si_shader_context *si_shader_ctx) +{ + if (si_shader_ctx->type == TGSI_PROCESSOR_VERTEX) + return unpack_param(si_shader_ctx, SI_PARAM_LS_OUT_LAYOUT, 0, 13); + else if (si_shader_ctx->type == TGSI_PROCESSOR_TESS_CTRL) + return unpack_param(si_shader_ctx, SI_PARAM_TCS_IN_LAYOUT, 0, 13); + else { + assert(0); + return NULL; + } +} + +static LLVMValueRef +get_tcs_out_patch_stride(struct si_shader_context *si_shader_ctx) +{ + return unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 0, 13); +} + +static LLVMValueRef +get_tcs_out_patch0_offset(struct si_shader_context *si_shader_ctx) +{ + return lp_build_mul_imm(&si_shader_ctx->radeon_bld.soa.bld_base.uint_bld, + unpack_param(si_shader_ctx, + SI_PARAM_TCS_OUT_OFFSETS, + 0, 16), + 4); +} + +static LLVMValueRef +get_tcs_out_patch0_patch_data_offset(struct si_shader_context *si_shader_ctx) +{ + return lp_build_mul_imm(&si_shader_ctx->radeon_bld.soa.bld_base.uint_bld, + unpack_param(si_shader_ctx, + SI_PARAM_TCS_OUT_OFFSETS, + 16, 16), + 4); +} + +static LLVMValueRef +get_tcs_in_current_patch_offset(struct si_shader_context *si_shader_ctx) +{ + struct gallivm_state *gallivm = &si_shader_ctx->radeon_bld.gallivm; + LLVMValueRef patch_stride = get_tcs_in_patch_stride(si_shader_ctx); + LLVMValueRef rel_patch_id = get_rel_patch_id(si_shader_ctx); + + return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, ""); +} + +static LLVMValueRef +get_tcs_out_current_patch_offset(struct si_shader_context *si_shader_ctx) +{ + struct gallivm_state *gallivm = &si_shader_ctx->radeon_bld.gallivm; + LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(si_shader_ctx); + LLVMValueRef patch_stride = get_tcs_out_patch_stride(si_shader_ctx); + LLVMValueRef rel_patch_id = get_rel_patch_id(si_shader_ctx); + + return LLVMBuildAdd(gallivm->builder, patch0_offset, + LLVMBuildMul(gallivm->builder, patch_stride, + rel_patch_id, ""), + ""); +} + +static LLVMValueRef +get_tcs_out_current_patch_data_offset(struct 
si_shader_context *si_shader_ctx) +{ + struct gallivm_state *gallivm = &si_shader_ctx->radeon_bld.gallivm; + LLVMValueRef patch0_patch_data_offset = + get_tcs_out_patch0_patch_data_offset(si_shader_ctx); + LLVMValueRef patch_stride = get_tcs_out_patch_stride(si_shader_ctx); + LLVMValueRef rel_patch_id = get_rel_patch_id(si_shader_ctx); + + return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset, + LLVMBuildMul(gallivm->builder, patch_stride, + rel_patch_id, ""), + ""); +} + +static void build_indexed_store(struct si_shader_context *si_shader_ctx, + LLVMValueRef base_ptr, LLVMValueRef index, + LLVMValueRef value) +{ + struct lp_build_tgsi_context *bld_base = &si_shader_ctx->radeon_bld.soa.bld_base; + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMValueRef indices[2], pointer; + + indices[0] = bld_base->uint_bld.zero; + indices[1] = index; + + pointer = LLVMBuildGEP(gallivm->builder, base_ptr, indices, 2, ""); + LLVMBuildStore(gallivm->builder, value, pointer); +} + /** * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad. * It's equivalent to doing a load from &base_ptr[index]. @@ -308,7 +463,7 @@ static void declare_input_vs( args[0] = t_list; args[1] = attribute_offset; args[2] = buffer_index; - input = build_intrinsic(gallivm->builder, + input = lp_build_intrinsic(gallivm->builder, "llvm.SI.vs.load.input", vec4_type, args, 3, LLVMReadNoneAttribute | LLVMNoUnwindAttribute); @@ -323,6 +478,285 @@ static void declare_input_vs( } } +static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base, + unsigned swizzle) +{ + struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + + if (swizzle > 0) + return bld_base->uint_bld.zero; + + switch (si_shader_ctx->type) { + case TGSI_PROCESSOR_VERTEX: + return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, + si_shader_ctx->param_vs_prim_id); + case TGSI_PROCESSOR_TESS_CTRL: + return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, + SI_PARAM_PATCH_ID); + case TGSI_PROCESSOR_TESS_EVAL: + return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, + si_shader_ctx->param_tes_patch_id); + case TGSI_PROCESSOR_GEOMETRY: + return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, + SI_PARAM_PRIMITIVE_ID); + default: + assert(0); + return bld_base->uint_bld.zero; + } +} + +/** + * Return the value of tgsi_ind_register for indexing. + * This is the indirect index with the constant offset added to it. + */ +static LLVMValueRef get_indirect_index(struct si_shader_context *si_shader_ctx, + const struct tgsi_ind_register *ind, + int rel_index) +{ + struct gallivm_state *gallivm = si_shader_ctx->radeon_bld.soa.bld_base.base.gallivm; + LLVMValueRef result; + + result = si_shader_ctx->radeon_bld.soa.addr[ind->Index][ind->Swizzle]; + result = LLVMBuildLoad(gallivm->builder, result, ""); + result = LLVMBuildAdd(gallivm->builder, result, + lp_build_const_int32(gallivm, rel_index), ""); + return result; +} + +/** + * Calculate a dword address given an input or output register and a stride. + */ +static LLVMValueRef get_dw_address(struct si_shader_context *si_shader_ctx, + const struct tgsi_full_dst_register *dst, + const struct tgsi_full_src_register *src, + LLVMValueRef vertex_dw_stride, + LLVMValueRef base_addr) +{ + struct gallivm_state *gallivm = si_shader_ctx->radeon_bld.soa.bld_base.base.gallivm; + struct tgsi_shader_info *info = &si_shader_ctx->shader->selector->info; + ubyte *name, *index, *array_first; + int first, param; + struct tgsi_full_dst_register reg; + + /* Set the register description. 
The address computation is the same + * for sources and destinations. */ + if (src) { + reg.Register.File = src->Register.File; + reg.Register.Index = src->Register.Index; + reg.Register.Indirect = src->Register.Indirect; + reg.Register.Dimension = src->Register.Dimension; + reg.Indirect = src->Indirect; + reg.Dimension = src->Dimension; + reg.DimIndirect = src->DimIndirect; + } else + reg = *dst; + + /* If the register is 2-dimensional (e.g. an array of vertices + * in a primitive), calculate the base address of the vertex. */ + if (reg.Register.Dimension) { + LLVMValueRef index; + + if (reg.Dimension.Indirect) + index = get_indirect_index(si_shader_ctx, &reg.DimIndirect, + reg.Dimension.Index); + else + index = lp_build_const_int32(gallivm, reg.Dimension.Index); + + base_addr = LLVMBuildAdd(gallivm->builder, base_addr, + LLVMBuildMul(gallivm->builder, index, + vertex_dw_stride, ""), ""); + } + + /* Get information about the register. */ + if (reg.Register.File == TGSI_FILE_INPUT) { + name = info->input_semantic_name; + index = info->input_semantic_index; + array_first = info->input_array_first; + } else if (reg.Register.File == TGSI_FILE_OUTPUT) { + name = info->output_semantic_name; + index = info->output_semantic_index; + array_first = info->output_array_first; + } else { + assert(0); + return NULL; + } + + if (reg.Register.Indirect) { + /* Add the relative address of the element. */ + LLVMValueRef ind_index; + + if (reg.Indirect.ArrayID) + first = array_first[reg.Indirect.ArrayID]; + else + first = reg.Register.Index; + + ind_index = get_indirect_index(si_shader_ctx, &reg.Indirect, + reg.Register.Index - first); + + base_addr = LLVMBuildAdd(gallivm->builder, base_addr, + LLVMBuildMul(gallivm->builder, ind_index, + lp_build_const_int32(gallivm, 4), ""), ""); + + param = si_shader_io_get_unique_index(name[first], index[first]); + } else { + param = si_shader_io_get_unique_index(name[reg.Register.Index], + index[reg.Register.Index]); + } + + /* Add the base address of the element. */ + return LLVMBuildAdd(gallivm->builder, base_addr, + lp_build_const_int32(gallivm, param * 4), ""); +} + +/** + * Load from LDS. + * + * \param type output value type + * \param swizzle offset (typically 0..3); it can be ~0, which loads a vec4 + * \param dw_addr address in dwords + */ +static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base, + enum tgsi_opcode_type type, unsigned swizzle, + LLVMValueRef dw_addr) +{ + struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMValueRef value; + + if (swizzle == ~0) { + LLVMValueRef values[TGSI_NUM_CHANNELS]; + + for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++) + values[chan] = lds_load(bld_base, type, chan, dw_addr); + + return lp_build_gather_values(bld_base->base.gallivm, values, + TGSI_NUM_CHANNELS); + } + + dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr, + lp_build_const_int32(gallivm, swizzle)); + + value = build_indexed_load(si_shader_ctx, si_shader_ctx->lds, dw_addr); + return LLVMBuildBitCast(gallivm->builder, value, + tgsi2llvmtype(bld_base, type), ""); +} + +/** + * Store to LDS. 
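A scalar model of the lds_load()/lds_store() pair: LDS is addressed in dwords, a swizzle selects the channel, and swizzle == ~0 gathers a whole vec4 by issuing four channel loads. The uint32_t array below is a stand-in for LDS; the values are invented.

#include <stdint.h>
#include <stdio.h>

static uint32_t lds[256]; /* pretend LDS, dword-addressed */

static uint32_t lds_load_chan(uint32_t dw_addr, unsigned swizzle)
{
	return lds[dw_addr + swizzle]; /* effective address = base + swizzle */
}

static void lds_load_vec4(uint32_t dw_addr, uint32_t out[4])
{
	for (unsigned chan = 0; chan < 4; chan++) /* the swizzle == ~0 case */
		out[chan] = lds_load_chan(dw_addr, chan);
}

int main(void)
{
	for (unsigned i = 0; i < 8; i++)
		lds[i] = 100 + i;

	uint32_t v[4];
	lds_load_vec4(4, v);
	printf("%u %u %u %u\n", v[0], v[1], v[2], v[3]); /* 104 105 106 107 */
	return 0;
}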
+ * + * \param swizzle offset (typically 0..3) + * \param dw_addr address in dwords + * \param value value to store + */ +static void lds_store(struct lp_build_tgsi_context * bld_base, + unsigned swizzle, LLVMValueRef dw_addr, + LLVMValueRef value) +{ + struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + struct gallivm_state *gallivm = bld_base->base.gallivm; + + dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr, + lp_build_const_int32(gallivm, swizzle)); + + value = LLVMBuildBitCast(gallivm->builder, value, + LLVMInt32TypeInContext(gallivm->context), ""); + build_indexed_store(si_shader_ctx, si_shader_ctx->lds, + dw_addr, value); +} + +static LLVMValueRef fetch_input_tcs( + struct lp_build_tgsi_context *bld_base, + const struct tgsi_full_src_register *reg, + enum tgsi_opcode_type type, unsigned swizzle) +{ + struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + LLVMValueRef dw_addr, stride; + + stride = unpack_param(si_shader_ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8); + dw_addr = get_tcs_in_current_patch_offset(si_shader_ctx); + dw_addr = get_dw_address(si_shader_ctx, NULL, reg, stride, dw_addr); + + return lds_load(bld_base, type, swizzle, dw_addr); +} + +static LLVMValueRef fetch_output_tcs( + struct lp_build_tgsi_context *bld_base, + const struct tgsi_full_src_register *reg, + enum tgsi_opcode_type type, unsigned swizzle) +{ + struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + LLVMValueRef dw_addr, stride; + + if (reg->Register.Dimension) { + stride = unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8); + dw_addr = get_tcs_out_current_patch_offset(si_shader_ctx); + dw_addr = get_dw_address(si_shader_ctx, NULL, reg, stride, dw_addr); + } else { + dw_addr = get_tcs_out_current_patch_data_offset(si_shader_ctx); + dw_addr = get_dw_address(si_shader_ctx, NULL, reg, NULL, dw_addr); + } + + return lds_load(bld_base, type, swizzle, dw_addr); +} + +static LLVMValueRef fetch_input_tes( + struct lp_build_tgsi_context *bld_base, + const struct tgsi_full_src_register *reg, + enum tgsi_opcode_type type, unsigned swizzle) +{ + struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + LLVMValueRef dw_addr, stride; + + if (reg->Register.Dimension) { + stride = unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8); + dw_addr = get_tcs_out_current_patch_offset(si_shader_ctx); + dw_addr = get_dw_address(si_shader_ctx, NULL, reg, stride, dw_addr); + } else { + dw_addr = get_tcs_out_current_patch_data_offset(si_shader_ctx); + dw_addr = get_dw_address(si_shader_ctx, NULL, reg, NULL, dw_addr); + } + + return lds_load(bld_base, type, swizzle, dw_addr); +} + +static void store_output_tcs(struct lp_build_tgsi_context * bld_base, + const struct tgsi_full_instruction * inst, + const struct tgsi_opcode_info * info, + LLVMValueRef dst[4]) +{ + struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + const struct tgsi_full_dst_register *reg = &inst->Dst[0]; + unsigned chan_index; + LLVMValueRef dw_addr, stride; + + /* Only handle per-patch and per-vertex outputs here. + * Vectors will be lowered to scalars and this function will be called again. 
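The stride fetches in fetch_input_tcs()/fetch_input_tes() all go through unpack_param(param, shift, width), which is a plain bitfield extract on an SGPR. An equivalent in standalone C, with a hypothetical layout word packed the way the (0, 13) and (13, 8) call sites imply:

#include <stdint.h>
#include <stdio.h>

static uint32_t unpack_param(uint32_t sgpr, unsigned shift, unsigned width)
{
	return (sgpr >> shift) & ((1u << width) - 1);
}

int main(void)
{
	/* Hypothetical TCS_IN_LAYOUT value: patch stride 384 in bits 0..12,
	 * vertex stride 24 in bits 13..20 (both in dwords). */
	uint32_t layout = (24u << 13) | 384u;

	printf("patch stride:  %u dwords\n", unpack_param(layout, 0, 13));
	printf("vertex stride: %u dwords\n", unpack_param(layout, 13, 8));
	return 0;
}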
+ */ + if (reg->Register.File != TGSI_FILE_OUTPUT || + (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) { + radeon_llvm_emit_store(bld_base, inst, info, dst); + return; + } + + if (reg->Register.Dimension) { + stride = unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8); + dw_addr = get_tcs_out_current_patch_offset(si_shader_ctx); + dw_addr = get_dw_address(si_shader_ctx, reg, NULL, stride, dw_addr); + } else { + dw_addr = get_tcs_out_current_patch_data_offset(si_shader_ctx); + dw_addr = get_dw_address(si_shader_ctx, reg, NULL, NULL, dw_addr); + } + + TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) { + LLVMValueRef value = dst[chan_index]; + + if (inst->Instruction.Saturate) + value = radeon_llvm_saturate(bld_base, value); + + lds_store(bld_base, chan_index, dw_addr, value); + } +} + static LLVMValueRef fetch_input_gs( struct lp_build_tgsi_context *bld_base, const struct tgsi_full_src_register *reg, @@ -342,13 +776,8 @@ static LLVMValueRef fetch_input_gs( unsigned semantic_name = info->input_semantic_name[reg->Register.Index]; unsigned semantic_index = info->input_semantic_index[reg->Register.Index]; - if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID) { - if (swizzle == 0) - return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, - SI_PARAM_PRIMITIVE_ID); - else - return uint->zero; - } + if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID) + return get_primitive_id(bld_base, swizzle); if (!reg->Register.Dimension) return NULL; @@ -380,7 +809,7 @@ static LLVMValueRef fetch_input_gs( args[1] = vtx_offset; args[2] = lp_build_const_int32(gallivm, (get_param_index(semantic_name, semantic_index, - shader->selector->gs_used_inputs) * 4 + + shader->selector->inputs_read) * 4 + swizzle) * 256); args[3] = uint->zero; args[4] = uint->one; /* OFFEN */ @@ -390,13 +819,42 @@ static LLVMValueRef fetch_input_gs( args[8] = uint->zero; /* TFE */ return LLVMBuildBitCast(gallivm->builder, - build_intrinsic(gallivm->builder, + lp_build_intrinsic(gallivm->builder, "llvm.SI.buffer.load.dword.i32.i32", i32, args, 9, LLVMReadOnlyAttribute | LLVMNoUnwindAttribute), tgsi2llvmtype(bld_base, type), ""); } +static int lookup_interp_param_index(unsigned interpolate, unsigned location) +{ + switch (interpolate) { + case TGSI_INTERPOLATE_CONSTANT: + return 0; + + case TGSI_INTERPOLATE_LINEAR: + if (location == TGSI_INTERPOLATE_LOC_SAMPLE) + return SI_PARAM_LINEAR_SAMPLE; + else if (location == TGSI_INTERPOLATE_LOC_CENTROID) + return SI_PARAM_LINEAR_CENTROID; + else + return SI_PARAM_LINEAR_CENTER; + break; + case TGSI_INTERPOLATE_COLOR: + case TGSI_INTERPOLATE_PERSPECTIVE: + if (location == TGSI_INTERPOLATE_LOC_SAMPLE) + return SI_PARAM_PERSP_SAMPLE; + else if (location == TGSI_INTERPOLATE_LOC_CENTROID) + return SI_PARAM_PERSP_CENTROID; + else + return SI_PARAM_PERSP_CENTER; + break; + default: + fprintf(stderr, "Warning: Unhandled interpolation mode.\n"); + return -1; + } +} + static void declare_input_fs( struct radeon_llvm_context *radeon_bld, unsigned input_index, @@ -411,7 +869,8 @@ static void declare_input_fs( LLVMTypeRef input_type = LLVMFloatTypeInContext(gallivm->context); LLVMValueRef main_fn = radeon_bld->main_fn; - LLVMValueRef interp_param; + LLVMValueRef interp_param = NULL; + int interp_param_idx; const char * intr_name; /* This value is: @@ -460,31 +919,13 @@ static void declare_input_fs( attr_number = lp_build_const_int32(gallivm, shader->ps_input_param_offset[input_index]); - switch (decl->Interp.Interpolate) { - case TGSI_INTERPOLATE_CONSTANT: - 
interp_param = 0; - break; - case TGSI_INTERPOLATE_LINEAR: - if (decl->Interp.Location == TGSI_INTERPOLATE_LOC_SAMPLE) - interp_param = LLVMGetParam(main_fn, SI_PARAM_LINEAR_SAMPLE); - else if (decl->Interp.Location == TGSI_INTERPOLATE_LOC_CENTROID) - interp_param = LLVMGetParam(main_fn, SI_PARAM_LINEAR_CENTROID); - else - interp_param = LLVMGetParam(main_fn, SI_PARAM_LINEAR_CENTER); - break; - case TGSI_INTERPOLATE_COLOR: - case TGSI_INTERPOLATE_PERSPECTIVE: - if (decl->Interp.Location == TGSI_INTERPOLATE_LOC_SAMPLE) - interp_param = LLVMGetParam(main_fn, SI_PARAM_PERSP_SAMPLE); - else if (decl->Interp.Location == TGSI_INTERPOLATE_LOC_CENTROID) - interp_param = LLVMGetParam(main_fn, SI_PARAM_PERSP_CENTROID); - else - interp_param = LLVMGetParam(main_fn, SI_PARAM_PERSP_CENTER); - break; - default: - fprintf(stderr, "Warning: Unhandled interpolation mode.\n"); + shader->ps_input_interpolate[input_index] = decl->Interp.Interpolate; + interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate, + decl->Interp.Location); + if (interp_param_idx == -1) return; - } + else if (interp_param_idx) + interp_param = LLVMGetParam(main_fn, interp_param_idx); /* fs.constant returns the param from the middle vertex, so it's not * really useful for flat shading. It's meant to be used for custom @@ -522,12 +963,12 @@ static void declare_input_fs( args[0] = llvm_chan; args[1] = attr_number; - front = build_intrinsic(gallivm->builder, intr_name, + front = lp_build_intrinsic(gallivm->builder, intr_name, input_type, args, args[3] ? 4 : 3, LLVMReadNoneAttribute | LLVMNoUnwindAttribute); args[1] = back_attr_number; - back = build_intrinsic(gallivm->builder, intr_name, + back = lp_build_intrinsic(gallivm->builder, intr_name, input_type, args, args[3] ? 4 : 3, LLVMReadNoneAttribute | LLVMNoUnwindAttribute); @@ -548,7 +989,7 @@ static void declare_input_fs( args[2] = params; args[3] = interp_param; radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] = - build_intrinsic(gallivm->builder, intr_name, + lp_build_intrinsic(gallivm->builder, intr_name, input_type, args, args[3] ? 4 : 3, LLVMReadNoneAttribute | LLVMNoUnwindAttribute); radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] = @@ -566,7 +1007,7 @@ static void declare_input_fs( args[2] = params; args[3] = interp_param; radeon_bld->inputs[soa_index] = - build_intrinsic(gallivm->builder, intr_name, + lp_build_intrinsic(gallivm->builder, intr_name, input_type, args, args[3] ? 
4 : 3, LLVMReadNoneAttribute | LLVMNoUnwindAttribute); } @@ -587,10 +1028,35 @@ static LLVMValueRef buffer_load_const(LLVMBuilderRef builder, LLVMValueRef resou { LLVMValueRef args[2] = {resource, offset}; - return build_intrinsic(builder, "llvm.SI.load.const", return_type, args, 2, + return lp_build_intrinsic(builder, "llvm.SI.load.const", return_type, args, 2, LLVMReadNoneAttribute | LLVMNoUnwindAttribute); } +static LLVMValueRef load_sample_position(struct radeon_llvm_context *radeon_bld, LLVMValueRef sample_id) +{ + struct si_shader_context *si_shader_ctx = + si_shader_context(&radeon_bld->soa.bld_base); + struct lp_build_context *uint_bld = &radeon_bld->soa.bld_base.uint_bld; + struct gallivm_state *gallivm = &radeon_bld->gallivm; + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef desc = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST); + LLVMValueRef buf_index = lp_build_const_int32(gallivm, SI_DRIVER_STATE_CONST_BUF); + LLVMValueRef resource = build_indexed_load_const(si_shader_ctx, desc, buf_index); + + /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */ + LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8); + LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, lp_build_const_int32(gallivm, 4), ""); + + LLVMValueRef pos[4] = { + buffer_load_const(builder, resource, offset0, radeon_bld->soa.bld_base.base.elem_type), + buffer_load_const(builder, resource, offset1, radeon_bld->soa.bld_base.base.elem_type), + lp_build_const_float(gallivm, 0), + lp_build_const_float(gallivm, 0) + }; + + return lp_build_gather_values(gallivm, pos, 4); +} + static void declare_system_value( struct radeon_llvm_context * radeon_bld, unsigned index, @@ -598,6 +1064,7 @@ static void declare_system_value( { struct si_shader_context *si_shader_ctx = si_shader_context(&radeon_bld->soa.bld_base); + struct lp_build_context *bld = &radeon_bld->soa.bld_base.base; struct lp_build_context *uint_bld = &radeon_bld->soa.bld_base.uint_bld; struct gallivm_state *gallivm = &radeon_bld->gallivm; LLVMValueRef value = 0; @@ -626,30 +1093,23 @@ static void declare_system_value( SI_PARAM_BASE_VERTEX); break; + case TGSI_SEMANTIC_INVOCATIONID: + if (si_shader_ctx->type == TGSI_PROCESSOR_TESS_CTRL) + value = unpack_param(si_shader_ctx, SI_PARAM_REL_IDS, 8, 5); + else if (si_shader_ctx->type == TGSI_PROCESSOR_GEOMETRY) + value = LLVMGetParam(radeon_bld->main_fn, + SI_PARAM_GS_INSTANCE_ID); + else + assert(!"INVOCATIONID not implemented"); + break; + case TGSI_SEMANTIC_SAMPLEID: value = get_sample_id(radeon_bld); break; case TGSI_SEMANTIC_SAMPLEPOS: - { - LLVMBuilderRef builder = gallivm->builder; - LLVMValueRef desc = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST); - LLVMValueRef buf_index = lp_build_const_int32(gallivm, SI_DRIVER_STATE_CONST_BUF); - LLVMValueRef resource = build_indexed_load_const(si_shader_ctx, desc, buf_index); - - /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */ - LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, get_sample_id(radeon_bld), 8); - LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, lp_build_const_int32(gallivm, 4), ""); - - LLVMValueRef pos[4] = { - buffer_load_const(builder, resource, offset0, radeon_bld->soa.bld_base.base.elem_type), - buffer_load_const(builder, resource, offset1, radeon_bld->soa.bld_base.base.elem_type), - lp_build_const_float(gallivm, 0), - lp_build_const_float(gallivm, 0) - }; - value = lp_build_gather_values(gallivm, pos, 4); + value = load_sample_position(radeon_bld, 
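A host-side model of load_sample_position(): sample positions live in a driver constant buffer as packed vec2 floats, so sample i reads floats at byte offsets i*8 and i*8+4. The position table below is illustrative only, not the hardware's real MSAA pattern.

#include <stdio.h>

static const float sample_pos[4][2] = { /* hypothetical 4x MSAA positions */
	{0.375f, 0.125f}, {0.875f, 0.375f}, {0.125f, 0.625f}, {0.625f, 0.875f},
};

int main(void)
{
	const unsigned char *buf = (const unsigned char *)sample_pos;

	for (unsigned id = 0; id < 4; id++) {
		float x = *(const float *)(buf + id * 8);     /* offset0 */
		float y = *(const float *)(buf + id * 8 + 4); /* offset1 */
		printf("sample %u: (%.3f, %.3f)\n", id, x, y);
	}
	return 0;
}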
get_sample_id(radeon_bld)); break; - } case TGSI_SEMANTIC_SAMPLEMASK: /* Smoothing isn't MSAA in GL, but it's MSAA in hardware. @@ -660,6 +1120,48 @@ static void declare_system_value( value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_SAMPLE_COVERAGE); break; + case TGSI_SEMANTIC_TESSCOORD: + { + LLVMValueRef coord[4] = { + LLVMGetParam(radeon_bld->main_fn, si_shader_ctx->param_tes_u), + LLVMGetParam(radeon_bld->main_fn, si_shader_ctx->param_tes_v), + bld->zero, + bld->zero + }; + + /* For triangles, the vector should be (u, v, 1-u-v). */ + if (si_shader_ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] == + PIPE_PRIM_TRIANGLES) + coord[2] = lp_build_sub(bld, bld->one, + lp_build_add(bld, coord[0], coord[1])); + + value = lp_build_gather_values(gallivm, coord, 4); + break; + } + + case TGSI_SEMANTIC_VERTICESIN: + value = unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 26, 6); + break; + + case TGSI_SEMANTIC_TESSINNER: + case TGSI_SEMANTIC_TESSOUTER: + { + LLVMValueRef dw_addr; + int param = si_shader_io_get_unique_index(decl->Semantic.Name, 0); + + dw_addr = get_tcs_out_current_patch_data_offset(si_shader_ctx); + dw_addr = LLVMBuildAdd(gallivm->builder, dw_addr, + lp_build_const_int32(gallivm, param * 4), ""); + + value = lds_load(&radeon_bld->soa.bld_base, TGSI_TYPE_FLOAT, + ~0, dw_addr); + break; + } + + case TGSI_SEMANTIC_PRIMID: + value = get_primitive_id(&radeon_bld->soa.bld_base, 0); + break; + default: assert(!"unknown system value"); return; @@ -679,7 +1181,7 @@ static LLVMValueRef fetch_constant( const struct tgsi_ind_register *ireg = &reg->Indirect; unsigned buf, idx; - LLVMValueRef addr; + LLVMValueRef addr, bufp; LLVMValueRef result; if (swizzle == LP_CHAN_ALL) { @@ -694,8 +1196,24 @@ static LLVMValueRef fetch_constant( buf = reg->Register.Dimension ? 
reg->Dimension.Index : 0; idx = reg->Register.Index * 4 + swizzle; - if (!reg->Register.Indirect) - return bitcast(bld_base, type, si_shader_ctx->constants[buf][idx]); + if (!reg->Register.Indirect && !reg->Dimension.Indirect) { + if (type != TGSI_TYPE_DOUBLE) + return bitcast(bld_base, type, si_shader_ctx->constants[buf][idx]); + else { + return radeon_llvm_emit_fetch_double(bld_base, + si_shader_ctx->constants[buf][idx], + si_shader_ctx->constants[buf][idx + 1]); + } + } + + if (reg->Register.Dimension && reg->Dimension.Indirect) { + LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST); + LLVMValueRef index; + index = get_indirect_index(si_shader_ctx, &reg->DimIndirect, + reg->Dimension.Index); + bufp = build_indexed_load_const(si_shader_ctx, ptr, index); + } else + bufp = si_shader_ctx->const_resource[buf]; addr = si_shader_ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle]; addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg"); @@ -703,10 +1221,26 @@ static LLVMValueRef fetch_constant( addr = lp_build_add(&bld_base->uint_bld, addr, lp_build_const_int32(base->gallivm, idx * 4)); - result = buffer_load_const(base->gallivm->builder, si_shader_ctx->const_resource[buf], - addr, base->elem_type); + result = buffer_load_const(base->gallivm->builder, bufp, + addr, bld_base->base.elem_type); + + if (type != TGSI_TYPE_DOUBLE) + result = bitcast(bld_base, type, result); + else { + LLVMValueRef addr2, result2; + addr2 = si_shader_ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle + 1]; + addr2 = LLVMBuildLoad(base->gallivm->builder, addr2, "load addr reg2"); + addr2 = lp_build_mul_imm(&bld_base->uint_bld, addr2, 16); + addr2 = lp_build_add(&bld_base->uint_bld, addr2, + lp_build_const_int32(base->gallivm, idx * 4)); + + result2 = buffer_load_const(base->gallivm->builder, si_shader_ctx->const_resource[buf], + addr2, bld_base->base.elem_type); - return bitcast(bld_base, type, result); + result = radeon_llvm_emit_fetch_double(bld_base, + result, result2); + } + return result; } /* Initialize arguments for the shader export intrinsic */ @@ -745,7 +1279,7 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base, args[0] = values[2 * chan]; args[1] = values[2 * chan + 1]; args[chan + 5] = - build_intrinsic(base->gallivm->builder, + lp_build_intrinsic(base->gallivm->builder, "llvm.SI.packf16", LLVMInt32TypeInContext(base->gallivm->context), args, 2, @@ -827,12 +1361,12 @@ static void si_alpha_test(struct lp_build_tgsi_context *bld_base, lp_build_const_float(gallivm, 1.0f), lp_build_const_float(gallivm, -1.0f)); - build_intrinsic(gallivm->builder, + lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill", LLVMVoidTypeInContext(gallivm->context), &arg, 1, 0); } else { - build_intrinsic(gallivm->builder, + lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kilp", LLVMVoidTypeInContext(gallivm->context), NULL, 0, 0); @@ -853,7 +1387,7 @@ static void si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base SI_PARAM_SAMPLE_COVERAGE); coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage); - coverage = build_intrinsic(gallivm->builder, "llvm.ctpop.i32", + coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32", bld_base->int_bld.elem_type, &coverage, 1, LLVMReadNoneAttribute); @@ -983,16 +1517,16 @@ static void build_tbuffer_store(struct si_shader_context *shader, lp_build_intrinsic(gallivm->builder, name, LLVMVoidTypeInContext(gallivm->context), - args, Elements(args)); + args, Elements(args), 0); } -static void 
build_streamout_store(struct si_shader_context *shader, - LLVMValueRef rsrc, - LLVMValueRef vdata, - unsigned num_channels, - LLVMValueRef vaddr, - LLVMValueRef soffset, - unsigned inst_offset) +static void build_tbuffer_store_dwords(struct si_shader_context *shader, + LLVMValueRef rsrc, + LLVMValueRef vdata, + unsigned num_channels, + LLVMValueRef vaddr, + LLVMValueRef soffset, + unsigned inst_offset) { static unsigned dfmt[] = { V_008F0C_BUF_DATA_FORMAT_32, @@ -1025,13 +1559,16 @@ static void si_llvm_emit_streamout(struct si_shader_context *shader, LLVMValueRef so_vtx_count = unpack_param(shader, shader->param_streamout_config, 16, 7); - LLVMValueRef tid = build_intrinsic(builder, "llvm.SI.tid", i32, + LLVMValueRef tid = lp_build_intrinsic(builder, "llvm.SI.tid", i32, NULL, 0, LLVMReadNoneAttribute); /* can_emit = tid < so_vtx_count; */ LLVMValueRef can_emit = LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, ""); + LLVMValueRef stream_id = + unpack_param(shader, shader->param_streamout_config, 24, 2); + /* Emit the streamout code conditionally. This actually avoids * out-of-bounds buffer access. The hw tells us via the SGPR * (so_vtx_count) which threads are allowed to emit streamout data. */ @@ -1071,7 +1608,9 @@ static void si_llvm_emit_streamout(struct si_shader_context *shader, unsigned reg = so->output[i].register_index; unsigned start = so->output[i].start_component; unsigned num_comps = so->output[i].num_components; + unsigned stream = so->output[i].stream; LLVMValueRef out[4]; + struct lp_build_if_state if_ctx_stream; assert(num_comps && num_comps <= 4); if (!num_comps || num_comps > 4) @@ -1105,11 +1644,18 @@ static void si_llvm_emit_streamout(struct si_shader_context *shader, break; } - build_streamout_store(shader, shader->so_buffers[buf_idx], - vdata, num_comps, - so_write_offset[buf_idx], - LLVMConstInt(i32, 0, 0), - so->output[i].dst_offset*4); + LLVMValueRef can_emit_stream = + LLVMBuildICmp(builder, LLVMIntEQ, + stream_id, + lp_build_const_int32(gallivm, stream), ""); + + lp_build_if(&if_ctx_stream, gallivm, can_emit_stream); + build_tbuffer_store_dwords(shader, shader->so_buffers[buf_idx], + vdata, num_comps, + so_write_offset[buf_idx], + LLVMConstInt(i32, 0, 0), + so->output[i].dst_offset*4); + lp_build_endif(&if_ctx_stream); } } lp_build_endif(&if_ctx); @@ -1128,7 +1674,7 @@ static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base, &si_shader_ctx->radeon_bld.soa.bld_base.uint_bld; LLVMValueRef args[9]; LLVMValueRef pos_args[4][9] = { { 0 } }; - LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL; + LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL; unsigned semantic_name, semantic_index; unsigned target; unsigned param_count = 0; @@ -1154,7 +1700,12 @@ handle_semantic: continue; case TGSI_SEMANTIC_LAYER: layer_value = outputs[i].values[0]; - continue; + semantic_name = TGSI_SEMANTIC_GENERIC; + goto handle_semantic; + case TGSI_SEMANTIC_VIEWPORT_INDEX: + viewport_index_value = outputs[i].values[0]; + semantic_name = TGSI_SEMANTIC_GENERIC; + goto handle_semantic; case TGSI_SEMANTIC_POSITION: target = V_008DFC_SQ_EXP_POS; break; @@ -1195,7 +1746,7 @@ handle_semantic: lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export", LLVMVoidTypeInContext(base->gallivm->context), - args, 9); + args, 9, 0); } if (semantic_name == TGSI_SEMANTIC_CLIPDIST) { @@ -1204,6 +1755,8 @@ handle_semantic: } } + shader->nr_param_exports = param_count; + /* We need to add the position output manually 
if it's missing. */ if (!pos_args[0][0]) { pos_args[0][0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */ @@ -1220,11 +1773,13 @@ handle_semantic: /* Write the misc vector (point size, edgeflag, layer, viewport). */ if (shader->selector->info.writes_psize || shader->selector->info.writes_edgeflag || + shader->selector->info.writes_viewport_index || shader->selector->info.writes_layer) { pos_args[1][0] = lp_build_const_int32(base->gallivm, /* writemask */ shader->selector->info.writes_psize | (shader->selector->info.writes_edgeflag << 1) | - (shader->selector->info.writes_layer << 2)); + (shader->selector->info.writes_layer << 2) | + (shader->selector->info.writes_viewport_index << 3)); pos_args[1][1] = uint->zero; /* EXEC mask */ pos_args[1][2] = uint->zero; /* last export? */ pos_args[1][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + 1); @@ -1255,6 +1810,9 @@ handle_semantic: if (shader->selector->info.writes_layer) pos_args[1][7] = layer_value; + + if (shader->selector->info.writes_viewport_index) + pos_args[1][8] = viewport_index_value; } for (i = 0; i < 4; i++) @@ -1276,7 +1834,133 @@ handle_semantic: lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export", LLVMVoidTypeInContext(base->gallivm->context), - pos_args[i], 9); + pos_args[i], 9, 0); + } +} + +/* This only writes the tessellation factor levels. */ +static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base) +{ + struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + struct gallivm_state *gallivm = bld_base->base.gallivm; + struct si_shader *shader = si_shader_ctx->shader; + unsigned tess_inner_index, tess_outer_index; + LLVMValueRef lds_base, lds_inner, lds_outer; + LLVMValueRef tf_base, rel_patch_id, byteoffset, buffer, rw_buffers; + LLVMValueRef out[6], vec0, vec1, invocation_id; + unsigned stride, outer_comps, inner_comps, i; + struct lp_build_if_state if_ctx; + + invocation_id = unpack_param(si_shader_ctx, SI_PARAM_REL_IDS, 8, 5); + + /* Do this only for invocation 0, because the tess levels are per-patch, + * not per-vertex. + * + * This can't jump, because invocation 0 executes this. It should + * at least mask out the loads and stores for other invocations. + */ + lp_build_if(&if_ctx, gallivm, + LLVMBuildICmp(gallivm->builder, LLVMIntEQ, + invocation_id, bld_base->uint_bld.zero, "")); + + /* Determine the layout of one tess factor element in the buffer. */ + switch (shader->key.tcs.prim_mode) { + case PIPE_PRIM_LINES: + stride = 2; /* 2 dwords, 1 vec2 store */ + outer_comps = 2; + inner_comps = 0; + break; + case PIPE_PRIM_TRIANGLES: + stride = 4; /* 4 dwords, 1 vec4 store */ + outer_comps = 3; + inner_comps = 1; + break; + case PIPE_PRIM_QUADS: + stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */ + outer_comps = 4; + inner_comps = 2; + break; + default: + assert(0); + return; + } + + /* Load tess_inner and tess_outer from LDS. + * Any invocation can write them, so we can't get them from a temporary. 
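+ * e.g. for PIPE_PRIM_TRIANGLES (stride = 4) the levels end up as
+ * out[0..2] = outer and out[3] = inner, written below with a single
+ * vec4 store at tf_base + rel_patch_id * 16 bytes.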
+ */ + tess_inner_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0); + tess_outer_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0); + + lds_base = get_tcs_out_current_patch_data_offset(si_shader_ctx); + lds_inner = LLVMBuildAdd(gallivm->builder, lds_base, + lp_build_const_int32(gallivm, + tess_inner_index * 4), ""); + lds_outer = LLVMBuildAdd(gallivm->builder, lds_base, + lp_build_const_int32(gallivm, + tess_outer_index * 4), ""); + + for (i = 0; i < outer_comps; i++) + out[i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer); + for (i = 0; i < inner_comps; i++) + out[outer_comps+i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner); + + /* Convert the outputs to vectors for stores. */ + vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4)); + vec1 = NULL; + + if (stride > 4) + vec1 = lp_build_gather_values(gallivm, out+4, stride - 4); + + /* Get the buffer. */ + rw_buffers = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, + SI_PARAM_RW_BUFFERS); + buffer = build_indexed_load_const(si_shader_ctx, rw_buffers, + lp_build_const_int32(gallivm, SI_RING_TESS_FACTOR)); + + /* Get the offset. */ + tf_base = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, + SI_PARAM_TESS_FACTOR_OFFSET); + rel_patch_id = get_rel_patch_id(si_shader_ctx); + byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id, + lp_build_const_int32(gallivm, 4 * stride), ""); + + /* Store the outputs. */ + build_tbuffer_store_dwords(si_shader_ctx, buffer, vec0, + MIN2(stride, 4), byteoffset, tf_base, 0); + if (vec1) + build_tbuffer_store_dwords(si_shader_ctx, buffer, vec1, + stride - 4, byteoffset, tf_base, 16); + lp_build_endif(&if_ctx); +} + +static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context * bld_base) +{ + struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + struct si_shader *shader = si_shader_ctx->shader; + struct tgsi_shader_info *info = &shader->selector->info; + struct gallivm_state *gallivm = bld_base->base.gallivm; + unsigned i, chan; + LLVMValueRef vertex_id = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, + si_shader_ctx->param_rel_auto_id); + LLVMValueRef vertex_dw_stride = + unpack_param(si_shader_ctx, SI_PARAM_LS_OUT_LAYOUT, 13, 8); + LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id, + vertex_dw_stride, ""); + + /* Write outputs to LDS. The next shader (TCS aka HS) will read + * its inputs from it. */ + for (i = 0; i < info->num_outputs; i++) { + LLVMValueRef *out_ptr = si_shader_ctx->radeon_bld.soa.outputs[i]; + unsigned name = info->output_semantic_name[i]; + unsigned index = info->output_semantic_index[i]; + int param = si_shader_io_get_unique_index(name, index); + LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr, + lp_build_const_int32(gallivm, param * 4), ""); + + for (chan = 0; chan < 4; chan++) { + lds_store(bld_base, chan, dw_addr, + LLVMBuildLoad(gallivm->builder, out_ptr[chan], "")); + } } } @@ -1288,17 +1972,25 @@ static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context * bld_base) struct tgsi_shader_info *info = &es->selector->info; LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context); LLVMValueRef soffset = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, - SI_PARAM_ES2GS_OFFSET); + si_shader_ctx->param_es2gs_offset); + uint64_t enabled_outputs = si_shader_ctx->type == TGSI_PROCESSOR_TESS_EVAL ? 
+ es->key.tes.es_enabled_outputs : + es->key.vs.es_enabled_outputs; unsigned chan; int i; for (i = 0; i < info->num_outputs; i++) { LLVMValueRef *out_ptr = si_shader_ctx->radeon_bld.soa.outputs[i]; - int param_index = get_param_index(info->output_semantic_name[i], - info->output_semantic_index[i], - es->key.vs.gs_used_inputs); + int param_index; + if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX || + info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER) + continue; + + param_index = get_param_index(info->output_semantic_name[i], + info->output_semantic_index[i], + enabled_outputs); if (param_index < 0) continue; @@ -1326,7 +2018,7 @@ static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base) args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_NOP | SENDMSG_GS_DONE); args[1] = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID); - build_intrinsic(gallivm->builder, "llvm.SI.sendmsg", + lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg", LLVMVoidTypeInContext(gallivm->context), args, 2, LLVMNoUnwindAttribute); } @@ -1339,7 +2031,7 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context * bld_base) struct si_shader_output_values *outputs = NULL; int i,j; - outputs = MALLOC(info->num_outputs * sizeof(outputs[0])); + outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0])); for (i = 0; i < info->num_outputs; i++) { outputs[i].name = info->output_semantic_name[i]; @@ -1352,7 +2044,19 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context * bld_base) ""); } - si_llvm_export_vs(bld_base, outputs, info->num_outputs); + /* Export PrimitiveID when PS needs it. */ + if (si_vs_exports_prim_id(si_shader_ctx->shader)) { + outputs[i].name = TGSI_SEMANTIC_PRIMID; + outputs[i].sid = 0; + outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT, + get_primitive_id(bld_base, 0)); + outputs[i].values[1] = bld_base->base.undef; + outputs[i].values[2] = bld_base->base.undef; + outputs[i].values[3] = bld_base->base.undef; + i++; + } + + si_llvm_export_vs(bld_base, outputs, i); FREE(outputs); } @@ -1417,7 +2121,7 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base) lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export", LLVMVoidTypeInContext(base->gallivm->context), - last_args, 9); + last_args, 9, 0); } /* This instruction will be emitted at the end of the shader. 
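 * (args[2] of llvm.SI.export is the "done"/last-export flag -- cf. the
 * "last export?" note on pos_args earlier -- which is why the closing
 * export is kept in last_args and only emitted once all others are out.)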
*/ @@ -1434,14 +2138,14 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base) lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export", LLVMVoidTypeInContext(base->gallivm->context), - args, 9); + args, 9, 0); } } } else { lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export", LLVMVoidTypeInContext(base->gallivm->context), - args, 9); + args, 9, 0); } } @@ -1503,7 +2207,7 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base) lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export", LLVMVoidTypeInContext(base->gallivm->context), - args, 9); + args, 9, 0); else memcpy(last_args, args, sizeof(args)); } @@ -1534,7 +2238,7 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base) lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export", LLVMVoidTypeInContext(base->gallivm->context), - last_args, 9); + last_args, 9, 0); } static void build_tex_intrinsic(const struct lp_build_tgsi_action * action, @@ -1563,15 +2267,36 @@ static void tex_fetch_args( const struct tgsi_full_instruction * inst = emit_data->inst; unsigned opcode = inst->Instruction.Opcode; unsigned target = inst->Texture.Texture; - LLVMValueRef coords[5]; + LLVMValueRef coords[5], derivs[6]; LLVMValueRef address[16]; int ref_pos; unsigned num_coords = tgsi_util_get_texture_coord_dim(target, &ref_pos); unsigned count = 0; unsigned chan; - unsigned sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1; - unsigned sampler_index = emit_data->inst->Src[sampler_src].Register.Index; + unsigned sampler_src; + unsigned sampler_index; + unsigned num_deriv_channels = 0; bool has_offset = HAVE_LLVM >= 0x0305 ? inst->Texture.NumOffsets > 0 : false; + LLVMValueRef res_ptr, samp_ptr; + + sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1; + sampler_index = emit_data->inst->Src[sampler_src].Register.Index; + + if (emit_data->inst->Src[sampler_src].Register.Indirect) { + const struct tgsi_full_src_register *reg = &emit_data->inst->Src[sampler_src]; + LLVMValueRef ind_index; + + ind_index = get_indirect_index(si_shader_ctx, ®->Indirect, reg->Register.Index); + + res_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_RESOURCE); + res_ptr = build_indexed_load_const(si_shader_ctx, res_ptr, ind_index); + + samp_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER); + samp_ptr = build_indexed_load_const(si_shader_ctx, samp_ptr, ind_index); + } else { + res_ptr = si_shader_ctx->resources[sampler_index]; + samp_ptr = si_shader_ctx->samplers[sampler_index]; + } if (target == TGSI_TEXTURE_BUFFER) { LLVMTypeRef i128 = LLVMIntTypeInContext(gallivm->context, 128); @@ -1580,7 +2305,7 @@ static void tex_fetch_args( LLVMTypeRef v16i8 = LLVMVectorType(i8, 16); /* Bitcast and truncate v8i32 to v16i8. 
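 * The 256-bit sampler view is reinterpreted as v2i128; element 1 selects
 * its upper four dwords, which hold the 128-bit buffer descriptor that
 * llvm.SI.vs.load.input consumes.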
*/ - LLVMValueRef res = si_shader_ctx->resources[sampler_index]; + LLVMValueRef res = res_ptr; res = LLVMBuildBitCast(gallivm->builder, res, v2i128, ""); res = LLVMBuildExtractElement(gallivm->builder, res, bld_base->uint_bld.one, ""); res = LLVMBuildBitCast(gallivm->builder, res, v16i8, ""); @@ -1649,18 +2374,13 @@ static void tex_fetch_args( } } - if (target == TGSI_TEXTURE_CUBE || - target == TGSI_TEXTURE_CUBE_ARRAY || - target == TGSI_TEXTURE_SHADOWCUBE || - target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) - radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords); - /* Pack user derivatives */ if (opcode == TGSI_OPCODE_TXD) { - int num_deriv_channels, param; + int param, num_src_deriv_channels; switch (target) { case TGSI_TEXTURE_3D: + num_src_deriv_channels = 3; num_deriv_channels = 3; break; case TGSI_TEXTURE_2D: @@ -1669,27 +2389,44 @@ static void tex_fetch_args( case TGSI_TEXTURE_SHADOWRECT: case TGSI_TEXTURE_2D_ARRAY: case TGSI_TEXTURE_SHADOW2D_ARRAY: + num_src_deriv_channels = 2; + num_deriv_channels = 2; + break; case TGSI_TEXTURE_CUBE: case TGSI_TEXTURE_SHADOWCUBE: case TGSI_TEXTURE_CUBE_ARRAY: case TGSI_TEXTURE_SHADOWCUBE_ARRAY: + /* Cube derivatives will be converted to 2D. */ + num_src_deriv_channels = 3; num_deriv_channels = 2; break; case TGSI_TEXTURE_1D: case TGSI_TEXTURE_SHADOW1D: case TGSI_TEXTURE_1D_ARRAY: case TGSI_TEXTURE_SHADOW1D_ARRAY: + num_src_deriv_channels = 1; num_deriv_channels = 1; break; default: assert(0); /* no other targets are valid here */ } - for (param = 1; param <= 2; param++) - for (chan = 0; chan < num_deriv_channels; chan++) - address[count++] = lp_build_emit_fetch(bld_base, inst, param, chan); + for (param = 0; param < 2; param++) + for (chan = 0; chan < num_src_deriv_channels; chan++) + derivs[param * num_src_deriv_channels + chan] = + lp_build_emit_fetch(bld_base, inst, param+1, chan); } + if (target == TGSI_TEXTURE_CUBE || + target == TGSI_TEXTURE_CUBE_ARRAY || + target == TGSI_TEXTURE_SHADOWCUBE || + target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) + radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords, derivs); + + if (opcode == TGSI_OPCODE_TXD) + for (int i = 0; i < num_deriv_channels * 2; i++) + address[count++] = derivs[i]; + /* Pack texture coordinates */ address[count++] = coords[0]; if (num_coords > 1) @@ -1806,7 +2543,7 @@ static void tex_fetch_args( } /* Resource */ - emit_data->args[1] = si_shader_ctx->resources[sampler_index]; + emit_data->args[1] = res_ptr; if (opcode == TGSI_OPCODE_TXF) { /* add tex offsets */ @@ -1889,7 +2626,7 @@ static void tex_fetch_args( dmask = 1 << gather_comp; } - emit_data->args[2] = si_shader_ctx->samplers[sampler_index]; + emit_data->args[2] = samp_ptr; emit_data->args[3] = lp_build_const_int32(gallivm, dmask); emit_data->args[4] = lp_build_const_int32(gallivm, is_rect); /* unorm */ emit_data->args[5] = lp_build_const_int32(gallivm, 0); /* r128 */ @@ -1905,7 +2642,7 @@ static void tex_fetch_args( LLVMFloatTypeInContext(gallivm->context), 4); } else { - emit_data->args[2] = si_shader_ctx->samplers[sampler_index]; + emit_data->args[2] = samp_ptr; emit_data->args[3] = lp_build_const_int32(gallivm, target); emit_data->arg_count = 4; @@ -1940,7 +2677,7 @@ static void build_tex_intrinsic(const struct lp_build_tgsi_action * action, emit_data->inst->Texture.NumOffsets > 0 : false; if (target == TGSI_TEXTURE_BUFFER) { - emit_data->output[emit_data->chan] = build_intrinsic( + emit_data->output[emit_data->chan] = lp_build_intrinsic( base->gallivm->builder, "llvm.SI.vs.load.input", emit_data->dst_type, 
emit_data->args, emit_data->arg_count, @@ -1989,7 +2726,7 @@ static void build_tex_intrinsic(const struct lp_build_tgsi_action * action, is_shadow ? ".c" : "", infix, has_offset ? ".o" : "", LLVMGetVectorSize(LLVMTypeOf(emit_data->args[0]))); - emit_data->output[emit_data->chan] = build_intrinsic( + emit_data->output[emit_data->chan] = lp_build_intrinsic( base->gallivm->builder, intr_name, emit_data->dst_type, emit_data->args, emit_data->arg_count, LLVMReadNoneAttribute | LLVMNoUnwindAttribute); @@ -2036,7 +2773,7 @@ static void build_tex_intrinsic(const struct lp_build_tgsi_action * action, sprintf(intr_name, "%s.v%ui32", name, LLVMGetVectorSize(LLVMTypeOf(emit_data->args[0]))); - emit_data->output[emit_data->chan] = build_intrinsic( + emit_data->output[emit_data->chan] = lp_build_intrinsic( base->gallivm->builder, intr_name, emit_data->dst_type, emit_data->args, emit_data->arg_count, LLVMReadNoneAttribute | LLVMNoUnwindAttribute); @@ -2050,17 +2787,47 @@ static void txq_fetch_args( struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); const struct tgsi_full_instruction *inst = emit_data->inst; struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMBuilderRef builder = gallivm->builder; unsigned target = inst->Texture.Texture; + LLVMValueRef res_ptr; + + if (inst->Src[1].Register.Indirect) { + const struct tgsi_full_src_register *reg = &inst->Src[1]; + LLVMValueRef ind_index; + + ind_index = get_indirect_index(si_shader_ctx, ®->Indirect, reg->Register.Index); + + res_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_RESOURCE); + res_ptr = build_indexed_load_const(si_shader_ctx, res_ptr, + ind_index); + } else + res_ptr = si_shader_ctx->resources[inst->Src[1].Register.Index]; if (target == TGSI_TEXTURE_BUFFER) { LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context); LLVMTypeRef v8i32 = LLVMVectorType(i32, 8); /* Read the size from the buffer descriptor directly. */ - LLVMValueRef size = si_shader_ctx->resources[inst->Src[1].Register.Index]; - size = LLVMBuildBitCast(gallivm->builder, size, v8i32, ""); - size = LLVMBuildExtractElement(gallivm->builder, size, - lp_build_const_int32(gallivm, 6), ""); + LLVMValueRef res = LLVMBuildBitCast(builder, res_ptr, v8i32, ""); + LLVMValueRef size = LLVMBuildExtractElement(builder, res, + lp_build_const_int32(gallivm, 6), ""); + + if (si_shader_ctx->screen->b.chip_class >= VI) { + /* On VI, the descriptor contains the size in bytes, + * but TXQ must return the size in elements. + * The stride is always non-zero for resources using TXQ. + */ + LLVMValueRef stride = + LLVMBuildExtractElement(builder, res, + lp_build_const_int32(gallivm, 5), ""); + stride = LLVMBuildLShr(builder, stride, + lp_build_const_int32(gallivm, 16), ""); + stride = LLVMBuildAnd(builder, stride, + lp_build_const_int32(gallivm, 0x3FFF), ""); + + size = LLVMBuildUDiv(builder, size, stride, ""); + } + emit_data->args[0] = size; return; } @@ -2069,7 +2836,7 @@ static void txq_fetch_args( emit_data->args[0] = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X); /* Resource */ - emit_data->args[1] = si_shader_ctx->resources[inst->Src[1].Register.Index]; + emit_data->args[1] = res_ptr; /* Texture target */ if (target == TGSI_TEXTURE_CUBE_ARRAY || @@ -2116,6 +2883,35 @@ static void build_txq_intrinsic(const struct lp_build_tgsi_action * action, } } +/* + * SI implements derivatives using the local data store (LDS) + * All writes to the LDS happen in all executing threads at + * the same time. 
TID is the Thread ID for the current + * thread and is a value between 0 and 63, representing + * the thread's position in the wavefront. + * + * For the pixel shader, threads are grouped into quads of four pixels. + * The TIDs of the pixels of a quad are: + * + * +------+------+ + * |4n + 0|4n + 1| + * +------+------+ + * |4n + 2|4n + 3| + * +------+------+ + * + * So, masking the TID with 0xfffffffc yields the TID of the top left pixel + * of the quad, masking with 0xfffffffd yields the TID of the top pixel of + * the current pixel's column, and masking with 0xfffffffe yields the TID + * of the left pixel of the current pixel's row. + * + * Adding 1 yields the TID of the pixel to the right of the left pixel, and + * adding 2 yields the TID of the pixel below the top pixel. + */ +/* masks for thread ID. */ +#define TID_MASK_TOP_LEFT 0xfffffffc +#define TID_MASK_TOP 0xfffffffd +#define TID_MASK_LEFT 0xfffffffe + static void si_llvm_emit_ddxy( const struct lp_build_tgsi_action * action, struct lp_build_tgsi_context * bld_base, @@ -2132,25 +2928,34 @@ static void si_llvm_emit_ddxy( LLVMTypeRef i32; unsigned swizzle[4]; unsigned c; + int idx; + unsigned mask; i32 = LLVMInt32TypeInContext(gallivm->context); indices[0] = bld_base->uint_bld.zero; - indices[1] = build_intrinsic(gallivm->builder, "llvm.SI.tid", i32, + indices[1] = lp_build_intrinsic(gallivm->builder, "llvm.SI.tid", i32, NULL, 0, LLVMReadNoneAttribute); - store_ptr = LLVMBuildGEP(gallivm->builder, si_shader_ctx->ddxy_lds, + store_ptr = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds, indices, 2, ""); + if (opcode == TGSI_OPCODE_DDX_FINE) + mask = TID_MASK_LEFT; + else if (opcode == TGSI_OPCODE_DDY_FINE) + mask = TID_MASK_TOP; + else + mask = TID_MASK_TOP_LEFT; + indices[1] = LLVMBuildAnd(gallivm->builder, indices[1], - lp_build_const_int32(gallivm, 0xfffffffc), ""); - load_ptr0 = LLVMBuildGEP(gallivm->builder, si_shader_ctx->ddxy_lds, + lp_build_const_int32(gallivm, mask), ""); + load_ptr0 = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds, indices, 2, ""); + /* For DDX we want the next X pixel, for DDY the next Y pixel. */ + idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2; indices[1] = LLVMBuildAdd(gallivm->builder, indices[1], - lp_build_const_int32(gallivm, - opcode == TGSI_OPCODE_DDX ? 1 : 2), - ""); - load_ptr1 = LLVMBuildGEP(gallivm->builder, si_shader_ctx->ddxy_lds, + lp_build_const_int32(gallivm, idx), ""); + load_ptr1 = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds, indices, 2, ""); for (c = 0; c < 4; ++c) { @@ -2184,6 +2989,247 @@ static void si_llvm_emit_ddxy( emit_data->output[0] = lp_build_gather_values(gallivm, result, 4); } +/* + * This takes an I,J coordinate pair, + * and works out the X and Y derivatives. + * It returns DDX(I), DDX(J), DDY(I), DDY(J).
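+ * The method mirrors si_llvm_emit_ddxy above: each thread stores its
+ * value to LDS at its TID, then per channel
+ *   DDX = lds[(tid & TID_MASK_LEFT) + 1] - lds[tid & TID_MASK_LEFT]
+ *   DDY = lds[(tid & TID_MASK_TOP) + 2] - lds[tid & TID_MASK_TOP]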
+ */ +static LLVMValueRef si_llvm_emit_ddxy_interp( + struct lp_build_tgsi_context *bld_base, + LLVMValueRef interp_ij) +{ + struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + struct gallivm_state *gallivm = bld_base->base.gallivm; + struct lp_build_context *base = &bld_base->base; + LLVMValueRef indices[2]; + LLVMValueRef store_ptr, load_ptr_x, load_ptr_y, load_ptr_ddx, load_ptr_ddy, temp, temp2; + LLVMValueRef tl, tr, bl, result[4]; + LLVMTypeRef i32; + unsigned c; + + i32 = LLVMInt32TypeInContext(gallivm->context); + + indices[0] = bld_base->uint_bld.zero; + indices[1] = lp_build_intrinsic(gallivm->builder, "llvm.SI.tid", i32, + NULL, 0, LLVMReadNoneAttribute); + store_ptr = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds, + indices, 2, ""); + + temp = LLVMBuildAnd(gallivm->builder, indices[1], + lp_build_const_int32(gallivm, TID_MASK_LEFT), ""); + + temp2 = LLVMBuildAnd(gallivm->builder, indices[1], + lp_build_const_int32(gallivm, TID_MASK_TOP), ""); + + indices[1] = temp; + load_ptr_x = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds, + indices, 2, ""); + + indices[1] = temp2; + load_ptr_y = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds, + indices, 2, ""); + + indices[1] = LLVMBuildAdd(gallivm->builder, temp, + lp_build_const_int32(gallivm, 1), ""); + load_ptr_ddx = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds, + indices, 2, ""); + + indices[1] = LLVMBuildAdd(gallivm->builder, temp2, + lp_build_const_int32(gallivm, 2), ""); + load_ptr_ddy = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds, + indices, 2, ""); + + for (c = 0; c < 2; ++c) { + LLVMValueRef store_val; + LLVMValueRef c_ll = lp_build_const_int32(gallivm, c); + + store_val = LLVMBuildExtractElement(gallivm->builder, + interp_ij, c_ll, ""); + LLVMBuildStore(gallivm->builder, + store_val, + store_ptr); + + tl = LLVMBuildLoad(gallivm->builder, load_ptr_x, ""); + tl = LLVMBuildBitCast(gallivm->builder, tl, base->elem_type, ""); + + tr = LLVMBuildLoad(gallivm->builder, load_ptr_ddx, ""); + tr = LLVMBuildBitCast(gallivm->builder, tr, base->elem_type, ""); + + result[c] = LLVMBuildFSub(gallivm->builder, tr, tl, ""); + + tl = LLVMBuildLoad(gallivm->builder, load_ptr_y, ""); + tl = LLVMBuildBitCast(gallivm->builder, tl, base->elem_type, ""); + + bl = LLVMBuildLoad(gallivm->builder, load_ptr_ddy, ""); + bl = LLVMBuildBitCast(gallivm->builder, bl, base->elem_type, ""); + + result[c + 2] = LLVMBuildFSub(gallivm->builder, bl, tl, ""); + } + + return lp_build_gather_values(gallivm, result, 4); +} + +static void interp_fetch_args( + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) +{ + struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + struct gallivm_state *gallivm = bld_base->base.gallivm; + const struct tgsi_full_instruction *inst = emit_data->inst; + + if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) { + /* offset is in second src, first two channels */ + emit_data->args[0] = lp_build_emit_fetch(bld_base, + emit_data->inst, 1, + 0); + emit_data->args[1] = lp_build_emit_fetch(bld_base, + emit_data->inst, 1, + 1); + emit_data->arg_count = 2; + } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { + LLVMValueRef sample_position; + LLVMValueRef sample_id; + LLVMValueRef halfval = lp_build_const_float(gallivm, 0.5f); + + /* fetch sample ID, then fetch its sample position, + * and place into first two channels. 
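+ * For sample s this yields
+ *   args[0] = sample_position[s].x - 0.5
+ *   args[1] = sample_position[s].y - 0.5
+ * i.e. an offset from the pixel center, the same form that
+ * TGSI_OPCODE_INTERP_OFFSET supplies directly.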
+ */ + sample_id = lp_build_emit_fetch(bld_base, + emit_data->inst, 1, 0); + sample_id = LLVMBuildBitCast(gallivm->builder, sample_id, + LLVMInt32TypeInContext(gallivm->context), + ""); + sample_position = load_sample_position(&si_shader_ctx->radeon_bld, sample_id); + + emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder, + sample_position, + lp_build_const_int32(gallivm, 0), ""); + + emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, ""); + emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder, + sample_position, + lp_build_const_int32(gallivm, 1), ""); + emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, ""); + emit_data->arg_count = 2; + } +} + +static void build_interp_intrinsic(const struct lp_build_tgsi_action *action, + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) +{ + struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + struct si_shader *shader = si_shader_ctx->shader; + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMValueRef interp_param; + const struct tgsi_full_instruction *inst = emit_data->inst; + const char *intr_name; + int input_index; + int chan; + int i; + LLVMValueRef attr_number; + LLVMTypeRef input_type = LLVMFloatTypeInContext(gallivm->context); + LLVMValueRef params = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_PRIM_MASK); + int interp_param_idx; + unsigned location; + + assert(inst->Src[0].Register.File == TGSI_FILE_INPUT); + input_index = inst->Src[0].Register.Index; + + if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET || + inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) + location = TGSI_INTERPOLATE_LOC_CENTER; + else + location = TGSI_INTERPOLATE_LOC_CENTROID; + + interp_param_idx = lookup_interp_param_index(shader->ps_input_interpolate[input_index], + location); + if (interp_param_idx == -1) + return; + else if (interp_param_idx) + interp_param = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, interp_param_idx); + else + interp_param = NULL; + + attr_number = lp_build_const_int32(gallivm, + shader->ps_input_param_offset[input_index]); + + if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET || + inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { + LLVMValueRef ij_out[2]; + LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param); + + /* + * take the I then J parameters, and the DDX/Y for it, and + * calculate the IJ inputs for the interpolator. 
+ * temp1 = ddx * offset/sample.x + I; + * interp_param.I = ddy * offset/sample.y + temp1; + * temp1 = ddx * offset/sample.x + J; + * interp_param.J = ddy * offset/sample.y + temp1; + */ + for (i = 0; i < 2; i++) { + LLVMValueRef ix_ll = lp_build_const_int32(gallivm, i); + LLVMValueRef iy_ll = lp_build_const_int32(gallivm, i + 2); + LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder, + ddxy_out, ix_ll, ""); + LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder, + ddxy_out, iy_ll, ""); + LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder, + interp_param, ix_ll, ""); + LLVMValueRef temp1, temp2; + + interp_el = LLVMBuildBitCast(gallivm->builder, interp_el, + LLVMFloatTypeInContext(gallivm->context), ""); + + temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], ""); + + temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, ""); + + temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], ""); + + temp2 = LLVMBuildFAdd(gallivm->builder, temp2, temp1, ""); + + ij_out[i] = LLVMBuildBitCast(gallivm->builder, + temp2, + LLVMIntTypeInContext(gallivm->context, 32), ""); + } + interp_param = lp_build_gather_values(bld_base->base.gallivm, ij_out, 2); + } + + intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant"; + for (chan = 0; chan < 2; chan++) { + LLVMValueRef args[4]; + LLVMValueRef llvm_chan; + unsigned schan; + + schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan); + llvm_chan = lp_build_const_int32(gallivm, schan); + + args[0] = llvm_chan; + args[1] = attr_number; + args[2] = params; + args[3] = interp_param; + + emit_data->output[chan] = + lp_build_intrinsic(gallivm->builder, intr_name, + input_type, args, args[3] ? 4 : 3, + LLVMReadNoneAttribute | LLVMNoUnwindAttribute); + } +} + +static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) +{ + LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates; + struct tgsi_src_register src0 = emit_data->inst->Src[0].Register; + unsigned stream; + + assert(src0.File == TGSI_FILE_IMMEDIATE); + + stream = LLVMConstIntGetZExtValue(imms[src0.Index][src0.SwizzleX]) & 0x3; + return stream; +} + /* Emit one vertex from the geometry shader */ static void si_llvm_emit_vertex( const struct lp_build_tgsi_action *action, @@ -2203,9 +3249,14 @@ static void si_llvm_emit_vertex( LLVMValueRef args[2]; unsigned chan; int i; + unsigned stream; + + stream = si_llvm_get_stream(bld_base, emit_data); /* Write vertex attribute values to GSVS ring */ - gs_next_vertex = LLVMBuildLoad(gallivm->builder, si_shader_ctx->gs_next_vertex, ""); + gs_next_vertex = LLVMBuildLoad(gallivm->builder, + si_shader_ctx->gs_next_vertex[stream], + ""); /* If this thread has already emitted the declared maximum number of * vertices, kill it: excessive vertex emissions are not supposed to @@ -2218,8 +3269,9 @@ static void si_llvm_emit_vertex( kill = lp_build_select(&bld_base->base, can_emit, lp_build_const_float(gallivm, 1.0f), lp_build_const_float(gallivm, -1.0f)); - build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill", - LLVMVoidTypeInContext(gallivm->context), &kill, 1, 0); + + lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill", + LLVMVoidTypeInContext(gallivm->context), &kill, 1, 0); for (i = 0; i < info->num_outputs; i++) { LLVMValueRef *out_ptr = @@ -2237,7 +3289,7 @@ static void si_llvm_emit_vertex( out_val = LLVMBuildBitCast(gallivm->builder, out_val, i32, ""); build_tbuffer_store(si_shader_ctx, - si_shader_ctx->gsvs_ring, 
+ si_shader_ctx->gsvs_ring[stream], out_val, 1, voffset, soffset, 0, V_008F0C_BUF_DATA_FORMAT_32, @@ -2247,12 +3299,13 @@ static void si_llvm_emit_vertex( } gs_next_vertex = lp_build_add(uint, gs_next_vertex, lp_build_const_int32(gallivm, 1)); - LLVMBuildStore(gallivm->builder, gs_next_vertex, si_shader_ctx->gs_next_vertex); + + LLVMBuildStore(gallivm->builder, gs_next_vertex, si_shader_ctx->gs_next_vertex[stream]); /* Signal vertex emission */ - args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | SENDMSG_GS); + args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | SENDMSG_GS | (stream << 8)); args[1] = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID); - build_intrinsic(gallivm->builder, "llvm.SI.sendmsg", + lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg", LLVMVoidTypeInContext(gallivm->context), args, 2, LLVMNoUnwindAttribute); } @@ -2266,15 +3319,28 @@ static void si_llvm_emit_primitive( struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); struct gallivm_state *gallivm = bld_base->base.gallivm; LLVMValueRef args[2]; + unsigned stream; /* Signal primitive cut */ - args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_CUT | SENDMSG_GS); + stream = si_llvm_get_stream(bld_base, emit_data); + args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_CUT | SENDMSG_GS | (stream << 8)); args[1] = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID); - build_intrinsic(gallivm->builder, "llvm.SI.sendmsg", + lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg", LLVMVoidTypeInContext(gallivm->context), args, 2, LLVMNoUnwindAttribute); } +static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action, + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) +{ + struct gallivm_state *gallivm = bld_base->base.gallivm; + + lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.barrier.local", + LLVMVoidTypeInContext(gallivm->context), NULL, 0, + LLVMNoUnwindAttribute); +} + static const struct lp_build_tgsi_action tex_action = { .fetch_args = tex_fetch_args, .emit = build_tex_intrinsic, @@ -2286,6 +3352,11 @@ static const struct lp_build_tgsi_action txq_action = { .intr_name = "llvm.SI.resinfo" }; +static const struct lp_build_tgsi_action interp_action = { + .fetch_args = interp_fetch_args, + .emit = build_interp_intrinsic, +}; + static void create_meta_data(struct si_shader_context *si_shader_ctx) { struct gallivm_state *gallivm = si_shader_ctx->radeon_bld.soa.bld_base.base.gallivm; @@ -2304,6 +3375,27 @@ static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements) CONST_ADDR_SPACE); } +static void declare_streamout_params(struct si_shader_context *si_shader_ctx, + struct pipe_stream_output_info *so, + LLVMTypeRef *params, LLVMTypeRef i32, + unsigned *num_params) +{ + int i; + + /* Streamout SGPRs. */ + if (so->num_outputs) { + params[si_shader_ctx->param_streamout_config = (*num_params)++] = i32; + params[si_shader_ctx->param_streamout_write_index = (*num_params)++] = i32; + } + /* A streamout buffer offset is loaded if the stride is non-zero. 
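+ * e.g. a shader writing to one streamout buffer (only stride[0]
+ * non-zero) receives three SGPRs here: streamout_config,
+ * streamout_write_index and streamout_offset[0].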
*/ + for (i = 0; i < 4; i++) { + if (!so->stride[i]) + continue; + + params[si_shader_ctx->param_streamout_offset[i] = (*num_params)++] = i32; + } +} + static void create_function(struct si_shader_context *si_shader_ctx) { struct lp_build_tgsi_context *bld_base = &si_shader_ctx->radeon_bld.soa.bld_base; @@ -2336,8 +3428,10 @@ static void create_function(struct si_shader_context *si_shader_ctx) num_params = SI_PARAM_START_INSTANCE+1; if (shader->key.vs.as_es) { - params[SI_PARAM_ES2GS_OFFSET] = i32; - num_params++; + params[si_shader_ctx->param_es2gs_offset = num_params++] = i32; + } else if (shader->key.vs.as_ls) { + params[SI_PARAM_LS_OUT_LAYOUT] = i32; + num_params = SI_PARAM_LS_OUT_LAYOUT+1; } else { if (shader->is_gs_copy_shader) { last_array_pointer = SI_PARAM_CONST; @@ -2345,30 +3439,52 @@ static void create_function(struct si_shader_context *si_shader_ctx) } /* The locations of the other parameters are assigned dynamically. */ - - /* Streamout SGPRs. */ - if (shader->selector->so.num_outputs) { - params[si_shader_ctx->param_streamout_config = num_params++] = i32; - params[si_shader_ctx->param_streamout_write_index = num_params++] = i32; - } - /* A streamout buffer offset is loaded if the stride is non-zero. */ - for (i = 0; i < 4; i++) { - if (!shader->selector->so.stride[i]) - continue; - - params[si_shader_ctx->param_streamout_offset[i] = num_params++] = i32; - } + declare_streamout_params(si_shader_ctx, &shader->selector->so, + params, i32, &num_params); } last_sgpr = num_params-1; /* VGPRs */ params[si_shader_ctx->param_vertex_id = num_params++] = i32; - params[num_params++] = i32; /* unused*/ - params[num_params++] = i32; /* unused */ + params[si_shader_ctx->param_rel_auto_id = num_params++] = i32; + params[si_shader_ctx->param_vs_prim_id = num_params++] = i32; params[si_shader_ctx->param_instance_id = num_params++] = i32; break; + case TGSI_PROCESSOR_TESS_CTRL: + params[SI_PARAM_TCS_OUT_OFFSETS] = i32; + params[SI_PARAM_TCS_OUT_LAYOUT] = i32; + params[SI_PARAM_TCS_IN_LAYOUT] = i32; + params[SI_PARAM_TESS_FACTOR_OFFSET] = i32; + last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET; + + /* VGPRs */ + params[SI_PARAM_PATCH_ID] = i32; + params[SI_PARAM_REL_IDS] = i32; + num_params = SI_PARAM_REL_IDS+1; + break; + + case TGSI_PROCESSOR_TESS_EVAL: + params[SI_PARAM_TCS_OUT_OFFSETS] = i32; + params[SI_PARAM_TCS_OUT_LAYOUT] = i32; + num_params = SI_PARAM_TCS_OUT_LAYOUT+1; + + if (shader->key.tes.as_es) { + params[si_shader_ctx->param_es2gs_offset = num_params++] = i32; + } else { + declare_streamout_params(si_shader_ctx, &shader->selector->so, + params, i32, &num_params); + } + last_sgpr = num_params - 1; + + /* VGPRs */ + params[si_shader_ctx->param_tes_u = num_params++] = f32; + params[si_shader_ctx->param_tes_v = num_params++] = f32; + params[si_shader_ctx->param_tes_rel_patch_id = num_params++] = i32; + params[si_shader_ctx->param_tes_patch_id = num_params++] = i32; + break; + case TGSI_PROCESSOR_GEOMETRY: params[SI_PARAM_GS2VS_OFFSET] = i32; params[SI_PARAM_GS_WAVE_ID] = i32; @@ -2435,12 +3551,35 @@ static void create_function(struct si_shader_context *si_shader_ctx) if (bld_base->info && (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 || - bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0)) - si_shader_ctx->ddxy_lds = + bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 || + bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 || + bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 || + bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 || + 
bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0)) + si_shader_ctx->lds = LLVMAddGlobalInAddressSpace(gallivm->module, LLVMArrayType(i32, 64), "ddxy_lds", LOCAL_ADDR_SPACE); + + if ((si_shader_ctx->type == TGSI_PROCESSOR_VERTEX && shader->key.vs.as_ls) || + si_shader_ctx->type == TGSI_PROCESSOR_TESS_CTRL || + si_shader_ctx->type == TGSI_PROCESSOR_TESS_EVAL) { + /* This is the upper bound, maximum is 32 inputs times 32 vertices */ + unsigned vertex_data_dw_size = 32*32*4; + unsigned patch_data_dw_size = 32*4; + /* The formula is: TCS inputs + TCS outputs + TCS patch outputs. */ + unsigned patch_dw_size = vertex_data_dw_size*2 + patch_data_dw_size; + unsigned lds_dwords = patch_dw_size; + + /* The actual size is computed outside of the shader to reduce + * the number of shader variants. */ + si_shader_ctx->lds = + LLVMAddGlobalInAddressSpace(gallivm->module, + LLVMArrayType(i32, lds_dwords), + "tess_lds", + LOCAL_ADDR_SPACE); + } } static void preload_constants(struct si_shader_context *si_shader_ctx) @@ -2517,9 +3656,13 @@ static void preload_streamout_buffers(struct si_shader_context *si_shader_ctx) struct gallivm_state * gallivm = bld_base->base.gallivm; unsigned i; - if (si_shader_ctx->type != TGSI_PROCESSOR_VERTEX || - si_shader_ctx->shader->key.vs.as_es || - !si_shader_ctx->shader->selector->so.num_outputs) + /* Streamout can only be used if the shader is compiled as VS. */ + if (!si_shader_ctx->shader->selector->so.num_outputs || + (si_shader_ctx->type == TGSI_PROCESSOR_VERTEX && + (si_shader_ctx->shader->key.vs.as_es || + si_shader_ctx->shader->key.vs.as_ls)) || + (si_shader_ctx->type == TGSI_PROCESSOR_TESS_EVAL && + si_shader_ctx->shader->key.tes.as_es)) return; LLVMValueRef buf_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, @@ -2550,6 +3693,8 @@ static void preload_ring_buffers(struct si_shader_context *si_shader_ctx) if ((si_shader_ctx->type == TGSI_PROCESSOR_VERTEX && si_shader_ctx->shader->key.vs.as_es) || + (si_shader_ctx->type == TGSI_PROCESSOR_TESS_EVAL && + si_shader_ctx->shader->key.tes.as_es) || si_shader_ctx->type == TGSI_PROCESSOR_GEOMETRY) { LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_ESGS); @@ -2557,13 +3702,21 @@ static void preload_ring_buffers(struct si_shader_context *si_shader_ctx) build_indexed_load_const(si_shader_ctx, buf_ptr, offset); } - if (si_shader_ctx->type == TGSI_PROCESSOR_GEOMETRY || - si_shader_ctx->shader->is_gs_copy_shader) { + if (si_shader_ctx->shader->is_gs_copy_shader) { LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_GSVS); - si_shader_ctx->gsvs_ring = + si_shader_ctx->gsvs_ring[0] = build_indexed_load_const(si_shader_ctx, buf_ptr, offset); } + if (si_shader_ctx->type == TGSI_PROCESSOR_GEOMETRY) { + int i; + for (i = 0; i < 4; i++) { + LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_GSVS + i); + + si_shader_ctx->gsvs_ring[i] = + build_indexed_load_const(si_shader_ctx, buf_ptr, offset); + } + } } void si_shader_binary_read_config(const struct si_screen *sscreen, @@ -2637,26 +3790,54 @@ void si_shader_apply_scratch_relocs(struct si_context *sctx, } } -int si_shader_binary_read(struct si_screen *sscreen, - struct si_shader *shader, - const struct radeon_shader_binary *binary) +int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader) { + const struct radeon_shader_binary *binary = &shader->binary; + unsigned code_size = binary->code_size + binary->rodata_size; + unsigned char *ptr; + + r600_resource_reference(&shader->bo, NULL); + shader->bo = 
si_resource_create_custom(&sscreen->b.b, + PIPE_USAGE_IMMUTABLE, + code_size); + if (!shader->bo) + return -ENOMEM; + ptr = sscreen->b.ws->buffer_map(shader->bo->cs_buf, NULL, + PIPE_TRANSFER_READ_WRITE); + util_memcpy_cpu_to_le32(ptr, binary->code, binary->code_size); + if (binary->rodata_size > 0) { + ptr += binary->code_size; + util_memcpy_cpu_to_le32(ptr, binary->rodata, + binary->rodata_size); + } + + sscreen->b.ws->buffer_unmap(shader->bo->cs_buf); + return 0; +} + +int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader) +{ + const struct radeon_shader_binary *binary = &shader->binary; unsigned i; - unsigned code_size; - unsigned char *ptr; bool dump = r600_can_dump_shader(&sscreen->b, shader->selector ? shader->selector->tokens : NULL); si_shader_binary_read_config(sscreen, shader, 0); + si_shader_binary_upload(sscreen, shader); if (dump) { - if (!binary->disassembled) { - fprintf(stderr, "SI CODE:\n"); - for (i = 0; i < binary->code_size; i+=4 ) { - fprintf(stderr, "@0x%x: %02x%02x%02x%02x\n", i, binary->code[i + 3], - binary->code[i + 2], binary->code[i + 1], - binary->code[i]); + if (!(sscreen->b.debug_flags & DBG_NO_ASM)) { + if (binary->disasm_string) { + fprintf(stderr, "\nShader Disassembly:\n\n"); + fprintf(stderr, "%s\n", binary->disasm_string); + } else { + fprintf(stderr, "SI CODE:\n"); + for (i = 0; i < binary->code_size; i+=4 ) { + fprintf(stderr, "@0x%x: %02x%02x%02x%02x\n", i, binary->code[i + 3], + binary->code[i + 2], binary->code[i + 1], + binary->code[i]); + } } } @@ -2666,26 +3847,6 @@ int si_shader_binary_read(struct si_screen *sscreen, shader->num_sgprs, shader->num_vgprs, binary->code_size, shader->lds_size, shader->scratch_bytes_per_wave); } - - /* copy new shader */ - code_size = binary->code_size + binary->rodata_size; - r600_resource_reference(&shader->bo, NULL); - shader->bo = si_resource_create_custom(&sscreen->b.b, PIPE_USAGE_IMMUTABLE, - code_size); - if (shader->bo == NULL) { - return -ENOMEM; - } - - - ptr = sscreen->b.ws->buffer_map(shader->bo->cs_buf, NULL, PIPE_TRANSFER_READ_WRITE); - util_memcpy_cpu_to_le32(ptr, binary->code, binary->code_size); - if (binary->rodata_size > 0) { - ptr += binary->code_size; - util_memcpy_cpu_to_le32(ptr, binary->rodata, binary->rodata_size); - } - - sscreen->b.ws->buffer_unmap(shader->bo->cs_buf); - return 0; } @@ -2693,15 +3854,16 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader, LLVMTargetMachineRef tm, LLVMModuleRef mod) { int r = 0; - bool dump = r600_can_dump_shader(&sscreen->b, - shader->selector ? shader->selector->tokens : NULL); - r = radeon_llvm_compile(mod, &shader->binary, - r600_get_llvm_processor_name(sscreen->b.family), dump, tm); + bool dump_asm = r600_can_dump_shader(&sscreen->b, + shader->selector ? 
shader->selector->tokens : NULL); + bool dump_ir = dump_asm && !(sscreen->b.debug_flags & DBG_NO_IR); - if (r) { + r = radeon_llvm_compile(mod, &shader->binary, + r600_get_llvm_processor_name(sscreen->b.family), dump_ir, dump_asm, tm); + if (r) return r; - } - r = si_shader_binary_read(sscreen, shader, &shader->binary); + + r = si_shader_binary_read(sscreen, shader); FREE(shader->binary.config); FREE(shader->binary.rodata); @@ -2709,7 +3871,8 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader, if (shader->scratch_bytes_per_wave == 0) { FREE(shader->binary.code); FREE(shader->binary.relocs); - memset(&shader->binary, 0, sizeof(shader->binary)); + memset(&shader->binary, 0, + offsetof(struct radeon_shader_binary, disasm_string)); } return r; } @@ -2741,7 +3904,7 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen, preload_streamout_buffers(si_shader_ctx); preload_ring_buffers(si_shader_ctx); - args[0] = si_shader_ctx->gsvs_ring; + args[0] = si_shader_ctx->gsvs_ring[0]; args[1] = lp_build_mul_imm(uint, LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, si_shader_ctx->param_vertex_id), @@ -2767,7 +3930,7 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen, outputs[i].values[chan] = LLVMBuildBitCast(gallivm->builder, - build_intrinsic(gallivm->builder, + lp_build_intrinsic(gallivm->builder, "llvm.SI.buffer.load.dword.i32.i32", LLVMInt32TypeInContext(gallivm->context), args, 9, @@ -2807,9 +3970,21 @@ static void si_dump_key(unsigned shader, union si_shader_key *key) fprintf(stderr, "}\n"); if (key->vs.as_es) - fprintf(stderr, " gs_used_inputs = 0x%"PRIx64"\n", - key->vs.gs_used_inputs); + fprintf(stderr, " es_enabled_outputs = 0x%"PRIx64"\n", + key->vs.es_enabled_outputs); fprintf(stderr, " as_es = %u\n", key->vs.as_es); + fprintf(stderr, " as_ls = %u\n", key->vs.as_ls); + break; + + case PIPE_SHADER_TESS_CTRL: + fprintf(stderr, " prim_mode = %u\n", key->tcs.prim_mode); + break; + + case PIPE_SHADER_TESS_EVAL: + if (key->tes.as_es) + fprintf(stderr, " es_enabled_outputs = 0x%"PRIx64"\n", + key->tes.es_enabled_outputs); + fprintf(stderr, " as_es = %u\n", key->tes.as_es); break; case PIPE_SHADER_GEOMETRY: @@ -2851,7 +4026,7 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, /* Dump TGSI code before doing TGSI->LLVM conversion in case the * conversion fails. */ - if (dump) { + if (dump && !(sscreen->b.debug_flags & DBG_NO_TGSI)) { si_dump_key(sel->type, &shader->key); tgsi_dump(tokens, 0); si_dump_streamout(&sel->so); @@ -2873,6 +4048,10 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, bld_base->info = poly_stipple ? 
&stipple_shader_info : &sel->info; bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant; + bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action; + bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action; + bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action; + bld_base->op_actions[TGSI_OPCODE_TEX] = tex_action; bld_base->op_actions[TGSI_OPCODE_TEX2] = tex_action; bld_base->op_actions[TGSI_OPCODE_TXB] = tex_action; @@ -2888,9 +4067,12 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy; bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy; + bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy; + bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy; bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex; bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive; + bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier; if (HAVE_LLVM >= 0x0306) { bld_base->op_actions[TGSI_OPCODE_MAX].emit = build_tgsi_intrinsic_nomem; @@ -2908,11 +4090,25 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, switch (si_shader_ctx.type) { case TGSI_PROCESSOR_VERTEX: si_shader_ctx.radeon_bld.load_input = declare_input_vs; - if (shader->key.vs.as_es) { + if (shader->key.vs.as_ls) + bld_base->emit_epilogue = si_llvm_emit_ls_epilogue; + else if (shader->key.vs.as_es) bld_base->emit_epilogue = si_llvm_emit_es_epilogue; - } else { + else + bld_base->emit_epilogue = si_llvm_emit_vs_epilogue; + break; + case TGSI_PROCESSOR_TESS_CTRL: + bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs; + bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs; + bld_base->emit_store = store_output_tcs; + bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue; + break; + case TGSI_PROCESSOR_TESS_EVAL: + bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes; + if (shader->key.tes.as_es) + bld_base->emit_epilogue = si_llvm_emit_es_epilogue; + else bld_base->emit_epilogue = si_llvm_emit_vs_epilogue; - } break; case TGSI_PROCESSOR_GEOMETRY: bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs; @@ -2946,9 +4142,12 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, preload_ring_buffers(&si_shader_ctx); if (si_shader_ctx.type == TGSI_PROCESSOR_GEOMETRY) { - si_shader_ctx.gs_next_vertex = - lp_build_alloca(bld_base->base.gallivm, - bld_base->uint_bld.elem_type, ""); + int i; + for (i = 0; i < 4; i++) { + si_shader_ctx.gs_next_vertex[i] = + lp_build_alloca(bld_base->base.gallivm, + bld_base->uint_bld.elem_type, ""); + } } if (!lp_build_tgsi_llvm(bld_base, tokens)) { @@ -3000,4 +4199,5 @@ void si_shader_destroy(struct pipe_context *ctx, struct si_shader *shader) FREE(shader->binary.code); FREE(shader->binary.relocs); + FREE(shader->binary.disasm_string); } diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 51055afe36a..cd845c12e64 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -26,6 +26,46 @@ * Christian König <[email protected]> */ +/* How linking tessellation shader inputs and outputs works. + * + * Inputs and outputs between shaders are stored in a buffer. This buffer + * lives in LDS (typical case for tessellation), but it can also live + * in memory. Each input or output has a fixed location within a vertex. 
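+ * A location is one vec4 slot (4 dwords); e.g. an output at location 4
+ * starts at dword 16 of its vertex.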
+ * The highest used input or output determines the stride between vertices. + * + * Since tessellation is only enabled in the OpenGL core profile, + * only these semantics are valid for per-vertex data: + * + * Name Location + * + * POSITION 0 + * PSIZE 1 + * CLIPDIST0..1 2..3 + * CULLDIST0..1 (not implemented) + * GENERIC0..31 4..35 + * + * For example, a shader only writing GENERIC0 has the output stride of 5. + * + * Only these semantics are valid for per-patch data: + * + * Name Location + * + * TESSOUTER 0 + * TESSINNER 1 + * PATCH0..29 2..31 + * + * That's how independent shaders agree on input and output locations. + * The si_shader_io_get_unique_index function assigns the locations. + * + * Other required information for calculating the input and output addresses + * like the vertex stride, the patch stride, and the offsets where per-vertex + * and per-patch data start, is passed to the shader via user data SGPRs. + * The offsets and strides are calculated at draw time and aren't available + * at compile time. + * + * The same approach should be used for linking ES->GS in the future. + */ + #ifndef SI_SHADER_H #define SI_SHADER_H @@ -43,9 +83,16 @@ struct radeon_shader_reloc; #define SI_SGPR_VERTEX_BUFFER 8 /* VS only */ #define SI_SGPR_BASE_VERTEX 10 /* VS only */ #define SI_SGPR_START_INSTANCE 11 /* VS only */ +#define SI_SGPR_LS_OUT_LAYOUT 12 /* VS(LS) only */ +#define SI_SGPR_TCS_OUT_OFFSETS 8 /* TCS & TES only */ +#define SI_SGPR_TCS_OUT_LAYOUT 9 /* TCS & TES only */ +#define SI_SGPR_TCS_IN_LAYOUT 10 /* TCS only */ #define SI_SGPR_ALPHA_REF 8 /* PS only */ #define SI_VS_NUM_USER_SGPR 12 +#define SI_LS_NUM_USER_SGPR 13 +#define SI_TCS_NUM_USER_SGPR 11 +#define SI_TES_NUM_USER_SGPR 10 #define SI_GS_NUM_USER_SGPR 8 #define SI_GSCOPY_NUM_USER_SGPR 4 #define SI_PS_NUM_USER_SGPR 9 @@ -62,8 +109,30 @@ struct radeon_shader_reloc; #define SI_PARAM_START_INSTANCE 6 /* the other VS parameters are assigned dynamically */ -/* ES only parameters */ -#define SI_PARAM_ES2GS_OFFSET 7 +/* Offsets where TCS outputs and TCS patch outputs live in LDS: + * [0:15] = TCS output patch0 offset / 16, max = NUM_PATCHES * 32 * 32 + * [16:31] = TCS output patch0 offset for per-patch / 16, max = NUM_PATCHES*32*32* + 32*32 + */ +#define SI_PARAM_TCS_OUT_OFFSETS 4 /* for TCS & TES */ + +/* Layout of TCS outputs / TES inputs: + * [0:12] = stride between output patches in dwords, num_outputs * num_vertices * 4, max = 32*32*4 + * [13:20] = stride between output vertices in dwords = num_inputs * 4, max = 32*4 + * [26:31] = gl_PatchVerticesIn, max = 32 + */ +#define SI_PARAM_TCS_OUT_LAYOUT 5 /* for TCS & TES */ + +/* Layout of LS outputs / TCS inputs + * [0:12] = stride between patches in dwords = num_inputs * num_vertices * 4, max = 32*32*4 + * [13:20] = stride between vertices in dwords = num_inputs * 4, max = 32*4 + */ +#define SI_PARAM_TCS_IN_LAYOUT 6 /* TCS only */ +#define SI_PARAM_LS_OUT_LAYOUT 7 /* same value as TCS_IN_LAYOUT, LS only */ + +/* TCS only parameters. 
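 * (Note the index overlap with SI_PARAM_LS_OUT_LAYOUT above: both are 7,
 * which is harmless because the LS and TCS parameter lists are never
 * used by the same shader.)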
*/ +#define SI_PARAM_TESS_FACTOR_OFFSET 7 +#define SI_PARAM_PATCH_ID 8 +#define SI_PARAM_REL_IDS 9 /* GS only parameters */ #define SI_PARAM_GS2VS_OFFSET 4 @@ -115,9 +184,25 @@ struct si_shader_selector { unsigned gs_output_prim; unsigned gs_max_out_vertices; - uint64_t gs_used_inputs; /* mask of "get_unique_index" bits */ + unsigned gs_num_invocations; + + /* masks of "get_unique_index" bits */ + uint64_t inputs_read; + uint64_t outputs_written; + uint32_t patch_outputs_written; }; +/* Valid shader configurations: + * + * API shaders VS | TCS | TES | GS |pass| PS + * are compiled as: | | | |thru| + * | | | | | + * Only VS & PS: VS | -- | -- | -- | -- | PS + * With GS: ES | -- | -- | GS | VS | PS + * With Tessel.: LS | HS | VS | -- | -- | PS + * With both: LS | HS | ES | GS | VS | PS + */ + union si_shader_key { struct { unsigned export_16bpc:8; @@ -130,11 +215,25 @@ union si_shader_key { } ps; struct { unsigned instance_divisors[SI_NUM_VERTEX_BUFFERS]; - /* The mask of "get_unique_index" bits, needed for ES, - * it describes how the ES->GS ring buffer is laid out. */ - uint64_t gs_used_inputs; - unsigned as_es:1; + /* Mask of "get_unique_index" bits - which outputs are read + * by the next stage (needed by ES). + * This describes how outputs are laid out in memory. */ + uint64_t es_enabled_outputs; + unsigned as_es:1; /* export shader */ + unsigned as_ls:1; /* local shader */ + unsigned export_prim_id; /* when PS needs it and GS is disabled */ } vs; + struct { + unsigned prim_mode:3; + } tcs; /* tessellation control shader */ + struct { + /* Mask of "get_unique_index" bits - which outputs are read + * by the next stage (needed by ES). + * This describes how outputs are laid out in memory. */ + uint64_t es_enabled_outputs; + unsigned as_es:1; /* export shader */ + unsigned export_prim_id; /* when PS needs it and GS is disabled */ + } tes; /* tessellation evaluation shader */ }; struct si_shader { @@ -161,27 +260,47 @@ struct si_shader { unsigned nparam; unsigned vs_output_param_offset[PIPE_MAX_SHADER_OUTPUTS]; unsigned ps_input_param_offset[PIPE_MAX_SHADER_INPUTS]; - + unsigned ps_input_interpolate[PIPE_MAX_SHADER_INPUTS]; bool uses_instanceid; unsigned nr_pos_exports; + unsigned nr_param_exports; bool is_gs_copy_shader; bool dx10_clamp_mode; /* convert NaNs to 0 */ + + unsigned ls_rsrc1; + unsigned ls_rsrc2; }; static inline struct tgsi_shader_info *si_get_vs_info(struct si_context *sctx) { - return sctx->gs_shader ? 
&sctx->gs_shader->info - : &sctx->vs_shader->info; + if (sctx->gs_shader) + return &sctx->gs_shader->info; + else if (sctx->tes_shader) + return &sctx->tes_shader->info; + else + return &sctx->vs_shader->info; } static inline struct si_shader* si_get_vs_state(struct si_context *sctx) { if (sctx->gs_shader) return sctx->gs_shader->current->gs_copy_shader; + else if (sctx->tes_shader) + return sctx->tes_shader->current; else return sctx->vs_shader->current; } +static inline bool si_vs_exports_prim_id(struct si_shader *shader) +{ + if (shader->selector->type == PIPE_SHADER_VERTEX) + return shader->key.vs.export_prim_id; + else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) + return shader->key.tes.export_prim_id; + else + return false; +} + /* radeonsi_shader.c */ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, struct si_shader *shader); @@ -189,8 +308,8 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader, LLVMTargetMachineRef tm, LLVMModuleRef mod); void si_shader_destroy(struct pipe_context *ctx, struct si_shader *shader); unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index); -int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader, - const struct radeon_shader_binary *binary); +int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader); +int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader); void si_shader_apply_scratch_relocs(struct si_context *sctx, struct si_shader *shader, uint64_t scratch_va); diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 6c18836d189..c923ea7e154 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -61,7 +61,7 @@ unsigned si_array_mode(unsigned mode) uint32_t si_num_banks(struct si_screen *sscreen, struct r600_texture *tex) { - if (sscreen->b.chip_class == CIK && + if (sscreen->b.chip_class >= CIK && sscreen->b.info.cik_macrotile_mode_array_valid) { unsigned index, tileb; @@ -489,11 +489,14 @@ static void si_emit_clip_regs(struct si_context *sctx, struct r600_atom *atom) S_02881C_USE_VTX_POINT_SIZE(info->writes_psize) | S_02881C_USE_VTX_EDGE_FLAG(info->writes_edgeflag) | S_02881C_USE_VTX_RENDER_TARGET_INDX(info->writes_layer) | + S_02881C_USE_VTX_VIEWPORT_INDX(info->writes_viewport_index) | S_02881C_VS_OUT_CCDIST0_VEC_ENA((clipdist_mask & 0x0F) != 0) | S_02881C_VS_OUT_CCDIST1_VEC_ENA((clipdist_mask & 0xF0) != 0) | S_02881C_VS_OUT_MISC_VEC_ENA(info->writes_psize || info->writes_edgeflag || - info->writes_layer) | + info->writes_layer || + info->writes_viewport_index) | + S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(1) | (sctx->queued.named.rasterizer->clip_plane_enable & clipdist_mask)); r600_write_context_reg(cs, R_028810_PA_CL_CLIP_CNTL, @@ -509,20 +512,26 @@ static void si_set_scissor_states(struct pipe_context *ctx, const struct pipe_scissor_state *state) { struct si_context *sctx = (struct si_context *)ctx; - struct si_state_scissor *scissor = CALLOC_STRUCT(si_state_scissor); - struct si_pm4_state *pm4 = &scissor->pm4; - - if (scissor == NULL) - return; + struct si_state_scissor *scissor; + struct si_pm4_state *pm4; + int i; - scissor->scissor = *state; - si_pm4_set_reg(pm4, R_028250_PA_SC_VPORT_SCISSOR_0_TL, - S_028250_TL_X(state->minx) | S_028250_TL_Y(state->miny) | - S_028250_WINDOW_OFFSET_DISABLE(1)); - si_pm4_set_reg(pm4, R_028254_PA_SC_VPORT_SCISSOR_0_BR, - S_028254_BR_X(state->maxx) | S_028254_BR_Y(state->maxy)); + for 
(i = start_slot; i < start_slot + num_scissors; i++) { + int idx = i - start_slot; + int offset = i * 4 * 2; - si_pm4_set_state(sctx, scissor, scissor); + scissor = CALLOC_STRUCT(si_state_scissor); + if (scissor == NULL) + return; + pm4 = &scissor->pm4; + scissor->scissor = state[idx]; + si_pm4_set_reg(pm4, R_028250_PA_SC_VPORT_SCISSOR_0_TL + offset, + S_028250_TL_X(state[idx].minx) | S_028250_TL_Y(state[idx].miny) | + S_028250_WINDOW_OFFSET_DISABLE(1)); + si_pm4_set_reg(pm4, R_028254_PA_SC_VPORT_SCISSOR_0_BR + offset, + S_028254_BR_X(state[idx].maxx) | S_028254_BR_Y(state[idx].maxy)); + si_pm4_set_state(sctx, scissor[i], scissor); + } } static void si_set_viewport_states(struct pipe_context *ctx, @@ -531,21 +540,29 @@ static void si_set_viewport_states(struct pipe_context *ctx, const struct pipe_viewport_state *state) { struct si_context *sctx = (struct si_context *)ctx; - struct si_state_viewport *viewport = CALLOC_STRUCT(si_state_viewport); - struct si_pm4_state *pm4 = &viewport->pm4; + struct si_state_viewport *viewport; + struct si_pm4_state *pm4; + int i; - if (viewport == NULL) - return; + for (i = start_slot; i < start_slot + num_viewports; i++) { + int idx = i - start_slot; + int offset = i * 4 * 6; - viewport->viewport = *state; - si_pm4_set_reg(pm4, R_02843C_PA_CL_VPORT_XSCALE_0, fui(state->scale[0])); - si_pm4_set_reg(pm4, R_028440_PA_CL_VPORT_XOFFSET_0, fui(state->translate[0])); - si_pm4_set_reg(pm4, R_028444_PA_CL_VPORT_YSCALE_0, fui(state->scale[1])); - si_pm4_set_reg(pm4, R_028448_PA_CL_VPORT_YOFFSET_0, fui(state->translate[1])); - si_pm4_set_reg(pm4, R_02844C_PA_CL_VPORT_ZSCALE_0, fui(state->scale[2])); - si_pm4_set_reg(pm4, R_028450_PA_CL_VPORT_ZOFFSET_0, fui(state->translate[2])); + viewport = CALLOC_STRUCT(si_state_viewport); + if (!viewport) + return; + pm4 = &viewport->pm4; + + viewport->viewport = state[idx]; + si_pm4_set_reg(pm4, R_02843C_PA_CL_VPORT_XSCALE + offset, fui(state[idx].scale[0])); + si_pm4_set_reg(pm4, R_028440_PA_CL_VPORT_XOFFSET + offset, fui(state[idx].translate[0])); + si_pm4_set_reg(pm4, R_028444_PA_CL_VPORT_YSCALE + offset, fui(state[idx].scale[1])); + si_pm4_set_reg(pm4, R_028448_PA_CL_VPORT_YOFFSET + offset, fui(state[idx].translate[1])); + si_pm4_set_reg(pm4, R_02844C_PA_CL_VPORT_ZSCALE + offset, fui(state[idx].scale[2])); + si_pm4_set_reg(pm4, R_028450_PA_CL_VPORT_ZOFFSET + offset, fui(state[idx].translate[2])); - si_pm4_set_state(sctx, viewport, viewport); + si_pm4_set_state(sctx, viewport[i], viewport); + } } /* @@ -649,7 +666,7 @@ static void *si_create_rs_state(struct pipe_context *ctx, /* offset */ rs->offset_units = state->offset_units; - rs->offset_scale = state->offset_scale * 12.0f; + rs->offset_scale = state->offset_scale * 16.0f; si_pm4_set_reg(pm4, R_0286D4_SPI_INTERP_CONTROL_0, S_0286D4_FLAT_SHADE_ENA(1) | @@ -718,12 +735,12 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state) if (sctx->framebuffer.nr_samples > 1 && (!old_rs || old_rs->multisample_enable != rs->multisample_enable)) - sctx->db_render_state.dirty = true; + si_mark_atom_dirty(sctx, &sctx->db_render_state); si_pm4_bind_state(sctx, rasterizer, rs); si_update_fb_rs_state(sctx); - sctx->clip_regs.dirty = true; + si_mark_atom_dirty(sctx, &sctx->clip_regs); } static void si_delete_rs_state(struct pipe_context *ctx, void *state) @@ -821,7 +838,8 @@ static void *si_create_dsa_state(struct pipe_context *ctx, db_depth_control = S_028800_Z_ENABLE(state->depth.enabled) | S_028800_Z_WRITE_ENABLE(state->depth.writemask) | - S_028800_ZFUNC(state->depth.func); + 
S_028800_ZFUNC(state->depth.func) | + S_028800_DEPTH_BOUNDS_ENABLE(state->depth.bounds_test); /* stencil */ if (state->stencil[0].enabled) { @@ -850,9 +868,12 @@ static void *si_create_dsa_state(struct pipe_context *ctx, dsa->alpha_func = PIPE_FUNC_ALWAYS; } - /* misc */ si_pm4_set_reg(pm4, R_028800_DB_DEPTH_CONTROL, db_depth_control); si_pm4_set_reg(pm4, R_02842C_DB_STENCIL_CONTROL, db_stencil_control); + if (state->depth.bounds_test) { + si_pm4_set_reg(pm4, R_028020_DB_DEPTH_BOUNDS_MIN, fui(state->depth.bounds_min)); + si_pm4_set_reg(pm4, R_028024_DB_DEPTH_BOUNDS_MAX, fui(state->depth.bounds_max)); + } return dsa; } @@ -888,7 +909,7 @@ static void si_set_occlusion_query_state(struct pipe_context *ctx, bool enable) { struct si_context *sctx = (struct si_context*)ctx; - sctx->db_render_state.dirty = true; + si_mark_atom_dirty(sctx, &sctx->db_render_state); } static void si_emit_db_render_state(struct si_context *sctx, struct r600_atom *state) @@ -1157,7 +1178,9 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen, int first_non_void) { struct si_screen *sscreen = (struct si_screen*)screen; - bool enable_s3tc = sscreen->b.info.drm_minor >= 31; + bool enable_compressed_formats = (sscreen->b.info.drm_major == 2 && + sscreen->b.info.drm_minor >= 31) || + sscreen->b.info.drm_major == 3; boolean uniform = TRUE; int i; @@ -1200,7 +1223,7 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen, } if (desc->layout == UTIL_FORMAT_LAYOUT_RGTC) { - if (!enable_s3tc) + if (!enable_compressed_formats) goto out_unknown; switch (format) { @@ -1220,7 +1243,7 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen, } if (desc->layout == UTIL_FORMAT_LAYOUT_BPTC) { - if (!enable_s3tc) + if (!enable_compressed_formats) goto out_unknown; switch (format) { @@ -1249,8 +1272,7 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen, } if (desc->layout == UTIL_FORMAT_LAYOUT_S3TC) { - - if (!enable_s3tc) + if (!enable_compressed_formats) goto out_unknown; if (!util_format_s3tc_enabled) { @@ -1606,7 +1628,6 @@ boolean si_is_format_supported(struct pipe_screen *screen, unsigned sample_count, unsigned usage) { - struct si_screen *sscreen = (struct si_screen *)screen; unsigned retval = 0; if (target >= PIPE_MAX_TEXTURE_TYPES) { @@ -1618,8 +1639,7 @@ boolean si_is_format_supported(struct pipe_screen *screen, return FALSE; if (sample_count > 1) { - /* 2D tiling on CIK is supported since DRM 2.35.0 */ - if (sscreen->b.chip_class >= CIK && sscreen->b.info.drm_minor < 35) + if (!screen->get_param(screen, PIPE_CAP_TEXTURE_MULTISAMPLE)) return FALSE; switch (sample_count) { @@ -1826,6 +1846,9 @@ static void si_initialize_color_surface(struct si_context *sctx, surf->cb_color_info = color_info; surf->cb_color_attrib = color_attrib; + if (sctx->b.chip_class >= VI) + surf->cb_dcc_control = S_028C78_OVERWRITE_COMBINER_DISABLE(1); + if (rtex->fmask.size) { surf->cb_color_fmask = (offset + rtex->fmask.offset) >> 8; surf->cb_color_fmask_slice = S_028C88_TILE_MAX(rtex->fmask.slice_tile_max); @@ -2023,7 +2046,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, util_format_is_pure_integer(state->cbufs[0]->format); if (sctx->framebuffer.cb0_is_integer != old_cb0_is_integer) - sctx->db_render_state.dirty = true; + si_mark_atom_dirty(sctx, &sctx->db_render_state); for (i = 0; i < state->nr_cbufs; i++) { if (!state->cbufs[i]) @@ -2043,6 +2066,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, if (rtex->fmask.size && rtex->cmask.size) { 
sctx->framebuffer.compressed_cb_mask |= 1 << i; } + r600_context_add_resource_size(ctx, surf->base.texture); } /* Set the 16BPC export for possible dual-src blending. */ if (i == 1 && surf && surf->export_16bpc) { @@ -2057,20 +2081,21 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, if (!surf->depth_initialized) { si_init_depth_surface(sctx, surf); } + r600_context_add_resource_size(ctx, surf->base.texture); } si_update_fb_rs_state(sctx); si_update_fb_blend_state(sctx); - sctx->framebuffer.atom.num_dw = state->nr_cbufs*15 + (8 - state->nr_cbufs)*3; + sctx->framebuffer.atom.num_dw = state->nr_cbufs*16 + (8 - state->nr_cbufs)*3; sctx->framebuffer.atom.num_dw += state->zsbuf ? 26 : 4; sctx->framebuffer.atom.num_dw += 3; /* WINDOW_SCISSOR_BR */ sctx->framebuffer.atom.num_dw += 18; /* MSAA sample locations */ - sctx->framebuffer.atom.dirty = true; + si_mark_atom_dirty(sctx, &sctx->framebuffer.atom); if (sctx->framebuffer.nr_samples != old_nr_samples) { - sctx->msaa_config.dirty = true; - sctx->db_render_state.dirty = true; + si_mark_atom_dirty(sctx, &sctx->msaa_config); + si_mark_atom_dirty(sctx, &sctx->db_render_state); /* Set sample locations as fragment shader constants. */ switch (sctx->framebuffer.nr_samples) { @@ -2107,7 +2132,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, old_nr_samples != SI_NUM_SMOOTH_AA_SAMPLES) && (sctx->framebuffer.nr_samples != SI_NUM_SMOOTH_AA_SAMPLES || old_nr_samples != 1)) - sctx->msaa_sample_locs.dirty = true; + si_mark_atom_dirty(sctx, &sctx->msaa_sample_locs); } } @@ -2141,20 +2166,24 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom RADEON_PRIO_COLOR_META); } - r600_write_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 13); + r600_write_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, + sctx->b.chip_class >= VI ? 14 : 13); radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */ radeon_emit(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */ radeon_emit(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */ radeon_emit(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */ radeon_emit(cs, cb->cb_color_info | tex->cb_color_info); /* R_028C70_CB_COLOR0_INFO */ radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */ - radeon_emit(cs, 0); /* R_028C78 unused */ + radeon_emit(cs, cb->cb_dcc_control); /* R_028C78_CB_COLOR0_DCC_CONTROL */ radeon_emit(cs, tex->cmask.base_address_reg); /* R_028C7C_CB_COLOR0_CMASK */ radeon_emit(cs, tex->cmask.slice_tile_max); /* R_028C80_CB_COLOR0_CMASK_SLICE */ radeon_emit(cs, cb->cb_color_fmask); /* R_028C84_CB_COLOR0_FMASK */ radeon_emit(cs, cb->cb_color_fmask_slice); /* R_028C88_CB_COLOR0_FMASK_SLICE */ radeon_emit(cs, tex->color_clear_value[0]); /* R_028C8C_CB_COLOR0_CLEAR_WORD0 */ radeon_emit(cs, tex->color_clear_value[1]); /* R_028C90_CB_COLOR0_CLEAR_WORD1 */ + + if (sctx->b.chip_class >= VI) + radeon_emit(cs, 0); /* R_028C94_CB_COLOR0_DCC_BASE */ } /* set CB_COLOR1_INFO for possible dual-src blending */ if (i == 1 && state->cbufs[0]) { @@ -2249,22 +2278,35 @@ static void si_set_min_samples(struct pipe_context *ctx, unsigned min_samples) sctx->ps_iter_samples = min_samples; if (sctx->framebuffer.nr_samples > 1) - sctx->msaa_config.dirty = true; + si_mark_atom_dirty(sctx, &sctx->msaa_config); } /* * Samplers */ -static struct pipe_sampler_view *si_create_sampler_view(struct pipe_context *ctx, - struct pipe_resource *texture, - const struct pipe_sampler_view *state) +/** + * Create a sampler view. 
+ * + * @param ctx context + * @param texture texture + * @param state sampler view template + * @param width0 width0 override (for compressed textures as int) + * @param height0 height0 override (for compressed textures as int) + * @param force_level set the base address to the level (for compressed textures) + */ +struct pipe_sampler_view * +si_create_sampler_view_custom(struct pipe_context *ctx, + struct pipe_resource *texture, + const struct pipe_sampler_view *state, + unsigned width0, unsigned height0, + unsigned force_level) { struct si_context *sctx = (struct si_context*)ctx; struct si_sampler_view *view = CALLOC_STRUCT(si_sampler_view); struct r600_texture *tmp = (struct r600_texture*)texture; const struct util_format_description *desc; - unsigned format, num_format; + unsigned format, num_format, base_level, first_level, last_level; uint32_t pitch = 0; unsigned char state_swizzle[4], swizzle[4]; unsigned height, depth, width; @@ -2297,7 +2339,7 @@ static struct pipe_sampler_view *si_create_sampler_view(struct pipe_context *ctx /* Buffer resource. */ if (texture->target == PIPE_BUFFER) { - unsigned stride; + unsigned stride, num_records; desc = util_format_description(state->format); first_non_void = util_format_get_first_non_void_channel(state->format); @@ -2306,10 +2348,16 @@ static struct pipe_sampler_view *si_create_sampler_view(struct pipe_context *ctx format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void); num_format = si_translate_buffer_numformat(ctx->screen, desc, first_non_void); + num_records = state->u.buf.last_element + 1 - state->u.buf.first_element; + num_records = MIN2(num_records, texture->width0 / stride); + + if (sctx->b.chip_class >= VI) + num_records *= stride; + view->state[4] = va; view->state[5] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride); - view->state[6] = state->u.buf.last_element + 1 - state->u.buf.first_element; + view->state[6] = num_records; view->state[7] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) | S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) | S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) | @@ -2437,13 +2485,25 @@ static struct pipe_sampler_view *si_create_sampler_view(struct pipe_context *ctx format = 0; } - /* not supported any more */ - //endian = si_colorformat_endian_swap(format); + base_level = 0; + first_level = state->u.tex.first_level; + last_level = state->u.tex.last_level; + width = width0; + height = height0; + depth = texture->depth0; - width = surflevel[0].npix_x; - height = surflevel[0].npix_y; - depth = surflevel[0].npix_z; - pitch = surflevel[0].nblk_x * util_format_get_blockwidth(pipe_format); + if (force_level) { + assert(force_level == first_level && + force_level == last_level); + base_level = force_level; + first_level = 0; + last_level = 0; + width = u_minify(width, force_level); + height = u_minify(height, force_level); + depth = u_minify(depth, force_level); + } + + pitch = surflevel[base_level].nblk_x * util_format_get_blockwidth(pipe_format); if (texture->target == PIPE_TEXTURE_1D_ARRAY) { height = 1; @@ -2453,8 +2513,7 @@ static struct pipe_sampler_view *si_create_sampler_view(struct pipe_context *ctx } else if (texture->target == PIPE_TEXTURE_CUBE_ARRAY) depth = texture->array_size / 6; - va = tmp->resource.gpu_address + surflevel[0].offset; - va += tmp->mipmap_shift * surflevel[texture->last_level].slice_size * tmp->surface.array_size; + va = tmp->resource.gpu_address + surflevel[base_level].offset; view->state[0] = va >> 8; view->state[1] = 
(S_008F14_BASE_ADDRESS_HI(va >> 40) | @@ -2467,11 +2526,11 @@ static struct pipe_sampler_view *si_create_sampler_view(struct pipe_context *ctx S_008F1C_DST_SEL_Z(si_map_swizzle(swizzle[2])) | S_008F1C_DST_SEL_W(si_map_swizzle(swizzle[3])) | S_008F1C_BASE_LEVEL(texture->nr_samples > 1 ? - 0 : state->u.tex.first_level - tmp->mipmap_shift) | + 0 : first_level) | S_008F1C_LAST_LEVEL(texture->nr_samples > 1 ? util_logbase2(texture->nr_samples) : - state->u.tex.last_level - tmp->mipmap_shift) | - S_008F1C_TILING_INDEX(si_tile_mode_index(tmp, 0, false)) | + last_level) | + S_008F1C_TILING_INDEX(si_tile_mode_index(tmp, base_level, false)) | S_008F1C_POW2_PAD(texture->last_level > 0) | S_008F1C_TYPE(si_tex_dim(texture->target, texture->nr_samples))); view->state[4] = (S_008F20_DEPTH(depth - 1) | S_008F20_PITCH(pitch - 1)); @@ -2523,6 +2582,16 @@ static struct pipe_sampler_view *si_create_sampler_view(struct pipe_context *ctx return &view->base; } +static struct pipe_sampler_view * +si_create_sampler_view(struct pipe_context *ctx, + struct pipe_resource *texture, + const struct pipe_sampler_view *state) +{ + return si_create_sampler_view_custom(ctx, texture, state, + texture ? texture->width0 : 0, + texture ? texture->height0 : 0, 0); +} + static void si_sampler_view_destroy(struct pipe_context *ctx, struct pipe_sampler_view *state) { @@ -2765,6 +2834,7 @@ static void si_set_vertex_buffers(struct pipe_context *ctx, pipe_resource_reference(&dsti->buffer, src->buffer); dsti->buffer_offset = src->buffer_offset; dsti->stride = src->stride; + r600_context_add_resource_size(ctx, src->buffer); } } else { for (i = 0; i < count; i++) { @@ -2782,6 +2852,7 @@ static void si_set_index_buffer(struct pipe_context *ctx, if (ib) { pipe_resource_reference(&sctx->index_buffer.buffer, ib->buffer); memcpy(&sctx->index_buffer, ib, sizeof(*ib)); + r600_context_add_resource_size(ctx, ib->buffer); } else { pipe_resource_reference(&sctx->index_buffer.buffer, NULL); } @@ -2845,6 +2916,30 @@ static void si_set_polygon_stipple(struct pipe_context *ctx, } } +static void si_set_tess_state(struct pipe_context *ctx, + const float default_outer_level[4], + const float default_inner_level[2]) +{ + struct si_context *sctx = (struct si_context *)ctx; + struct pipe_constant_buffer cb; + float array[8]; + + memcpy(array, default_outer_level, sizeof(float) * 4); + memcpy(array+4, default_inner_level, sizeof(float) * 2); + + cb.buffer = NULL; + cb.user_buffer = NULL; + cb.buffer_size = sizeof(array); + + si_upload_const_buffer(sctx, (struct r600_resource**)&cb.buffer, + (void*)array, sizeof(array), + &cb.buffer_offset); + + ctx->set_constant_buffer(ctx, PIPE_SHADER_TESS_CTRL, + SI_DRIVER_STATE_CONST_BUF, &cb); + pipe_resource_reference(&cb.buffer, NULL); +} + static void si_texture_barrier(struct pipe_context *ctx) { struct si_context *sctx = (struct si_context *)ctx; @@ -2870,6 +2965,8 @@ static void si_need_gfx_cs_space(struct pipe_context *ctx, unsigned num_dw, si_need_cs_space((struct si_context*)ctx, num_dw, include_draw_vbo); } +static void si_init_config(struct si_context *sctx); + void si_init_state_functions(struct si_context *sctx) { si_init_atom(&sctx->framebuffer.atom, &sctx->atoms.s.framebuffer, si_emit_framebuffer_state, 0); @@ -2920,6 +3017,7 @@ void si_init_state_functions(struct si_context *sctx) sctx->b.b.texture_barrier = si_texture_barrier; sctx->b.b.set_polygon_stipple = si_set_polygon_stipple; sctx->b.b.set_min_samples = si_set_min_samples; + sctx->b.b.set_tess_state = si_set_tess_state; 
sctx->b.set_occlusion_query_state = si_set_occlusion_query_state; sctx->b.need_gfx_cs_space = si_need_gfx_cs_space; @@ -2931,24 +3029,31 @@ void si_init_state_functions(struct si_context *sctx) } else { sctx->b.dma_copy = si_dma_copy; } + + si_init_config(sctx); } static void si_write_harvested_raster_configs(struct si_context *sctx, struct si_pm4_state *pm4, - unsigned raster_config) + unsigned raster_config, + unsigned raster_config_1) { unsigned sh_per_se = MAX2(sctx->screen->b.info.max_sh_per_se, 1); unsigned num_se = MAX2(sctx->screen->b.info.max_se, 1); unsigned rb_mask = sctx->screen->b.info.si_backend_enabled_mask; - unsigned num_rb = sctx->screen->b.info.r600_num_backends; - unsigned rb_per_pkr = num_rb / num_se / sh_per_se; + unsigned num_rb = MIN2(sctx->screen->b.info.r600_num_backends, 16); + unsigned rb_per_pkr = MIN2(num_rb / num_se / sh_per_se, 2); unsigned rb_per_se = num_rb / num_se; - unsigned se0_mask = (1 << rb_per_se) - 1; - unsigned se1_mask = se0_mask << rb_per_se; + unsigned se_mask[4]; unsigned se; - assert(num_se == 1 || num_se == 2); + se_mask[0] = ((1 << rb_per_se) - 1) & rb_mask; + se_mask[1] = (se_mask[0] << rb_per_se) & rb_mask; + se_mask[2] = (se_mask[1] << rb_per_se) & rb_mask; + se_mask[3] = (se_mask[2] << rb_per_se) & rb_mask; + + assert(num_se == 1 || num_se == 2 || num_se == 4); assert(sh_per_se == 1 || sh_per_se == 2); assert(rb_per_pkr == 1 || rb_per_pkr == 2); @@ -2956,17 +3061,16 @@ si_write_harvested_raster_configs(struct si_context *sctx, * fields are for, so I'm leaving them as their default * values. */ - se0_mask &= rb_mask; - se1_mask &= rb_mask; - if (num_se == 2 && (!se0_mask || !se1_mask)) { - raster_config &= C_028350_SE_MAP; + if ((num_se > 2) && ((!se_mask[0] && !se_mask[1]) || + (!se_mask[2] && !se_mask[3]))) { + raster_config_1 &= C_028354_SE_PAIR_MAP; - if (!se0_mask) { - raster_config |= - S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_3); + if (!se_mask[0] && !se_mask[1]) { + raster_config_1 |= + S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_3); } else { - raster_config |= - S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_0); + raster_config_1 |= + S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_0); } } @@ -2974,10 +3078,23 @@ si_write_harvested_raster_configs(struct si_context *sctx, unsigned raster_config_se = raster_config; unsigned pkr0_mask = ((1 << rb_per_pkr) - 1) << (se * rb_per_se); unsigned pkr1_mask = pkr0_mask << rb_per_pkr; + int idx = (se / 2) * 2; + + if ((num_se > 1) && (!se_mask[idx] || !se_mask[idx + 1])) { + raster_config_se &= C_028350_SE_MAP; + + if (!se_mask[idx]) { + raster_config_se |= + S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_3); + } else { + raster_config_se |= + S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_0); + } + } pkr0_mask &= rb_mask; pkr1_mask &= rb_mask; - if (sh_per_se == 2 && (!pkr0_mask || !pkr1_mask)) { + if (rb_per_se > 2 && (!pkr0_mask || !pkr1_mask)) { raster_config_se &= C_028350_PKR_MAP; if (!pkr0_mask) { @@ -2989,7 +3106,7 @@ si_write_harvested_raster_configs(struct si_context *sctx, } } - if (rb_per_pkr == 2) { + if (rb_per_se >= 2) { unsigned rb0_mask = 1 << (se * rb_per_se); unsigned rb1_mask = rb0_mask << 1; @@ -3007,7 +3124,7 @@ si_write_harvested_raster_configs(struct si_context *sctx, } } - if (sh_per_se == 2) { + if (rb_per_se > 2) { rb0_mask = 1 << (se * rb_per_se + rb_per_pkr); rb1_mask = rb0_mask << 1; rb0_mask &= rb_mask; @@ -3026,19 +3143,28 @@ si_write_harvested_raster_configs(struct si_context *sctx, } } - si_pm4_set_reg(pm4, GRBM_GFX_INDEX, - 
SE_INDEX(se) | SH_BROADCAST_WRITES | - INSTANCE_BROADCAST_WRITES); + /* GRBM_GFX_INDEX is privileged on VI */ + if (sctx->b.chip_class <= CIK) + si_pm4_set_reg(pm4, GRBM_GFX_INDEX, + SE_INDEX(se) | SH_BROADCAST_WRITES | + INSTANCE_BROADCAST_WRITES); si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, raster_config_se); + if (sctx->b.chip_class >= CIK) + si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, raster_config_1); } - si_pm4_set_reg(pm4, GRBM_GFX_INDEX, - SE_BROADCAST_WRITES | SH_BROADCAST_WRITES | - INSTANCE_BROADCAST_WRITES); + /* GRBM_GFX_INDEX is privileged on VI */ + if (sctx->b.chip_class <= CIK) + si_pm4_set_reg(pm4, GRBM_GFX_INDEX, + SE_BROADCAST_WRITES | SH_BROADCAST_WRITES | + INSTANCE_BROADCAST_WRITES); } -void si_init_config(struct si_context *sctx) +static void si_init_config(struct si_context *sctx) { + unsigned num_rb = MIN2(sctx->screen->b.info.r600_num_backends, 16); + unsigned rb_mask = sctx->screen->b.info.si_backend_enabled_mask; + unsigned raster_config, raster_config_1; struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state); if (pm4 == NULL) @@ -3046,24 +3172,18 @@ void si_init_config(struct si_context *sctx) si_cmd_context_control(pm4); - si_pm4_set_reg(pm4, R_028A18_VGT_HOS_MAX_TESS_LEVEL, 0x0); - si_pm4_set_reg(pm4, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, 0x0); + si_pm4_set_reg(pm4, R_028A18_VGT_HOS_MAX_TESS_LEVEL, fui(64)); + si_pm4_set_reg(pm4, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, fui(0)); /* FIXME calculate these values somehow ??? */ si_pm4_set_reg(pm4, R_028A54_VGT_GS_PER_ES, 0x80); si_pm4_set_reg(pm4, R_028A58_VGT_ES_PER_GS, 0x40); si_pm4_set_reg(pm4, R_028A5C_VGT_GS_PER_VS, 0x2); - si_pm4_set_reg(pm4, R_028A84_VGT_PRIMITIVEID_EN, 0x0); si_pm4_set_reg(pm4, R_028A8C_VGT_PRIMITIVEID_RESET, 0x0); si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0); si_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0); - si_pm4_set_reg(pm4, R_028B60_VGT_GS_VERT_ITEMSIZE_1, 0); - si_pm4_set_reg(pm4, R_028B64_VGT_GS_VERT_ITEMSIZE_2, 0); - si_pm4_set_reg(pm4, R_028B68_VGT_GS_VERT_ITEMSIZE_3, 0); - si_pm4_set_reg(pm4, R_028B90_VGT_GS_INSTANCE_CNT, 0); - si_pm4_set_reg(pm4, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0); si_pm4_set_reg(pm4, R_028AB4_VGT_REUSE_OFF, 0); si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0); @@ -3076,62 +3196,78 @@ void si_init_config(struct si_context *sctx) si_pm4_set_reg(pm4, R_02882C_PA_SU_PRIM_FILTER_CNTL, 0); - if (sctx->b.chip_class >= CIK) { - switch (sctx->screen->b.family) { - case CHIP_BONAIRE: - si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, 0x16000012); - si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, 0); - break; - case CHIP_HAWAII: - si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, 0x3a00161a); - si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, 0x0000002e); - break; - case CHIP_KAVERI: - /* XXX todo */ - case CHIP_KABINI: - /* XXX todo */ - case CHIP_MULLINS: - /* XXX todo */ - default: - si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, 0); - si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, 0); - break; - } - } else { - unsigned rb_mask = sctx->screen->b.info.si_backend_enabled_mask; - unsigned num_rb = sctx->screen->b.info.r600_num_backends; - unsigned raster_config; - - switch (sctx->screen->b.family) { - case CHIP_TAHITI: - case CHIP_PITCAIRN: - raster_config = 0x2a00126a; - break; - case CHIP_VERDE: - raster_config = 0x0000124a; - break; - case CHIP_OLAND: - raster_config = 0x00000082; - break; - case CHIP_HAINAN: - raster_config = 0; - break; - default: - fprintf(stderr, - "radeonsi: Unknown GPU, using 0 for raster_config\n"); 
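/* [Editor's sketch, not part of the patch] si_write_harvested_raster_configs
 * above replaces the old two-SE se0_mask/se1_mask pair with a four-entry
 * se_mask[] so that 4-SE parts can be harvested too.  Each entry is the
 * previous one shifted up by rb_per_se and re-masked against the global
 * rb_mask, so a fully harvested engine yields 0.  A standalone restatement
 * of just that step (the helper name is hypothetical):
 */
static void si_sketch_se_masks(unsigned rb_mask, unsigned rb_per_se,
			       unsigned se_mask[4])
{
	se_mask[0] = ((1u << rb_per_se) - 1) & rb_mask;
	for (unsigned se = 1; se < 4; se++)
		se_mask[se] = (se_mask[se - 1] << rb_per_se) & rb_mask;
}
/* E.g. rb_mask = 0xffff with rb_per_se = 4 gives 0x000f, 0x00f0, 0x0f00 and
 * 0xf000 for the four shader engines. */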
- raster_config = 0; - break; - } + switch (sctx->screen->b.family) { + case CHIP_TAHITI: + case CHIP_PITCAIRN: + raster_config = 0x2a00126a; + raster_config_1 = 0x00000000; + break; + case CHIP_VERDE: + raster_config = 0x0000124a; + raster_config_1 = 0x00000000; + break; + case CHIP_OLAND: + raster_config = 0x00000082; + raster_config_1 = 0x00000000; + break; + case CHIP_HAINAN: + raster_config = 0x00000000; + raster_config_1 = 0x00000000; + break; + case CHIP_BONAIRE: + raster_config = 0x16000012; + raster_config_1 = 0x00000000; + break; + case CHIP_HAWAII: + raster_config = 0x3a00161a; + raster_config_1 = 0x0000002e; + break; + case CHIP_FIJI: + /* Fiji should be same as Hawaii, but that causes corruption in some cases */ + raster_config = 0x16000012; /* 0x3a00161a */ + raster_config_1 = 0x0000002a; /* 0x0000002e */ + break; + case CHIP_TONGA: + raster_config = 0x16000012; + raster_config_1 = 0x0000002a; + break; + case CHIP_ICELAND: + raster_config = 0x00000002; + raster_config_1 = 0x00000000; + break; + case CHIP_CARRIZO: + raster_config = 0x00000002; + raster_config_1 = 0x00000000; + break; + case CHIP_KAVERI: + /* KV should be 0x00000002, but that causes problems with radeon */ + raster_config = 0x00000000; /* 0x00000002 */ + raster_config_1 = 0x00000000; + break; + case CHIP_KABINI: + case CHIP_MULLINS: + raster_config = 0x00000000; + raster_config_1 = 0x00000000; + break; + default: + fprintf(stderr, + "radeonsi: Unknown GPU, using 0 for raster_config\n"); + raster_config = 0x00000000; + raster_config_1 = 0x00000000; + break; + } - /* Always use the default config when all backends are enabled - * (or when we failed to determine the enabled backends). - */ - if (!rb_mask || util_bitcount(rb_mask) >= num_rb) { - si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, - raster_config); - } else { - si_write_harvested_raster_configs(sctx, pm4, raster_config); - } + /* Always use the default config when all backends are enabled + * (or when we failed to determine the enabled backends). 
+ */ + if (!rb_mask || util_bitcount(rb_mask) >= num_rb) { + si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, + raster_config); + if (sctx->b.chip_class >= CIK) + si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, + raster_config_1); + } else { + si_write_harvested_raster_configs(sctx, pm4, raster_config, raster_config_1); } si_pm4_set_reg(pm4, R_028204_PA_SC_WINDOW_SCISSOR_TL, S_028204_WINDOW_OFFSET_DISABLE(1)); @@ -3153,8 +3289,6 @@ void si_init_config(struct si_context *sctx) si_pm4_set_reg(pm4, R_028BEC_PA_CL_GB_VERT_DISC_ADJ, fui(1.0)); si_pm4_set_reg(pm4, R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ, fui(1.0)); si_pm4_set_reg(pm4, R_028BF4_PA_CL_GB_HORZ_DISC_ADJ, fui(1.0)); - si_pm4_set_reg(pm4, R_028020_DB_DEPTH_BOUNDS_MIN, 0); - si_pm4_set_reg(pm4, R_028024_DB_DEPTH_BOUNDS_MAX, 0); si_pm4_set_reg(pm4, R_028028_DB_STENCIL_CLEAR, 0); si_pm4_set_reg(pm4, R_028AC0_DB_SRESULTS_COMPARE_STATE0, 0x0); si_pm4_set_reg(pm4, R_028AC4_DB_SRESULTS_COMPARE_STATE1, 0x0); @@ -3173,10 +3307,21 @@ void si_init_config(struct si_context *sctx) si_pm4_set_reg(pm4, R_028408_VGT_INDX_OFFSET, 0); if (sctx->b.chip_class >= CIK) { + si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, S_00B51C_CU_EN(0xfffc)); + si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, 0); + si_pm4_set_reg(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES, S_00B31C_CU_EN(0xfffe)); + si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, S_00B21C_CU_EN(0xffff)); si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS, S_00B118_CU_EN(0xffff)); si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(0)); si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, S_00B01C_CU_EN(0xffff)); } + if (sctx->b.chip_class >= VI) { + si_pm4_set_reg(pm4, R_028424_CB_DCC_CONTROL, + S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(1)); + si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 30); + si_pm4_set_reg(pm4, R_028C5C_VGT_OUT_DEALLOC_CNTL, 32); + } + sctx->init_config = pm4; } diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 5e68b162137..b8f63c5dd36 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -30,6 +30,8 @@ #include "si_pm4.h" #include "radeon/r600_pipe_common.h" +#define SI_NUM_SHADERS (PIPE_SHADER_TESS_EVAL+1) + struct si_screen; struct si_shader; @@ -92,18 +94,21 @@ union si_state { struct si_pm4_state *blend_color; struct si_pm4_state *clip; struct si_state_sample_mask *sample_mask; - struct si_state_scissor *scissor; - struct si_state_viewport *viewport; + struct si_state_scissor *scissor[16]; + struct si_state_viewport *viewport[16]; struct si_state_rasterizer *rasterizer; struct si_state_dsa *dsa; struct si_pm4_state *fb_rs; struct si_pm4_state *fb_blend; struct si_pm4_state *dsa_stencil_ref; struct si_pm4_state *ta_bordercolor_base; + struct si_pm4_state *ls; + struct si_pm4_state *hs; struct si_pm4_state *es; struct si_pm4_state *gs; struct si_pm4_state *gs_rings; - struct si_pm4_state *gs_onoff; + struct si_pm4_state *tf_ring; + struct si_pm4_state *vgt_shader_config; struct si_pm4_state *vs; struct si_pm4_state *ps; struct si_pm4_state *spi; @@ -111,6 +116,11 @@ union si_state { struct si_pm4_state *array[0]; }; +struct si_shader_data { + struct r600_atom atom; + uint32_t sh_base[SI_NUM_SHADERS]; +}; + #define SI_NUM_USER_SAMPLERS 16 /* AKA OpenGL textures units per shader */ #define SI_POLY_STIPPLE_SAMPLER SI_NUM_USER_SAMPLERS #define SI_NUM_SAMPLERS (SI_POLY_STIPPLE_SAMPLER + 1) @@ -135,68 +145,61 @@ union si_state { * Ring 
buffers: 0..1 * Streamout buffers: 2..5 */ -#define SI_RING_ESGS 0 -#define SI_RING_GSVS 1 -#define SI_NUM_RING_BUFFERS 2 +#define SI_RING_TESS_FACTOR 0 /* for HS (TCS) */ +#define SI_RING_ESGS 0 /* for ES, GS */ +#define SI_RING_GSVS 1 /* for GS, VS */ +#define SI_RING_GSVS_1 2 /* 1, 2, 3 for GS */ +#define SI_RING_GSVS_2 3 +#define SI_RING_GSVS_3 4 +#define SI_NUM_RING_BUFFERS 5 #define SI_SO_BUF_OFFSET SI_NUM_RING_BUFFERS #define SI_NUM_RW_BUFFERS (SI_SO_BUF_OFFSET + 4) #define SI_NUM_VERTEX_BUFFERS 16 -/* This represents resource descriptors in memory, such as buffer resources, +/* This represents descriptors in memory, such as buffer resources, * image resources, and sampler states. */ struct si_descriptors { - struct r600_atom atom; - - /* The size of one resource descriptor. */ + /* The list of descriptors in malloc'd memory. */ + uint32_t *list; + /* The size of one descriptor. */ unsigned element_dw_size; - /* The maximum number of resource descriptors. */ + /* The maximum number of descriptors. */ unsigned num_elements; + /* Whether the list has been changed and should be re-uploaded. */ + bool list_dirty; - /* The buffer where resource descriptors are stored. */ + /* The buffer where the descriptors have been uploaded. */ struct r600_resource *buffer; unsigned buffer_offset; - /* The i-th bit is set if that element is dirty (changed but not emitted). */ - uint64_t dirty_mask; /* The i-th bit is set if that element is enabled (non-NULL resource). */ uint64_t enabled_mask; - /* We can't update descriptors directly because the GPU might be - * reading them at the same time, so we have to update them - * in a copy-on-write manner. Each such copy is called a context, - * which is just another array descriptors in the same buffer. */ - unsigned current_context_id; - /* The size of a context, should be equal to 4*element_dw_size*num_elements. */ - unsigned context_size; - - /* The shader userdata register where the 64-bit pointer to the descriptor + /* The shader userdata offset within a shader where the 64-bit pointer to the descriptor * array will be stored. */ - unsigned shader_userdata_reg; + unsigned shader_userdata_offset; + /* Whether the pointer should be re-emitted. 
*/ + bool pointer_dirty; }; struct si_sampler_views { struct si_descriptors desc; struct pipe_sampler_view *views[SI_NUM_SAMPLER_VIEWS]; - uint32_t *desc_data[SI_NUM_SAMPLER_VIEWS]; }; struct si_sampler_states { struct si_descriptors desc; - uint32_t *desc_data[SI_NUM_SAMPLER_STATES]; void *saved_states[2]; /* saved for u_blitter */ }; struct si_buffer_resources { struct si_descriptors desc; - unsigned num_buffers; enum radeon_bo_usage shader_usage; /* READ, WRITE, or READWRITE */ enum radeon_bo_priority priority; struct pipe_resource **buffers; /* this has num_buffers elements */ - uint32_t *desc_storage; /* this has num_buffers*4 elements */ - uint32_t **desc_data; /* an array of pointers pointing to desc_storage */ }; #define si_pm4_block_idx(member) \ @@ -232,20 +235,18 @@ struct si_buffer_resources { /* si_descriptors.c */ void si_set_sampler_descriptors(struct si_context *sctx, unsigned shader, unsigned start, unsigned count, void **states); -void si_update_vertex_buffers(struct si_context *sctx); void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot, struct pipe_resource *buffer, unsigned stride, unsigned num_records, bool add_tid, bool swizzle, - unsigned element_size, unsigned index_stride); + unsigned element_size, unsigned index_stride, uint64_t offset); void si_init_all_descriptors(struct si_context *sctx); +bool si_upload_shader_descriptors(struct si_context *sctx); void si_release_all_descriptors(struct si_context *sctx); void si_all_descriptors_begin_new_cs(struct si_context *sctx); -void si_copy_buffer(struct si_context *sctx, - struct pipe_resource *dst, struct pipe_resource *src, - uint64_t dst_offset, uint64_t src_offset, unsigned size, bool is_framebuffer); void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuffer, const uint8_t *ptr, unsigned size, uint32_t *const_offset); +void si_shader_change_notify(struct si_context *sctx); /* si_state.c */ struct si_shader_selector; @@ -256,7 +257,6 @@ boolean si_is_format_supported(struct pipe_screen *screen, unsigned sample_count, unsigned usage); void si_init_state_functions(struct si_context *sctx); -void si_init_config(struct si_context *sctx); unsigned cik_bank_wh(unsigned bankwh); unsigned cik_db_pipe_config(struct si_screen *sscreen, unsigned tile_mode); unsigned cik_macro_tile_aspect(unsigned macro_tile_aspect); @@ -264,6 +264,12 @@ unsigned cik_tile_split(unsigned tile_split); unsigned si_array_mode(unsigned mode); uint32_t si_num_banks(struct si_screen *sscreen, struct r600_texture *tex); unsigned si_tile_mode_index(struct r600_texture *rtex, unsigned level, bool stencil); +struct pipe_sampler_view * +si_create_sampler_view_custom(struct pipe_context *ctx, + struct pipe_resource *texture, + const struct pipe_sampler_view *state, + unsigned width0, unsigned height0, + unsigned force_level); /* si_state_shader.c */ void si_update_shaders(struct si_context *sctx); diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index 2e77d85a80d..4c21655596c 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -31,6 +31,7 @@ #include "util/u_index_modify.h" #include "util/u_upload_mgr.h" +#include "util/u_prim.h" static void si_decompress_textures(struct si_context *sctx) { @@ -64,6 +65,7 @@ static unsigned si_conv_pipe_prim(unsigned mode) [PIPE_PRIM_LINE_STRIP_ADJACENCY] = V_008958_DI_PT_LINESTRIP_ADJ, [PIPE_PRIM_TRIANGLES_ADJACENCY] = V_008958_DI_PT_TRILIST_ADJ, 
[PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = V_008958_DI_PT_TRISTRIP_ADJ, + [PIPE_PRIM_PATCHES] = V_008958_DI_PT_PATCH, [R600_PRIM_RECTANGLE_LIST] = V_008958_DI_PT_RECTLIST }; assert(mode < Elements(prim_conv)); @@ -87,6 +89,7 @@ static unsigned si_conv_prim_to_gs_out(unsigned mode) [PIPE_PRIM_LINE_STRIP_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_LINESTRIP, [PIPE_PRIM_TRIANGLES_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, + [PIPE_PRIM_PATCHES] = V_028A6C_OUTPRIM_TYPE_POINTLIST, [R600_PRIM_RECTANGLE_LIST] = V_028A6C_OUTPRIM_TYPE_TRISTRIP }; assert(mode < Elements(prim_conv)); @@ -94,8 +97,128 @@ static unsigned si_conv_prim_to_gs_out(unsigned mode) return prim_conv[mode]; } +/** + * This calculates the LDS size for tessellation shaders (VS, TCS, TES). + * LS.LDS_SIZE is shared by all 3 shader stages. + * + * The information about LDS and other non-compile-time parameters is then + * written to userdata SGPRs. + */ +static void si_emit_derived_tess_state(struct si_context *sctx, + const struct pipe_draw_info *info, + unsigned *num_patches) +{ + struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct si_shader_selector *ls = sctx->vs_shader; + /* The TES pointer will only be used for sctx->last_tcs. + * It would be wrong to think that TCS = TES. */ + struct si_shader_selector *tcs = + sctx->tcs_shader ? sctx->tcs_shader : sctx->tes_shader; + unsigned tes_sh_base = sctx->shader_userdata.sh_base[PIPE_SHADER_TESS_EVAL]; + unsigned num_tcs_input_cp = info->vertices_per_patch; + unsigned num_tcs_output_cp, num_tcs_inputs, num_tcs_outputs; + unsigned num_tcs_patch_outputs; + unsigned input_vertex_size, output_vertex_size, pervertex_output_patch_size; + unsigned input_patch_size, output_patch_size, output_patch0_offset; + unsigned perpatch_output_offset, lds_size, ls_rsrc2; + unsigned tcs_in_layout, tcs_out_layout, tcs_out_offsets; + + *num_patches = 1; /* TODO: calculate this */ + + if (sctx->last_ls == ls->current && + sctx->last_tcs == tcs && + sctx->last_tes_sh_base == tes_sh_base && + sctx->last_num_tcs_input_cp == num_tcs_input_cp) + return; + + sctx->last_ls = ls->current; + sctx->last_tcs = tcs; + sctx->last_tes_sh_base = tes_sh_base; + sctx->last_num_tcs_input_cp = num_tcs_input_cp; + + /* This calculates how shader inputs and outputs among VS, TCS, and TES + * are laid out in LDS. */ + num_tcs_inputs = util_last_bit64(ls->outputs_written); + + if (sctx->tcs_shader) { + num_tcs_outputs = util_last_bit64(tcs->outputs_written); + num_tcs_output_cp = tcs->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT]; + num_tcs_patch_outputs = util_last_bit64(tcs->patch_outputs_written); + } else { + /* No TCS. Route varyings from LS to TES. */ + num_tcs_outputs = num_tcs_inputs; + num_tcs_output_cp = num_tcs_input_cp; + num_tcs_patch_outputs = 2; /* TESSINNER + TESSOUTER */ + } + + input_vertex_size = num_tcs_inputs * 16; + output_vertex_size = num_tcs_outputs * 16; + + input_patch_size = num_tcs_input_cp * input_vertex_size; + + pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size; + output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16; + + output_patch0_offset = sctx->tcs_shader ? 
input_patch_size * *num_patches : 0; + perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size; + + lds_size = output_patch0_offset + output_patch_size * *num_patches; + ls_rsrc2 = ls->current->ls_rsrc2; + + if (sctx->b.chip_class >= CIK) { + assert(lds_size <= 65536); + ls_rsrc2 |= S_00B52C_LDS_SIZE(align(lds_size, 512) / 512); + } else { + assert(lds_size <= 32768); + ls_rsrc2 |= S_00B52C_LDS_SIZE(align(lds_size, 256) / 256); + } + + /* Due to a hw bug, RSRC2_LS must be written twice with another + * LS register written in between. */ + if (sctx->b.chip_class == CIK && sctx->b.family != CHIP_HAWAII) + si_write_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, ls_rsrc2); + si_write_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2); + radeon_emit(cs, ls->current->ls_rsrc1); + radeon_emit(cs, ls_rsrc2); + + /* Compute userdata SGPRs. */ + assert(((input_vertex_size / 4) & ~0xff) == 0); + assert(((output_vertex_size / 4) & ~0xff) == 0); + assert(((input_patch_size / 4) & ~0x1fff) == 0); + assert(((output_patch_size / 4) & ~0x1fff) == 0); + assert(((output_patch0_offset / 16) & ~0xffff) == 0); + assert(((perpatch_output_offset / 16) & ~0xffff) == 0); + assert(num_tcs_input_cp <= 32); + assert(num_tcs_output_cp <= 32); + + tcs_in_layout = (input_patch_size / 4) | + ((input_vertex_size / 4) << 13); + tcs_out_layout = (output_patch_size / 4) | + ((output_vertex_size / 4) << 13); + tcs_out_offsets = (output_patch0_offset / 16) | + ((perpatch_output_offset / 16) << 16); + + /* Set them for LS. */ + si_write_sh_reg(cs, + R_00B530_SPI_SHADER_USER_DATA_LS_0 + SI_SGPR_LS_OUT_LAYOUT * 4, + tcs_in_layout); + + /* Set them for TCS. */ + si_write_sh_reg_seq(cs, + R_00B430_SPI_SHADER_USER_DATA_HS_0 + SI_SGPR_TCS_OUT_OFFSETS * 4, 3); + radeon_emit(cs, tcs_out_offsets); + radeon_emit(cs, tcs_out_layout | (num_tcs_input_cp << 26)); + radeon_emit(cs, tcs_in_layout); + + /* Set them for TES. */ + si_write_sh_reg_seq(cs, tes_sh_base + SI_SGPR_TCS_OUT_OFFSETS * 4, 2); + radeon_emit(cs, tcs_out_offsets); + radeon_emit(cs, tcs_out_layout | (num_tcs_output_cp << 26)); +} + static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx, - const struct pipe_draw_info *info) + const struct pipe_draw_info *info, + unsigned num_patches) { struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; unsigned prim = info->mode; @@ -104,11 +227,41 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx, /* SWITCH_ON_EOP(0) is always preferable. */ bool wd_switch_on_eop = false; bool ia_switch_on_eop = false; + bool ia_switch_on_eoi = false; bool partial_vs_wave = false; + bool partial_es_wave = false; if (sctx->gs_shader) primgroup_size = 64; /* recommended with a GS */ + if (sctx->tes_shader) { + unsigned num_cp_out = + sctx->tcs_shader ? + sctx->tcs_shader->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] : + info->vertices_per_patch; + unsigned max_size = 256 / MAX2(info->vertices_per_patch, num_cp_out); + + primgroup_size = MIN2(primgroup_size, max_size); + + /* primgroup_size must be set to a multiple of NUM_PATCHES */ + primgroup_size = (primgroup_size / num_patches) * num_patches; + + /* SWITCH_ON_EOI must be set if PrimID is used. + * If SWITCH_ON_EOI is set, PARTIAL_ES_WAVE must be set too. */ + if ((sctx->tcs_shader && sctx->tcs_shader->info.uses_primid) || + sctx->tes_shader->info.uses_primid) { + ia_switch_on_eoi = true; + partial_es_wave = true; + } + + /* Bug with tessellation and GS on Bonaire and older 2 SE chips. 
*/ + if ((sctx->b.family == CHIP_TAHITI || + sctx->b.family == CHIP_PITCAIRN || + sctx->b.family == CHIP_BONAIRE) && + sctx->gs_shader) + partial_vs_wave = true; + } + /* This is a hardware requirement. */ if ((rs && rs->line_stipple_enable) || (sctx->b.screen->debug_flags & DBG_SWITCH_ON_EOP)) { @@ -139,14 +292,52 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx, (info->indirect || info->instance_count > 1)) wd_switch_on_eop = true; + /* USE_OPAQUE doesn't work when WD_SWITCH_ON_EOP is 0. */ + if (info->count_from_stream_output) + wd_switch_on_eop = true; + /* If the WD switch is false, the IA switch must be false too. */ assert(wd_switch_on_eop || !ia_switch_on_eop); } + /* Hw bug with single-primitive instances and SWITCH_ON_EOI + * on multi-SE chips. */ + if (sctx->b.screen->info.max_se >= 2 && ia_switch_on_eoi && + (info->indirect || + (info->instance_count > 1 && + u_prims_for_vertices(info->mode, info->count) <= 1))) + sctx->b.flags |= SI_CONTEXT_VGT_FLUSH; + + /* Instancing bug on 2 SE chips. */ + if (sctx->b.screen->info.max_se == 2 && ia_switch_on_eoi && + (info->indirect || info->instance_count > 1)) + partial_vs_wave = true; + return S_028AA8_SWITCH_ON_EOP(ia_switch_on_eop) | + S_028AA8_SWITCH_ON_EOI(ia_switch_on_eoi) | S_028AA8_PARTIAL_VS_WAVE_ON(partial_vs_wave) | + S_028AA8_PARTIAL_ES_WAVE_ON(partial_es_wave) | S_028AA8_PRIMGROUP_SIZE(primgroup_size - 1) | - S_028AA8_WD_SWITCH_ON_EOP(sctx->b.chip_class >= CIK ? wd_switch_on_eop : 0); + S_028AA8_WD_SWITCH_ON_EOP(sctx->b.chip_class >= CIK ? wd_switch_on_eop : 0) | + S_028AA8_MAX_PRIMGRP_IN_WAVE(sctx->b.chip_class >= VI ? 2 : 0); +} + +static unsigned si_get_ls_hs_config(struct si_context *sctx, + const struct pipe_draw_info *info, + unsigned num_patches) +{ + unsigned num_output_cp; + + if (!sctx->tes_shader) + return 0; + + num_output_cp = sctx->tcs_shader ? + sctx->tcs_shader->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] : + info->vertices_per_patch; + + return S_028B58_NUM_PATCHES(num_patches) | + S_028B58_HS_NUM_INPUT_CP(info->vertices_per_patch) | + S_028B58_HS_NUM_OUTPUT_CP(num_output_cp); } static void si_emit_scratch_reloc(struct si_context *sctx) @@ -202,22 +393,31 @@ static void si_emit_draw_registers(struct si_context *sctx, struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; unsigned prim = si_conv_pipe_prim(info->mode); unsigned gs_out_prim = si_conv_prim_to_gs_out(sctx->current_rast_prim); - unsigned ia_multi_vgt_param = si_get_ia_multi_vgt_param(sctx, info); + unsigned ia_multi_vgt_param, ls_hs_config, num_patches = 0; + + if (sctx->tes_shader) + si_emit_derived_tess_state(sctx, info, &num_patches); + + ia_multi_vgt_param = si_get_ia_multi_vgt_param(sctx, info, num_patches); + ls_hs_config = si_get_ls_hs_config(sctx, info, num_patches); /* Draw state. 
*/ if (prim != sctx->last_prim || - ia_multi_vgt_param != sctx->last_multi_vgt_param) { + ia_multi_vgt_param != sctx->last_multi_vgt_param || + ls_hs_config != sctx->last_ls_hs_config) { if (sctx->b.chip_class >= CIK) { radeon_emit(cs, PKT3(PKT3_DRAW_PREAMBLE, 2, 0)); radeon_emit(cs, prim); /* VGT_PRIMITIVE_TYPE */ radeon_emit(cs, ia_multi_vgt_param); /* IA_MULTI_VGT_PARAM */ - radeon_emit(cs, 0); /* VGT_LS_HS_CONFIG */ + radeon_emit(cs, ls_hs_config); /* VGT_LS_HS_CONFIG */ } else { r600_write_config_reg(cs, R_008958_VGT_PRIMITIVE_TYPE, prim); r600_write_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param); + r600_write_context_reg(cs, R_028B58_VGT_LS_HS_CONFIG, ls_hs_config); } sctx->last_prim = prim; sctx->last_multi_vgt_param = ia_multi_vgt_param; + sctx->last_ls_hs_config = ls_hs_config; } if (gs_out_prim != sctx->last_gs_out_prim) { @@ -245,8 +445,7 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_index_buffer *ib) { struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; - unsigned sh_base_reg = (sctx->gs_shader ? R_00B330_SPI_SHADER_USER_DATA_ES_0 : - R_00B130_SPI_SHADER_USER_DATA_VS_0); + unsigned sh_base_reg = sctx->shader_userdata.sh_base[PIPE_SHADER_VERTEX]; if (info->count_from_stream_output) { struct r600_so_target *t = @@ -275,12 +474,24 @@ static void si_emit_draw_packets(struct si_context *sctx, if (info->indexed) { radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0)); - if (ib->index_size == 4) { - radeon_emit(cs, V_028A7C_VGT_INDEX_32 | (SI_BIG_ENDIAN ? - V_028A7C_VGT_DMA_SWAP_32_BIT : 0)); - } else { - radeon_emit(cs, V_028A7C_VGT_INDEX_16 | (SI_BIG_ENDIAN ? - V_028A7C_VGT_DMA_SWAP_16_BIT : 0)); + /* index type */ + switch (ib->index_size) { + case 1: + radeon_emit(cs, V_028A7C_VGT_INDEX_8); + break; + case 2: + radeon_emit(cs, V_028A7C_VGT_INDEX_16 | + (SI_BIG_ENDIAN && sctx->b.chip_class <= CIK ? + V_028A7C_VGT_DMA_SWAP_16_BIT : 0)); + break; + case 4: + radeon_emit(cs, V_028A7C_VGT_INDEX_32 | + (SI_BIG_ENDIAN && sctx->b.chip_class <= CIK ? + V_028A7C_VGT_DMA_SWAP_32_BIT : 0)); + break; + default: + assert(!"unreachable"); + return; } } @@ -406,9 +617,14 @@ void si_emit_cache_flush(struct r600_common_context *sctx, struct r600_atom *ato if (sctx->flags & SI_CONTEXT_INV_TC_L1) cp_coher_cntl |= S_0085F0_TCL1_ACTION_ENA(1); - if (sctx->flags & SI_CONTEXT_INV_TC_L2) + if (sctx->flags & SI_CONTEXT_INV_TC_L2) { cp_coher_cntl |= S_0085F0_TC_ACTION_ENA(1); + /* TODO: this might not be needed. */ + if (sctx->chip_class >= VI) + cp_coher_cntl |= S_0301F0_TC_WB_ACTION_ENA(1); + } + if (sctx->flags & SI_CONTEXT_FLUSH_AND_INV_CB) { cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) | S_0085F0_CB0_DEST_BASE_ENA(1) | @@ -520,8 +736,14 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) (info->indexed || !info->count_from_stream_output)) return; - if (!sctx->ps_shader || !sctx->vs_shader) + if (!sctx->ps_shader || !sctx->vs_shader) { + assert(0); return; + } + if (!!sctx->tes_shader != (info->mode == PIPE_PRIM_PATCHES)) { + assert(0); + return; + } si_decompress_textures(sctx); @@ -532,15 +754,15 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) * current_rast_prim for this draw_vbo call. 
*/ if (sctx->gs_shader) sctx->current_rast_prim = sctx->gs_shader->gs_output_prim; + else if (sctx->tes_shader) + sctx->current_rast_prim = + sctx->tes_shader->info.properties[TGSI_PROPERTY_TES_PRIM_MODE]; else sctx->current_rast_prim = info->mode; si_update_shaders(sctx); - - if (sctx->vertex_buffers_dirty) { - si_update_vertex_buffers(sctx); - sctx->vertex_buffers_dirty = false; - } + if (!si_upload_shader_descriptors(sctx)) + return; if (info->indexed) { /* Initialize the index buffer struct. */ @@ -550,7 +772,8 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) ib.offset = sctx->index_buffer.offset; /* Translate or upload, if needed. */ - if (ib.index_size == 1) { + /* 8-bit indices are supported on VI. */ + if (sctx->b.chip_class <= CIK && ib.index_size == 1) { struct pipe_resource *out_buffer = NULL; unsigned out_offset, start, count, start_offset; void *ptr; @@ -585,6 +808,8 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) } } + /* TODO: VI should read index buffers through TC, so this shouldn't be + * needed on VI. */ if (info->indexed && r600_resource(ib.buffer)->TC_L2_dirty) { sctx->b.flags |= SI_CONTEXT_INV_TC_L2; r600_resource(ib.buffer)->TC_L2_dirty = false; @@ -592,7 +817,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) /* Check flush flags. */ if (sctx->b.flags) - sctx->atoms.s.cache_flush->dirty = true; + si_mark_atom_dirty(sctx, sctx->atoms.s.cache_flush); si_need_cs_space(sctx, 0, TRUE); @@ -618,7 +843,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) /* Workaround for a VGT hang when streamout is enabled. * It must be done after drawing. */ - if (sctx->b.family == CHIP_HAWAII && + if ((sctx->b.family == CHIP_HAWAII || sctx->b.family == CHIP_TONGA) && (sctx->b.streamout.streamout_enabled || sctx->b.streamout.prims_gen_query_enabled)) { sctx->b.flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC; diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 208c8523ef1..0347014948d 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -30,9 +30,135 @@ #include "sid.h" #include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_ureg.h" #include "util/u_memory.h" #include "util/u_simple_shaders.h" +static void si_set_tesseval_regs(struct si_shader *shader, + struct si_pm4_state *pm4) +{ + struct tgsi_shader_info *info = &shader->selector->info; + unsigned tes_prim_mode = info->properties[TGSI_PROPERTY_TES_PRIM_MODE]; + unsigned tes_spacing = info->properties[TGSI_PROPERTY_TES_SPACING]; + bool tes_vertex_order_cw = info->properties[TGSI_PROPERTY_TES_VERTEX_ORDER_CW]; + bool tes_point_mode = info->properties[TGSI_PROPERTY_TES_POINT_MODE]; + unsigned type, partitioning, topology; + + switch (tes_prim_mode) { + case PIPE_PRIM_LINES: + type = V_028B6C_TESS_ISOLINE; + break; + case PIPE_PRIM_TRIANGLES: + type = V_028B6C_TESS_TRIANGLE; + break; + case PIPE_PRIM_QUADS: + type = V_028B6C_TESS_QUAD; + break; + default: + assert(0); + return; + } + + switch (tes_spacing) { + case PIPE_TESS_SPACING_FRACTIONAL_ODD: + partitioning = V_028B6C_PART_FRAC_ODD; + break; + case PIPE_TESS_SPACING_FRACTIONAL_EVEN: + partitioning = V_028B6C_PART_FRAC_EVEN; + break; + case PIPE_TESS_SPACING_EQUAL: + partitioning = V_028B6C_PART_INTEGER; + break; + default: + assert(0); + return; + } + + if (tes_point_mode) + topology = V_028B6C_OUTPUT_POINT; + else if (tes_prim_mode == 
PIPE_PRIM_LINES) + topology = V_028B6C_OUTPUT_LINE; + else if (tes_vertex_order_cw) + /* for some reason, this must be the other way around */ + topology = V_028B6C_OUTPUT_TRIANGLE_CCW; + else + topology = V_028B6C_OUTPUT_TRIANGLE_CW; + + si_pm4_set_reg(pm4, R_028B6C_VGT_TF_PARAM, + S_028B6C_TYPE(type) | + S_028B6C_PARTITIONING(partitioning) | + S_028B6C_TOPOLOGY(topology)); +} + +static void si_shader_ls(struct si_shader *shader) +{ + struct si_pm4_state *pm4; + unsigned num_sgprs, num_user_sgprs; + unsigned vgpr_comp_cnt; + uint64_t va; + + pm4 = shader->pm4 = CALLOC_STRUCT(si_pm4_state); + if (pm4 == NULL) + return; + + va = shader->bo->gpu_address; + si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA); + + /* We need at least 2 components for LS. + * VGPR0-3: (VertexID, RelAutoindex, ???, InstanceID). */ + vgpr_comp_cnt = shader->uses_instanceid ? 3 : 1; + + num_user_sgprs = SI_LS_NUM_USER_SGPR; + num_sgprs = shader->num_sgprs; + if (num_user_sgprs > num_sgprs) { + /* Last 2 reserved SGPRs are used for VCC */ + num_sgprs = num_user_sgprs + 2; + } + assert(num_sgprs <= 104); + + si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8); + si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, va >> 40); + + shader->ls_rsrc1 = S_00B528_VGPRS((shader->num_vgprs - 1) / 4) | + S_00B528_SGPRS((num_sgprs - 1) / 8) | + S_00B528_VGPR_COMP_CNT(vgpr_comp_cnt); + shader->ls_rsrc2 = S_00B52C_USER_SGPR(num_user_sgprs) | + S_00B52C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0); +} + +static void si_shader_hs(struct si_shader *shader) +{ + struct si_pm4_state *pm4; + unsigned num_sgprs, num_user_sgprs; + uint64_t va; + + pm4 = shader->pm4 = CALLOC_STRUCT(si_pm4_state); + if (pm4 == NULL) + return; + + va = shader->bo->gpu_address; + si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA); + + num_user_sgprs = SI_TCS_NUM_USER_SGPR; + num_sgprs = shader->num_sgprs; + /* One SGPR after user SGPRs is pre-loaded with tessellation factor + * buffer offset. */ + if ((num_user_sgprs + 1) > num_sgprs) { + /* Last 2 reserved SGPRs are used for VCC */ + num_sgprs = num_user_sgprs + 1 + 2; + } + assert(num_sgprs <= 104); + + si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_LO_HS, va >> 8); + si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS, va >> 40); + si_pm4_set_reg(pm4, R_00B428_SPI_SHADER_PGM_RSRC1_HS, + S_00B428_VGPRS((shader->num_vgprs - 1) / 4) | + S_00B428_SGPRS((num_sgprs - 1) / 8)); + si_pm4_set_reg(pm4, R_00B42C_SPI_SHADER_PGM_RSRC2_HS, + S_00B42C_USER_SGPR(num_user_sgprs) | + S_00B42C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0)); +} + static void si_shader_es(struct si_shader *shader) { struct si_pm4_state *pm4; @@ -48,9 +174,15 @@ static void si_shader_es(struct si_shader *shader) va = shader->bo->gpu_address; si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA); - vgpr_comp_cnt = shader->uses_instanceid ? 3 : 0; + if (shader->selector->type == PIPE_SHADER_VERTEX) { + vgpr_comp_cnt = shader->uses_instanceid ? 
3 : 0; + num_user_sgprs = SI_VS_NUM_USER_SGPR; + } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) { + vgpr_comp_cnt = 3; /* all components are needed for TES */ + num_user_sgprs = SI_TES_NUM_USER_SGPR; + } else + assert(0); - num_user_sgprs = SI_VS_NUM_USER_SGPR; num_sgprs = shader->num_sgprs; /* One SGPR after user SGPRs is pre-loaded with es2gs_offset */ if ((num_user_sgprs + 1) > num_sgprs) { @@ -69,17 +201,37 @@ static void si_shader_es(struct si_shader *shader) si_pm4_set_reg(pm4, R_00B32C_SPI_SHADER_PGM_RSRC2_ES, S_00B32C_USER_SGPR(num_user_sgprs) | S_00B32C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0)); + + if (shader->selector->type == PIPE_SHADER_TESS_EVAL) + si_set_tesseval_regs(shader, pm4); +} + +static unsigned si_gs_get_max_stream(struct si_shader *shader) +{ + struct pipe_stream_output_info *so = &shader->selector->so; + unsigned max_stream = 0, i; + + if (so->num_outputs == 0) + return 0; + + for (i = 0; i < so->num_outputs; i++) { + if (so->output[i].stream > max_stream) + max_stream = so->output[i].stream; + } + return max_stream; } static void si_shader_gs(struct si_shader *shader) { - unsigned gs_vert_itemsize = shader->selector->info.num_outputs * (16 >> 2); + unsigned gs_vert_itemsize = shader->selector->info.num_outputs * 16; unsigned gs_max_vert_out = shader->selector->gs_max_out_vertices; - unsigned gsvs_itemsize = gs_vert_itemsize * gs_max_vert_out; + unsigned gsvs_itemsize = (gs_vert_itemsize * gs_max_vert_out) >> 2; + unsigned gs_num_invocations = shader->selector->gs_num_invocations; unsigned cut_mode; struct si_pm4_state *pm4; unsigned num_sgprs, num_user_sgprs; uint64_t va; + unsigned max_stream = si_gs_get_max_stream(shader); /* The GSVS_RING_ITEMSIZE register takes 15 bits */ assert(gsvs_itemsize < (1 << 15)); @@ -107,16 +259,23 @@ static void si_shader_gs(struct si_shader *shader) S_028A40_GS_WRITE_OPTIMIZE(1)); si_pm4_set_reg(pm4, R_028A60_VGT_GSVS_RING_OFFSET_1, gsvs_itemsize); - si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, gsvs_itemsize); - si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, gsvs_itemsize); + si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, gsvs_itemsize * ((max_stream >= 2) ? 2 : 1)); + si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, gsvs_itemsize * ((max_stream >= 3) ? 3 : 1)); si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE, - util_bitcount64(shader->selector->gs_used_inputs) * (16 >> 2)); - si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, gsvs_itemsize); + util_bitcount64(shader->selector->inputs_read) * (16 >> 2)); + si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, gsvs_itemsize * (max_stream + 1)); si_pm4_set_reg(pm4, R_028B38_VGT_GS_MAX_VERT_OUT, gs_max_vert_out); - si_pm4_set_reg(pm4, R_028B5C_VGT_GS_VERT_ITEMSIZE, gs_vert_itemsize); + si_pm4_set_reg(pm4, R_028B5C_VGT_GS_VERT_ITEMSIZE, gs_vert_itemsize >> 2); + si_pm4_set_reg(pm4, R_028B60_VGT_GS_VERT_ITEMSIZE_1, (max_stream >= 1) ? gs_vert_itemsize >> 2 : 0); + si_pm4_set_reg(pm4, R_028B64_VGT_GS_VERT_ITEMSIZE_2, (max_stream >= 2) ? gs_vert_itemsize >> 2 : 0); + si_pm4_set_reg(pm4, R_028B68_VGT_GS_VERT_ITEMSIZE_3, (max_stream >= 3) ? 
gs_vert_itemsize >> 2 : 0); + + si_pm4_set_reg(pm4, R_028B90_VGT_GS_INSTANCE_CNT, + S_028B90_CNT(MIN2(gs_num_invocations, 127)) | + S_028B90_ENABLE(gs_num_invocations > 0)); va = shader->bo->gpu_address; si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA); @@ -143,19 +302,29 @@ static void si_shader_gs(struct si_shader *shader) static void si_shader_vs(struct si_shader *shader) { - struct tgsi_shader_info *info = &shader->selector->info; struct si_pm4_state *pm4; unsigned num_sgprs, num_user_sgprs; - unsigned nparams, i, vgpr_comp_cnt; + unsigned nparams, vgpr_comp_cnt; uint64_t va; unsigned window_space = shader->selector->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; + bool enable_prim_id = si_vs_exports_prim_id(shader); pm4 = shader->pm4 = CALLOC_STRUCT(si_pm4_state); if (pm4 == NULL) return; + /* If this is the GS copy shader, the GS state writes this register. + * Otherwise, the VS state writes it. + */ + if (!shader->is_gs_copy_shader) { + si_pm4_set_reg(pm4, R_028A40_VGT_GS_MODE, + S_028A40_MODE(enable_prim_id ? V_028A40_GS_SCENARIO_A : 0)); + si_pm4_set_reg(pm4, R_028A84_VGT_PRIMITIVEID_EN, enable_prim_id); + } else + si_pm4_set_reg(pm4, R_028A84_VGT_PRIMITIVEID_EN, 0); + va = shader->bo->gpu_address; si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA); @@ -163,8 +332,11 @@ static void si_shader_vs(struct si_shader *shader) vgpr_comp_cnt = 0; /* only VertexID is needed for GS-COPY. */ num_user_sgprs = SI_GSCOPY_NUM_USER_SGPR; } else if (shader->selector->type == PIPE_SHADER_VERTEX) { - vgpr_comp_cnt = shader->uses_instanceid ? 3 : 0; + vgpr_comp_cnt = shader->uses_instanceid ? 3 : (enable_prim_id ? 2 : 0); num_user_sgprs = SI_VS_NUM_USER_SGPR; + } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) { + vgpr_comp_cnt = 3; /* all components are needed for TES */ + num_user_sgprs = SI_TES_NUM_USER_SGPR; } else assert(0); @@ -175,28 +347,8 @@ static void si_shader_vs(struct si_shader *shader) } assert(num_sgprs <= 104); - /* Certain attributes (position, psize, etc.) don't count as params. - * VS is required to export at least one param and r600_shader_from_tgsi() - * takes care of adding a dummy export. - */ - for (nparams = 0, i = 0 ; i < info->num_outputs; i++) { - switch (info->output_semantic_name[i]) { - case TGSI_SEMANTIC_CLIPVERTEX: - case TGSI_SEMANTIC_CLIPDIST: - case TGSI_SEMANTIC_CULLDIST: - case TGSI_SEMANTIC_POSITION: - case TGSI_SEMANTIC_PSIZE: - case TGSI_SEMANTIC_EDGEFLAG: - case TGSI_SEMANTIC_VIEWPORT_INDEX: - case TGSI_SEMANTIC_LAYER: - break; - default: - nparams++; - } - } - if (nparams < 1) - nparams = 1; - + /* VS is required to export at least one param. 
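Editor's note: the SPI_VS_OUT_CONFIG field below is biased by one, i.e.
VS_EXPORT_COUNT(nparams - 1), so a field value of 0 already means one
param export and there is no encoding for zero; that is why nparams is
clamped with MAX2 rather than left at shader->nr_param_exports. Worked
example with an illustrative value: a position-only VS reporting
nr_param_exports == 0 still programs VS_EXPORT_COUNT(MAX2(0, 1) - 1),
i.e. VS_EXPORT_COUNT(0), one dummy export.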
*/ + nparams = MAX2(shader->nr_param_exports, 1); si_pm4_set_reg(pm4, R_0286C4_SPI_VS_OUT_CONFIG, S_0286C4_VS_EXPORT_COUNT(nparams - 1)); @@ -236,6 +388,9 @@ static void si_shader_vs(struct si_shader *shader) S_028818_VPORT_X_SCALE_ENA(1) | S_028818_VPORT_X_OFFSET_ENA(1) | S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) | S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1)); + + if (shader->selector->type == PIPE_SHADER_TESS_EVAL) + si_set_tesseval_regs(shader, pm4); } static void si_shader_ps(struct si_shader *shader) @@ -333,7 +488,18 @@ static void si_shader_init_pm4_state(struct si_shader *shader) switch (shader->selector->type) { case PIPE_SHADER_VERTEX: - if (shader->key.vs.as_es) + if (shader->key.vs.as_ls) + si_shader_ls(shader); + else if (shader->key.vs.as_es) + si_shader_es(shader); + else + si_shader_vs(shader); + break; + case PIPE_SHADER_TESS_CTRL: + si_shader_hs(shader); + break; + case PIPE_SHADER_TESS_EVAL: + if (shader->key.tes.as_es) si_shader_es(shader); else si_shader_vs(shader); @@ -351,7 +517,7 @@ static void si_shader_init_pm4_state(struct si_shader *shader) } /* Compute the key for the hw shader variant */ -static INLINE void si_shader_selector_key(struct pipe_context *ctx, +static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_shader_selector *sel, union si_shader_key *key) { @@ -367,10 +533,27 @@ static INLINE void si_shader_selector_key(struct pipe_context *ctx, key->vs.instance_divisors[i] = sctx->vertex_elements->elements[i].instance_divisor; - if (sctx->gs_shader) { + if (sctx->tes_shader) + key->vs.as_ls = 1; + else if (sctx->gs_shader) { key->vs.as_es = 1; - key->vs.gs_used_inputs = sctx->gs_shader->gs_used_inputs; + key->vs.es_enabled_outputs = sctx->gs_shader->inputs_read; } + + if (!sctx->gs_shader && sctx->ps_shader && + sctx->ps_shader->info.uses_primid) + key->vs.export_prim_id = 1; + break; + case PIPE_SHADER_TESS_CTRL: + key->tcs.prim_mode = + sctx->tes_shader->info.properties[TGSI_PROPERTY_TES_PRIM_MODE]; + break; + case PIPE_SHADER_TESS_EVAL: + if (sctx->gs_shader) { + key->tes.as_es = 1; + key->tes.es_enabled_outputs = sctx->gs_shader->inputs_read; + } else if (sctx->ps_shader && sctx->ps_shader->info.uses_primid) + key->tes.export_prim_id = 1; break; case PIPE_SHADER_GEOMETRY: break; @@ -468,6 +651,7 @@ static int si_shader_select(struct pipe_context *ctx, } si_shader_init_pm4_state(shader); sel->num_shaders++; + p_atomic_inc(&sctx->screen->b.num_compilations); } return 0; @@ -485,6 +669,7 @@ static void *si_create_shader_state(struct pipe_context *ctx, sel->tokens = tgsi_dup_tokens(state->tokens); sel->so = state->stream_output; tgsi_scan_shader(state->tokens, &sel->info); + p_atomic_inc(&sscreen->b.num_shaders_created); switch (pipe_shader_type) { case PIPE_SHADER_GEOMETRY: @@ -492,6 +677,8 @@ static void *si_create_shader_state(struct pipe_context *ctx, sel->info.properties[TGSI_PROPERTY_GS_OUTPUT_PRIM]; sel->gs_max_out_vertices = sel->info.properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES]; + sel->gs_num_invocations = + sel->info.properties[TGSI_PROPERTY_GS_INVOCATIONS]; for (i = 0; i < sel->info.num_inputs; i++) { unsigned name = sel->info.input_semantic_name[i]; @@ -501,10 +688,31 @@ static void *si_create_shader_state(struct pipe_context *ctx, case TGSI_SEMANTIC_PRIMID: break; default: - sel->gs_used_inputs |= + sel->inputs_read |= 1llu << si_shader_io_get_unique_index(name, index); } } + break; + + case PIPE_SHADER_VERTEX: + case PIPE_SHADER_TESS_CTRL: + for (i = 0; i < 
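/* Editor's note: this loop builds 64-bit usage masks keyed by
 * si_shader_io_get_unique_index(). A standalone sketch of the same
 * accumulation pattern, where unique_index() stands in for the driver's
 * mapping function (hypothetical names throughout):
 *
 *   uint64_t outputs_written = 0, patch_outputs_written = 0;
 *
 *   for (unsigned j = 0; j < num_outputs; j++) {
 *       uint64_t bit = 1llu << unique_index(name[j], index[j]);
 *
 *       if (name[j] == TGSI_SEMANTIC_TESSINNER ||
 *           name[j] == TGSI_SEMANTIC_TESSOUTER ||
 *           name[j] == TGSI_SEMANTIC_PATCH)
 *           patch_outputs_written |= bit;   // per-patch storage class
 *       else
 *           outputs_written |= bit;         // per-vertex storage class
 *   }
 *
 * Keeping per-patch and per-vertex outputs in separate masks lets the
 * tessellation linkage size the two storage regions independently. */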
sel->info.num_outputs; i++) { + unsigned name = sel->info.output_semantic_name[i]; + unsigned index = sel->info.output_semantic_index[i]; + + switch (name) { + case TGSI_SEMANTIC_TESSINNER: + case TGSI_SEMANTIC_TESSOUTER: + case TGSI_SEMANTIC_PATCH: + sel->patch_outputs_written |= + 1llu << si_shader_io_get_unique_index(name, index); + break; + default: + sel->outputs_written |= + 1llu << si_shader_io_get_unique_index(name, index); + } + } + break; } if (sscreen->b.debug_flags & DBG_PRECOMPILE) @@ -531,6 +739,18 @@ static void *si_create_vs_state(struct pipe_context *ctx, return si_create_shader_state(ctx, state, PIPE_SHADER_VERTEX); } +static void *si_create_tcs_state(struct pipe_context *ctx, + const struct pipe_shader_state *state) +{ + return si_create_shader_state(ctx, state, PIPE_SHADER_TESS_CTRL); +} + +static void *si_create_tes_state(struct pipe_context *ctx, + const struct pipe_shader_state *state) +{ + return si_create_shader_state(ctx, state, PIPE_SHADER_TESS_EVAL); +} + static void si_bind_vs_shader(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct si_context *)ctx; @@ -540,20 +760,58 @@ static void si_bind_vs_shader(struct pipe_context *ctx, void *state) return; sctx->vs_shader = sel; - sctx->clip_regs.dirty = true; + si_mark_atom_dirty(sctx, &sctx->clip_regs); } static void si_bind_gs_shader(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct si_context *)ctx; struct si_shader_selector *sel = state; + bool enable_changed = !!sctx->gs_shader != !!sel; if (sctx->gs_shader == sel) return; sctx->gs_shader = sel; - sctx->clip_regs.dirty = true; + si_mark_atom_dirty(sctx, &sctx->clip_regs); sctx->last_rast_prim = -1; /* reset this so that it gets updated */ + + if (enable_changed) + si_shader_change_notify(sctx); +} + +static void si_bind_tcs_shader(struct pipe_context *ctx, void *state) +{ + struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_selector *sel = state; + bool enable_changed = !!sctx->tcs_shader != !!sel; + + if (sctx->tcs_shader == sel) + return; + + sctx->tcs_shader = sel; + + if (enable_changed) + sctx->last_tcs = NULL; /* invalidate derived tess state */ +} + +static void si_bind_tes_shader(struct pipe_context *ctx, void *state) +{ + struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_selector *sel = state; + bool enable_changed = !!sctx->tes_shader != !!sel; + + if (sctx->tes_shader == sel) + return; + + sctx->tes_shader = sel; + si_mark_atom_dirty(sctx, &sctx->clip_regs); + sctx->last_rast_prim = -1; /* reset this so that it gets updated */ + + if (enable_changed) { + si_shader_change_notify(sctx); + sctx->last_tes_sh_base = -1; /* invalidate derived tess state */ + } } static void si_make_dummy_ps(struct si_context *sctx) @@ -594,7 +852,18 @@ static void si_delete_shader_selector(struct pipe_context *ctx, c = p->next_variant; switch (sel->type) { case PIPE_SHADER_VERTEX: - if (p->key.vs.as_es) + if (p->key.vs.as_ls) + si_pm4_delete_state(sctx, ls, p->pm4); + else if (p->key.vs.as_es) + si_pm4_delete_state(sctx, es, p->pm4); + else + si_pm4_delete_state(sctx, vs, p->pm4); + break; + case PIPE_SHADER_TESS_CTRL: + si_pm4_delete_state(sctx, hs, p->pm4); + break; + case PIPE_SHADER_TESS_EVAL: + if (p->key.tes.as_es) si_pm4_delete_state(sctx, es, p->pm4); else si_pm4_delete_state(sctx, vs, p->pm4); @@ -653,6 +922,30 @@ static void si_delete_ps_shader(struct pipe_context *ctx, void *state) si_delete_shader_selector(ctx, sel); } +static void si_delete_tcs_shader(struct pipe_context 
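/* Editor's note: the bind_*_shader callbacks above share one idiom for
 * detecting a stage being turned on or off, as opposed to merely swapped:
 *
 *   bool enable_changed = !!sctx->tcs_shader != !!sel;
 *
 * The double negation collapses each pointer to 0 or 1, so the test fires
 * only on NULL <-> non-NULL transitions. Only then is it worth calling
 * si_shader_change_notify() or invalidating the derived tessellation
 * state (last_tcs, last_tes_sh_base); replacing one selector with another
 * of the same stage leaves the pipeline configuration itself unchanged. */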
*ctx, void *state) +{ + struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_selector *sel = (struct si_shader_selector *)state; + + if (sctx->tcs_shader == sel) { + sctx->tcs_shader = NULL; + } + + si_delete_shader_selector(ctx, sel); +} + +static void si_delete_tes_shader(struct pipe_context *ctx, void *state) +{ + struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_selector *sel = (struct si_shader_selector *)state; + + if (sctx->tes_shader == sel) { + sctx->tes_shader = NULL; + } + + si_delete_shader_selector(ctx, sel); +} + static void si_update_spi_map(struct si_context *sctx) { struct si_shader *ps = sctx->ps_shader->current; @@ -694,7 +987,10 @@ bcolor: } } - if (j == vsinfo->num_outputs && !G_028644_PT_SPRITE_TEX(tmp)) { + if (name == TGSI_SEMANTIC_PRIMID) + /* PrimID is written after the last output. */ + tmp |= S_028644_OFFSET(vs->vs_output_param_offset[vsinfo->num_outputs]); + else if (j == vsinfo->num_outputs && !G_028644_PT_SPRITE_TEX(tmp)) { /* No corresponding output found, load defaults into input. * Don't set any other bits. * (FLAT_SHADE=1 completely changes behavior) */ @@ -720,7 +1016,7 @@ bcolor: static void si_init_gs_rings(struct si_context *sctx) { unsigned esgs_ring_size = 128 * 1024; - unsigned gsvs_ring_size = 64 * 1024 * 1024; + unsigned gsvs_ring_size = 60 * 1024 * 1024; assert(!sctx->gs_rings); sctx->gs_rings = CALLOC_STRUCT(si_pm4_state); @@ -732,6 +1028,12 @@ static void si_init_gs_rings(struct si_context *sctx) PIPE_USAGE_DEFAULT, gsvs_ring_size); if (sctx->b.chip_class >= CIK) { + if (sctx->b.chip_class >= VI) { + /* The maximum sizes are 63.999 MB on VI, because + * the register fields only have 18 bits. */ + assert(esgs_ring_size / 256 < (1 << 18)); + assert(gsvs_ring_size / 256 < (1 << 18)); + } si_pm4_set_reg(sctx->gs_rings, R_030900_VGT_ESGS_RING_SIZE, esgs_ring_size / 256); si_pm4_set_reg(sctx->gs_rings, R_030904_VGT_GSVS_RING_SIZE, @@ -745,15 +1047,42 @@ static void si_init_gs_rings(struct si_context *sctx) si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_ESGS, sctx->esgs_ring, 0, esgs_ring_size, - true, true, 4, 64); + true, true, 4, 64, 0); si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_ESGS, sctx->esgs_ring, 0, esgs_ring_size, - false, false, 0, 0); + false, false, 0, 0, 0); si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_GSVS, sctx->gsvs_ring, 0, gsvs_ring_size, - false, false, 0, 0); + false, false, 0, 0, 0); } +static void si_update_gs_rings(struct si_context *sctx) +{ + unsigned gs_vert_itemsize = sctx->gs_shader->info.num_outputs * 16; + unsigned gs_max_vert_out = sctx->gs_shader->gs_max_out_vertices; + unsigned gsvs_itemsize = gs_vert_itemsize * gs_max_vert_out; + uint64_t offset; + + si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS, + sctx->gsvs_ring, gsvs_itemsize, + 64, true, true, 4, 16, 0); + + offset = gsvs_itemsize * 64; + si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS_1, + sctx->gsvs_ring, gsvs_itemsize, + 64, true, true, 4, 16, offset); + + offset = (gsvs_itemsize * 2) * 64; + si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS_2, + sctx->gsvs_ring, gsvs_itemsize, + 64, true, true, 4, 16, offset); + + offset = (gsvs_itemsize * 3) * 64; + si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS_3, + sctx->gsvs_ring, gsvs_itemsize, + 64, true, true, 4, 16, offset); + +} /** * @returns 1 if \p sel has been updated to use a new scratch buffer and 0 * otherwise. 
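Editor's note on the ring setup above: the VI-only assertion follows from
the register layout. VGT_GSVS_RING_SIZE stores the size in 256-byte units
in an 18-bit field, so the hard ceiling is (1 << 18) * 256 = 64 MiB
exclusive, which is why gsvs_ring_size drops from 64 to 60 MiB in this
patch. In si_update_gs_rings() the four per-stream bindings are then laid
out back to back, stream n starting at byte offset

    n * gsvs_itemsize * 64

where 64 matches the record count passed to si_set_ring_buffer(). This
restates the arithmetic in the code above; it adds no new behavior.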
@@ -763,7 +1092,6 @@ static unsigned si_update_scratch_buffer(struct si_context *sctx, { struct si_shader *shader; uint64_t scratch_va = sctx->scratch_buffer->gpu_address; - unsigned char *ptr; if (!sel) return 0; @@ -784,12 +1112,7 @@ static unsigned si_update_scratch_buffer(struct si_context *sctx, si_shader_apply_scratch_relocs(sctx, shader, scratch_va); /* Replace the shader bo with a new bo that has the relocs applied. */ - r600_resource_reference(&shader->bo, NULL); - shader->bo = si_resource_create_custom(&sctx->screen->b.b, PIPE_USAGE_IMMUTABLE, - shader->binary.code_size); - ptr = sctx->screen->b.ws->buffer_map(shader->bo->cs_buf, NULL, PIPE_TRANSFER_WRITE); - util_memcpy_cpu_to_le32(ptr, shader->binary.code, shader->binary.code_size); - sctx->screen->b.ws->buffer_unmap(shader->bo->cs_buf); + si_shader_binary_upload(sctx->screen, shader); /* Update the shader state to use the new shader bo. */ si_shader_init_pm4_state(shader); @@ -818,10 +1141,14 @@ static unsigned si_get_scratch_buffer_bytes_per_wave(struct si_context *sctx, static unsigned si_get_max_scratch_bytes_per_wave(struct si_context *sctx) { - - return MAX3(si_get_scratch_buffer_bytes_per_wave(sctx, sctx->ps_shader), - si_get_scratch_buffer_bytes_per_wave(sctx, sctx->gs_shader), - si_get_scratch_buffer_bytes_per_wave(sctx, sctx->vs_shader)); + unsigned bytes = 0; + + bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx, sctx->ps_shader)); + bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx, sctx->gs_shader)); + bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx, sctx->vs_shader)); + bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx, sctx->tcs_shader)); + bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx, sctx->tes_shader)); + return bytes; } static void si_update_spi_tmpring_size(struct si_context *sctx) @@ -855,15 +1182,29 @@ static void si_update_spi_tmpring_size(struct si_context *sctx) si_pm4_bind_state(sctx, ps, sctx->ps_shader->current->pm4); if (si_update_scratch_buffer(sctx, sctx->gs_shader)) si_pm4_bind_state(sctx, gs, sctx->gs_shader->current->pm4); + if (si_update_scratch_buffer(sctx, sctx->tcs_shader)) + si_pm4_bind_state(sctx, hs, sctx->tcs_shader->current->pm4); - /* VS can be bound as ES or VS. */ - if (sctx->gs_shader) { + /* VS can be bound as LS, ES, or VS. */ + if (sctx->tes_shader) { + if (si_update_scratch_buffer(sctx, sctx->vs_shader)) + si_pm4_bind_state(sctx, ls, sctx->vs_shader->current->pm4); + } else if (sctx->gs_shader) { if (si_update_scratch_buffer(sctx, sctx->vs_shader)) si_pm4_bind_state(sctx, es, sctx->vs_shader->current->pm4); } else { if (si_update_scratch_buffer(sctx, sctx->vs_shader)) si_pm4_bind_state(sctx, vs, sctx->vs_shader->current->pm4); } + + /* TES can be bound as ES or VS. */ + if (sctx->gs_shader) { + if (si_update_scratch_buffer(sctx, sctx->tes_shader)) + si_pm4_bind_state(sctx, es, sctx->tes_shader->current->pm4); + } else { + if (si_update_scratch_buffer(sctx, sctx->tes_shader)) + si_pm4_bind_state(sctx, vs, sctx->tes_shader->current->pm4); + } } /* The LLVM shader backend should be reporting aligned scratch_sizes. 
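Editor's note: with tessellation enabled, si_get_max_scratch_bytes_per_wave()
above takes the maximum over all five stages, and the rebinding block
encodes the hardware stage mapping:

    tess?  gs?    VS slot runs   TES slot runs
    no     no     VS             -
    no     yes    ES             -
    yes    no     LS             VS
    yes    yes    LS             ES

The alignment requirement stated above exists because the WAVESIZE field
programmed just below counts 1 KiB granules, hence the
scratch_bytes_per_wave >> 10.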
*/ @@ -874,60 +1215,187 @@ static void si_update_spi_tmpring_size(struct si_context *sctx) S_0286E8_WAVESIZE(scratch_bytes_per_wave >> 10); } +static void si_init_tess_factor_ring(struct si_context *sctx) +{ + assert(!sctx->tf_state); + sctx->tf_state = CALLOC_STRUCT(si_pm4_state); + + sctx->tf_ring = pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM, + PIPE_USAGE_DEFAULT, + 32768 * sctx->screen->b.info.max_se); + sctx->b.clear_buffer(&sctx->b.b, sctx->tf_ring, 0, + sctx->tf_ring->width0, fui(0), false); + assert(((sctx->tf_ring->width0 / 4) & C_030938_SIZE) == 0); + + if (sctx->b.chip_class >= CIK) { + si_pm4_set_reg(sctx->tf_state, R_030938_VGT_TF_RING_SIZE, + S_030938_SIZE(sctx->tf_ring->width0 / 4)); + si_pm4_set_reg(sctx->tf_state, R_030940_VGT_TF_MEMORY_BASE, + r600_resource(sctx->tf_ring)->gpu_address >> 8); + } else { + si_pm4_set_reg(sctx->tf_state, R_008988_VGT_TF_RING_SIZE, + S_008988_SIZE(sctx->tf_ring->width0 / 4)); + si_pm4_set_reg(sctx->tf_state, R_0089B8_VGT_TF_MEMORY_BASE, + r600_resource(sctx->tf_ring)->gpu_address >> 8); + } + si_pm4_add_bo(sctx->tf_state, r600_resource(sctx->tf_ring), + RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RESOURCE_RW); + si_pm4_bind_state(sctx, tf_ring, sctx->tf_state); + + si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_TESS_CTRL, + SI_RING_TESS_FACTOR, sctx->tf_ring, 0, + sctx->tf_ring->width0, false, false, 0, 0, 0); + + sctx->b.flags |= SI_CONTEXT_VGT_FLUSH; +} + +/** + * This is used when TCS is NULL in the VS->TCS->TES chain. In this case, + * VS passes its outputs to TES directly, so the fixed-function shader only + * has to write TESSOUTER and TESSINNER. + */ +static void si_generate_fixed_func_tcs(struct si_context *sctx) +{ + struct ureg_src const0, const1; + struct ureg_dst tessouter, tessinner; + struct ureg_program *ureg = ureg_create(TGSI_PROCESSOR_TESS_CTRL); + + if (!ureg) + return; /* if we get here, we're screwed */ + + assert(!sctx->fixed_func_tcs_shader); + + ureg_DECL_constant2D(ureg, 0, 1, SI_DRIVER_STATE_CONST_BUF); + const0 = ureg_src_dimension(ureg_src_register(TGSI_FILE_CONSTANT, 0), + SI_DRIVER_STATE_CONST_BUF); + const1 = ureg_src_dimension(ureg_src_register(TGSI_FILE_CONSTANT, 1), + SI_DRIVER_STATE_CONST_BUF); + + tessouter = ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSOUTER, 0); + tessinner = ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSINNER, 0); + + ureg_MOV(ureg, tessouter, const0); + ureg_MOV(ureg, tessinner, const1); + ureg_END(ureg); + + sctx->fixed_func_tcs_shader = + ureg_create_shader_and_destroy(ureg, &sctx->b.b); + assert(sctx->fixed_func_tcs_shader); +} + +static void si_update_vgt_shader_config(struct si_context *sctx) +{ + /* Calculate the index of the config. 
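(Editor's note: the index packs the two enable bits as
index = 2 * has_tess + has_gs, and each of the four resulting configs is
built once and cached in sctx->vgt_shader_config.) The encoding is: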
+ * 0 = VS, 1 = VS+GS, 2 = VS+Tess, 3 = VS+Tess+GS */ + unsigned index = 2*!!sctx->tes_shader + !!sctx->gs_shader; + struct si_pm4_state **pm4 = &sctx->vgt_shader_config[index]; + + if (!*pm4) { + uint32_t stages = 0; + + *pm4 = CALLOC_STRUCT(si_pm4_state); + + if (sctx->tes_shader) { + stages |= S_028B54_LS_EN(V_028B54_LS_STAGE_ON) | + S_028B54_HS_EN(1); + + if (sctx->gs_shader) + stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS) | + S_028B54_GS_EN(1) | + S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER); + else + stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_DS); + } else if (sctx->gs_shader) { + stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) | + S_028B54_GS_EN(1) | + S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER); + } + + si_pm4_set_reg(*pm4, R_028B54_VGT_SHADER_STAGES_EN, stages); + } + si_pm4_bind_state(sctx, vgt_shader_config, *pm4); +} + +static void si_update_so(struct si_context *sctx, struct si_shader_selector *shader) +{ + struct pipe_stream_output_info *so = &shader->so; + uint32_t enabled_stream_buffers_mask = 0; + int i; + + for (i = 0; i < so->num_outputs; i++) + enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << (so->output[i].stream * 4); + sctx->b.streamout.enabled_stream_buffers_mask = enabled_stream_buffers_mask; + sctx->b.streamout.stride_in_dw = shader->so.stride; +} + void si_update_shaders(struct si_context *sctx) { struct pipe_context *ctx = (struct pipe_context*)sctx; struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - if (sctx->gs_shader) { - si_shader_select(ctx, sctx->gs_shader); - si_pm4_bind_state(sctx, gs, sctx->gs_shader->current->pm4); - si_pm4_bind_state(sctx, vs, sctx->gs_shader->current->gs_copy_shader->pm4); + /* Update stages before GS. */ + if (sctx->tes_shader) { + if (!sctx->tf_state) + si_init_tess_factor_ring(sctx); - sctx->b.streamout.stride_in_dw = sctx->gs_shader->so.stride; + /* VS as LS */ + si_shader_select(ctx, sctx->vs_shader); + si_pm4_bind_state(sctx, ls, sctx->vs_shader->current->pm4); + + if (sctx->tcs_shader) { + si_shader_select(ctx, sctx->tcs_shader); + si_pm4_bind_state(sctx, hs, sctx->tcs_shader->current->pm4); + } else { + if (!sctx->fixed_func_tcs_shader) + si_generate_fixed_func_tcs(sctx); + si_shader_select(ctx, sctx->fixed_func_tcs_shader); + si_pm4_bind_state(sctx, hs, + sctx->fixed_func_tcs_shader->current->pm4); + } + si_shader_select(ctx, sctx->tes_shader); + if (sctx->gs_shader) { + /* TES as ES */ + si_pm4_bind_state(sctx, es, sctx->tes_shader->current->pm4); + } else { + /* TES as VS */ + si_pm4_bind_state(sctx, vs, sctx->tes_shader->current->pm4); + si_update_so(sctx, sctx->tes_shader); + } + } else if (sctx->gs_shader) { + /* VS as ES */ si_shader_select(ctx, sctx->vs_shader); si_pm4_bind_state(sctx, es, sctx->vs_shader->current->pm4); + } else { + /* VS as VS */ + si_shader_select(ctx, sctx->vs_shader); + si_pm4_bind_state(sctx, vs, sctx->vs_shader->current->pm4); + si_update_so(sctx, sctx->vs_shader); + } + + /* Update GS. 
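Editor's note: when a GS is bound, the hardware VS stage runs the GS copy
shader (bound into the vs slot below), and streamout state comes from the
GS selector via si_update_so(). That helper packs one enable bit per
target buffer into a nibble per vertex stream:

    mask |= (1 << output_buffer) << (stream * 4);

so, for example, an output routed to buffer 2 on stream 1 sets bit 6
(0x40). The example values here are illustrative, not from the patch.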
*/ + if (sctx->gs_shader) { + si_shader_select(ctx, sctx->gs_shader); + si_pm4_bind_state(sctx, gs, sctx->gs_shader->current->pm4); + si_pm4_bind_state(sctx, vs, sctx->gs_shader->current->gs_copy_shader->pm4); + si_update_so(sctx, sctx->gs_shader); if (!sctx->gs_rings) si_init_gs_rings(sctx); + if (sctx->emitted.named.gs_rings != sctx->gs_rings) sctx->b.flags |= SI_CONTEXT_VGT_FLUSH; si_pm4_bind_state(sctx, gs_rings, sctx->gs_rings); - si_set_ring_buffer(ctx, PIPE_SHADER_GEOMETRY, SI_RING_GSVS, - sctx->gsvs_ring, - sctx->gs_shader->gs_max_out_vertices * - sctx->gs_shader->info.num_outputs * 16, - 64, true, true, 4, 16); - - if (!sctx->gs_on) { - sctx->gs_on = CALLOC_STRUCT(si_pm4_state); - - si_pm4_set_reg(sctx->gs_on, R_028B54_VGT_SHADER_STAGES_EN, - S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) | - S_028B54_GS_EN(1) | - S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER)); - } - si_pm4_bind_state(sctx, gs_onoff, sctx->gs_on); + si_update_gs_rings(sctx); } else { - si_shader_select(ctx, sctx->vs_shader); - si_pm4_bind_state(sctx, vs, sctx->vs_shader->current->pm4); - - sctx->b.streamout.stride_in_dw = sctx->vs_shader->so.stride; - - if (!sctx->gs_off) { - sctx->gs_off = CALLOC_STRUCT(si_pm4_state); - - si_pm4_set_reg(sctx->gs_off, R_028A40_VGT_GS_MODE, 0); - si_pm4_set_reg(sctx->gs_off, R_028B54_VGT_SHADER_STAGES_EN, 0); - } - si_pm4_bind_state(sctx, gs_onoff, sctx->gs_off); si_pm4_bind_state(sctx, gs_rings, NULL); si_pm4_bind_state(sctx, gs, NULL); si_pm4_bind_state(sctx, es, NULL); } + si_update_vgt_shader_config(sctx); + si_shader_select(ctx, sctx->ps_shader); if (!sctx->ps_shader->current) { @@ -957,29 +1425,35 @@ void si_update_shaders(struct si_context *sctx) if (sctx->ps_db_shader_control != sctx->ps_shader->current->db_shader_control) { sctx->ps_db_shader_control = sctx->ps_shader->current->db_shader_control; - sctx->db_render_state.dirty = true; + si_mark_atom_dirty(sctx, &sctx->db_render_state); } if (sctx->smoothing_enabled != sctx->ps_shader->current->key.ps.poly_line_smoothing) { sctx->smoothing_enabled = sctx->ps_shader->current->key.ps.poly_line_smoothing; - sctx->msaa_config.dirty = true; + si_mark_atom_dirty(sctx, &sctx->msaa_config); if (sctx->b.chip_class == SI) - sctx->db_render_state.dirty = true; + si_mark_atom_dirty(sctx, &sctx->db_render_state); } } void si_init_shader_functions(struct si_context *sctx) { sctx->b.b.create_vs_state = si_create_vs_state; + sctx->b.b.create_tcs_state = si_create_tcs_state; + sctx->b.b.create_tes_state = si_create_tes_state; sctx->b.b.create_gs_state = si_create_gs_state; sctx->b.b.create_fs_state = si_create_fs_state; sctx->b.b.bind_vs_state = si_bind_vs_shader; + sctx->b.b.bind_tcs_state = si_bind_tcs_shader; + sctx->b.b.bind_tes_state = si_bind_tes_shader; sctx->b.b.bind_gs_state = si_bind_gs_shader; sctx->b.b.bind_fs_state = si_bind_ps_shader; sctx->b.b.delete_vs_state = si_delete_vs_shader; + sctx->b.b.delete_tcs_state = si_delete_tcs_shader; + sctx->b.b.delete_tes_state = si_delete_tes_shader; sctx->b.b.delete_gs_state = si_delete_gs_shader; sctx->b.b.delete_fs_state = si_delete_ps_shader; } diff --git a/src/gallium/drivers/radeonsi/sid.h b/src/gallium/drivers/radeonsi/sid.h index 35d5ee232a0..66fdf35c8af 100644 --- a/src/gallium/drivers/radeonsi/sid.h +++ b/src/gallium/drivers/radeonsi/sid.h @@ -206,6 +206,398 @@ * 6. 
COMMAND [29:22] | BYTE_COUNT [20:0] */ + +#define R_000E4C_SRBM_STATUS2 0x000E4C +#define S_000E4C_SDMA_RQ_PENDING(x) (((x) & 0x1) << 0) +#define G_000E4C_SDMA_RQ_PENDING(x) (((x) >> 0) & 0x1) +#define C_000E4C_SDMA_RQ_PENDING 0xFFFFFFFE +#define S_000E4C_TST_RQ_PENDING(x) (((x) & 0x1) << 1) +#define G_000E4C_TST_RQ_PENDING(x) (((x) >> 1) & 0x1) +#define C_000E4C_TST_RQ_PENDING 0xFFFFFFFD +#define S_000E4C_SDMA1_RQ_PENDING(x) (((x) & 0x1) << 2) +#define G_000E4C_SDMA1_RQ_PENDING(x) (((x) >> 2) & 0x1) +#define C_000E4C_SDMA1_RQ_PENDING 0xFFFFFFFB +#define S_000E4C_VCE0_RQ_PENDING(x) (((x) & 0x1) << 3) +#define G_000E4C_VCE0_RQ_PENDING(x) (((x) >> 3) & 0x1) +#define C_000E4C_VCE0_RQ_PENDING 0xFFFFFFF7 +#define S_000E4C_VP8_BUSY(x) (((x) & 0x1) << 4) +#define G_000E4C_VP8_BUSY(x) (((x) >> 4) & 0x1) +#define C_000E4C_VP8_BUSY 0xFFFFFFEF +#define S_000E4C_SDMA_BUSY(x) (((x) & 0x1) << 5) +#define G_000E4C_SDMA_BUSY(x) (((x) >> 5) & 0x1) +#define C_000E4C_SDMA_BUSY 0xFFFFFFDF +#define S_000E4C_SDMA1_BUSY(x) (((x) & 0x1) << 6) +#define G_000E4C_SDMA1_BUSY(x) (((x) >> 6) & 0x1) +#define C_000E4C_SDMA1_BUSY 0xFFFFFFBF +#define S_000E4C_VCE0_BUSY(x) (((x) & 0x1) << 7) +#define G_000E4C_VCE0_BUSY(x) (((x) >> 7) & 0x1) +#define C_000E4C_VCE0_BUSY 0xFFFFFF7F +#define S_000E4C_XDMA_BUSY(x) (((x) & 0x1) << 8) +#define G_000E4C_XDMA_BUSY(x) (((x) >> 8) & 0x1) +#define C_000E4C_XDMA_BUSY 0xFFFFFEFF +#define S_000E4C_CHUB_BUSY(x) (((x) & 0x1) << 9) +#define G_000E4C_CHUB_BUSY(x) (((x) >> 9) & 0x1) +#define C_000E4C_CHUB_BUSY 0xFFFFFDFF +#define S_000E4C_SDMA2_BUSY(x) (((x) & 0x1) << 10) +#define G_000E4C_SDMA2_BUSY(x) (((x) >> 10) & 0x1) +#define C_000E4C_SDMA2_BUSY 0xFFFFFBFF +#define S_000E4C_SDMA3_BUSY(x) (((x) & 0x1) << 11) +#define G_000E4C_SDMA3_BUSY(x) (((x) >> 11) & 0x1) +#define C_000E4C_SDMA3_BUSY 0xFFFFF7FF +#define S_000E4C_SAMSCP_BUSY(x) (((x) & 0x1) << 12) +#define G_000E4C_SAMSCP_BUSY(x) (((x) >> 12) & 0x1) +#define C_000E4C_SAMSCP_BUSY 0xFFFFEFFF +#define S_000E4C_ISP_BUSY(x) (((x) & 0x1) << 13) +#define G_000E4C_ISP_BUSY(x) (((x) >> 13) & 0x1) +#define C_000E4C_ISP_BUSY 0xFFFFDFFF +#define S_000E4C_VCE1_BUSY(x) (((x) & 0x1) << 14) +#define G_000E4C_VCE1_BUSY(x) (((x) >> 14) & 0x1) +#define C_000E4C_VCE1_BUSY 0xFFFFBFFF +#define S_000E4C_ODE_BUSY(x) (((x) & 0x1) << 15) +#define G_000E4C_ODE_BUSY(x) (((x) >> 15) & 0x1) +#define C_000E4C_ODE_BUSY 0xFFFF7FFF +#define S_000E4C_SDMA2_RQ_PENDING(x) (((x) & 0x1) << 16) +#define G_000E4C_SDMA2_RQ_PENDING(x) (((x) >> 16) & 0x1) +#define C_000E4C_SDMA2_RQ_PENDING 0xFFFEFFFF +#define S_000E4C_SDMA3_RQ_PENDING(x) (((x) & 0x1) << 17) +#define G_000E4C_SDMA3_RQ_PENDING(x) (((x) >> 17) & 0x1) +#define C_000E4C_SDMA3_RQ_PENDING 0xFFFDFFFF +#define S_000E4C_SAMSCP_RQ_PENDING(x) (((x) & 0x1) << 18) +#define G_000E4C_SAMSCP_RQ_PENDING(x) (((x) >> 18) & 0x1) +#define C_000E4C_SAMSCP_RQ_PENDING 0xFFFBFFFF +#define S_000E4C_ISP_RQ_PENDING(x) (((x) & 0x1) << 19) +#define G_000E4C_ISP_RQ_PENDING(x) (((x) >> 19) & 0x1) +#define C_000E4C_ISP_RQ_PENDING 0xFFF7FFFF +#define S_000E4C_VCE1_RQ_PENDING(x) (((x) & 0x1) << 20) +#define G_000E4C_VCE1_RQ_PENDING(x) (((x) >> 20) & 0x1) +#define C_000E4C_VCE1_RQ_PENDING 0xFFEFFFFF +#define R_000E50_SRBM_STATUS 0x000E50 +#define S_000E50_UVD_RQ_PENDING(x) (((x) & 0x1) << 1) +#define G_000E50_UVD_RQ_PENDING(x) (((x) >> 1) & 0x1) +#define C_000E50_UVD_RQ_PENDING 0xFFFFFFFD +#define S_000E50_SAMMSP_RQ_PENDING(x) (((x) & 0x1) << 2) +#define G_000E50_SAMMSP_RQ_PENDING(x) (((x) >> 2) & 0x1) +#define C_000E50_SAMMSP_RQ_PENDING 0xFFFFFFFB 
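/* Editor's note: the generated definitions below follow one convention
 * per field: S_<reg>_<FIELD>(x) shifts a value into position,
 * G_<reg>_<FIELD>(x) extracts it, and C_<reg>_<FIELD> is the complement
 * mask for clearing the field in a read-modify-write. For status
 * registers like SRBM_STATUS the getters are the useful half; a sketch,
 * with read_reg() as a hypothetical MMIO accessor:
 *
 *   uint32_t v = read_reg(R_000E50_SRBM_STATUS);
 *   if (G_000E50_GRBM_RQ_PENDING(v) || G_000E50_UVD_BUSY(v))
 *       ;   // not idle yet, keep polling
 *
 * The three constants per field are consistent by construction, e.g.
 * S_000E50_GRBM_RQ_PENDING(1) == 1 << 5 == 0x20 and
 * C_000E50_GRBM_RQ_PENDING == ~0x20 == 0xFFFFFFDF. */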
+#define S_000E50_ACP_RQ_PENDING(x) (((x) & 0x1) << 3) +#define G_000E50_ACP_RQ_PENDING(x) (((x) >> 3) & 0x1) +#define C_000E50_ACP_RQ_PENDING 0xFFFFFFF7 +#define S_000E50_SMU_RQ_PENDING(x) (((x) & 0x1) << 4) +#define G_000E50_SMU_RQ_PENDING(x) (((x) >> 4) & 0x1) +#define C_000E50_SMU_RQ_PENDING 0xFFFFFFEF +#define S_000E50_GRBM_RQ_PENDING(x) (((x) & 0x1) << 5) +#define G_000E50_GRBM_RQ_PENDING(x) (((x) >> 5) & 0x1) +#define C_000E50_GRBM_RQ_PENDING 0xFFFFFFDF +#define S_000E50_HI_RQ_PENDING(x) (((x) & 0x1) << 6) +#define G_000E50_HI_RQ_PENDING(x) (((x) >> 6) & 0x1) +#define C_000E50_HI_RQ_PENDING 0xFFFFFFBF +#define S_000E50_VMC_BUSY(x) (((x) & 0x1) << 8) +#define G_000E50_VMC_BUSY(x) (((x) >> 8) & 0x1) +#define C_000E50_VMC_BUSY 0xFFFFFEFF +#define S_000E50_MCB_BUSY(x) (((x) & 0x1) << 9) +#define G_000E50_MCB_BUSY(x) (((x) >> 9) & 0x1) +#define C_000E50_MCB_BUSY 0xFFFFFDFF +#define S_000E50_MCB_NON_DISPLAY_BUSY(x) (((x) & 0x1) << 10) +#define G_000E50_MCB_NON_DISPLAY_BUSY(x) (((x) >> 10) & 0x1) +#define C_000E50_MCB_NON_DISPLAY_BUSY 0xFFFFFBFF +#define S_000E50_MCC_BUSY(x) (((x) & 0x1) << 11) +#define G_000E50_MCC_BUSY(x) (((x) >> 11) & 0x1) +#define C_000E50_MCC_BUSY 0xFFFFF7FF +#define S_000E50_MCD_BUSY(x) (((x) & 0x1) << 12) +#define G_000E50_MCD_BUSY(x) (((x) >> 12) & 0x1) +#define C_000E50_MCD_BUSY 0xFFFFEFFF +#define S_000E50_VMC1_BUSY(x) (((x) & 0x1) << 13) +#define G_000E50_VMC1_BUSY(x) (((x) >> 13) & 0x1) +#define C_000E50_VMC1_BUSY 0xFFFFDFFF +#define S_000E50_SEM_BUSY(x) (((x) & 0x1) << 14) +#define G_000E50_SEM_BUSY(x) (((x) >> 14) & 0x1) +#define C_000E50_SEM_BUSY 0xFFFFBFFF +#define S_000E50_ACP_BUSY(x) (((x) & 0x1) << 16) +#define G_000E50_ACP_BUSY(x) (((x) >> 16) & 0x1) +#define C_000E50_ACP_BUSY 0xFFFEFFFF +#define S_000E50_IH_BUSY(x) (((x) & 0x1) << 17) +#define G_000E50_IH_BUSY(x) (((x) >> 17) & 0x1) +#define C_000E50_IH_BUSY 0xFFFDFFFF +#define S_000E50_UVD_BUSY(x) (((x) & 0x1) << 19) +#define G_000E50_UVD_BUSY(x) (((x) >> 19) & 0x1) +#define C_000E50_UVD_BUSY 0xFFF7FFFF +#define S_000E50_SAMMSP_BUSY(x) (((x) & 0x1) << 20) +#define G_000E50_SAMMSP_BUSY(x) (((x) >> 20) & 0x1) +#define C_000E50_SAMMSP_BUSY 0xFFEFFFFF +#define S_000E50_GCATCL2_BUSY(x) (((x) & 0x1) << 21) +#define G_000E50_GCATCL2_BUSY(x) (((x) >> 21) & 0x1) +#define C_000E50_GCATCL2_BUSY 0xFFDFFFFF +#define S_000E50_OSATCL2_BUSY(x) (((x) & 0x1) << 22) +#define G_000E50_OSATCL2_BUSY(x) (((x) >> 22) & 0x1) +#define C_000E50_OSATCL2_BUSY 0xFFBFFFFF +#define S_000E50_BIF_BUSY(x) (((x) & 0x1) << 29) +#define G_000E50_BIF_BUSY(x) (((x) >> 29) & 0x1) +#define C_000E50_BIF_BUSY 0xDFFFFFFF +#define R_000E54_SRBM_STATUS3 0x000E54 +#define S_000E54_MCC0_BUSY(x) (((x) & 0x1) << 0) +#define G_000E54_MCC0_BUSY(x) (((x) >> 0) & 0x1) +#define C_000E54_MCC0_BUSY 0xFFFFFFFE +#define S_000E54_MCC1_BUSY(x) (((x) & 0x1) << 1) +#define G_000E54_MCC1_BUSY(x) (((x) >> 1) & 0x1) +#define C_000E54_MCC1_BUSY 0xFFFFFFFD +#define S_000E54_MCC2_BUSY(x) (((x) & 0x1) << 2) +#define G_000E54_MCC2_BUSY(x) (((x) >> 2) & 0x1) +#define C_000E54_MCC2_BUSY 0xFFFFFFFB +#define S_000E54_MCC3_BUSY(x) (((x) & 0x1) << 3) +#define G_000E54_MCC3_BUSY(x) (((x) >> 3) & 0x1) +#define C_000E54_MCC3_BUSY 0xFFFFFFF7 +#define S_000E54_MCC4_BUSY(x) (((x) & 0x1) << 4) +#define G_000E54_MCC4_BUSY(x) (((x) >> 4) & 0x1) +#define C_000E54_MCC4_BUSY 0xFFFFFFEF +#define S_000E54_MCC5_BUSY(x) (((x) & 0x1) << 5) +#define G_000E54_MCC5_BUSY(x) (((x) >> 5) & 0x1) +#define C_000E54_MCC5_BUSY 0xFFFFFFDF +#define S_000E54_MCC6_BUSY(x) (((x) & 0x1) << 6) +#define 
G_000E54_MCC6_BUSY(x) (((x) >> 6) & 0x1) +#define C_000E54_MCC6_BUSY 0xFFFFFFBF +#define S_000E54_MCC7_BUSY(x) (((x) & 0x1) << 7) +#define G_000E54_MCC7_BUSY(x) (((x) >> 7) & 0x1) +#define C_000E54_MCC7_BUSY 0xFFFFFF7F +#define S_000E54_MCD0_BUSY(x) (((x) & 0x1) << 8) +#define G_000E54_MCD0_BUSY(x) (((x) >> 8) & 0x1) +#define C_000E54_MCD0_BUSY 0xFFFFFEFF +#define S_000E54_MCD1_BUSY(x) (((x) & 0x1) << 9) +#define G_000E54_MCD1_BUSY(x) (((x) >> 9) & 0x1) +#define C_000E54_MCD1_BUSY 0xFFFFFDFF +#define S_000E54_MCD2_BUSY(x) (((x) & 0x1) << 10) +#define G_000E54_MCD2_BUSY(x) (((x) >> 10) & 0x1) +#define C_000E54_MCD2_BUSY 0xFFFFFBFF +#define S_000E54_MCD3_BUSY(x) (((x) & 0x1) << 11) +#define G_000E54_MCD3_BUSY(x) (((x) >> 11) & 0x1) +#define C_000E54_MCD3_BUSY 0xFFFFF7FF +#define S_000E54_MCD4_BUSY(x) (((x) & 0x1) << 12) +#define G_000E54_MCD4_BUSY(x) (((x) >> 12) & 0x1) +#define C_000E54_MCD4_BUSY 0xFFFFEFFF +#define S_000E54_MCD5_BUSY(x) (((x) & 0x1) << 13) +#define G_000E54_MCD5_BUSY(x) (((x) >> 13) & 0x1) +#define C_000E54_MCD5_BUSY 0xFFFFDFFF +#define S_000E54_MCD6_BUSY(x) (((x) & 0x1) << 14) +#define G_000E54_MCD6_BUSY(x) (((x) >> 14) & 0x1) +#define C_000E54_MCD6_BUSY 0xFFFFBFFF +#define S_000E54_MCD7_BUSY(x) (((x) & 0x1) << 15) +#define G_000E54_MCD7_BUSY(x) (((x) >> 15) & 0x1) +#define C_000E54_MCD7_BUSY 0xFFFF7FFF +#define R_00D034_SDMA0_STATUS_REG 0x00D034 +#define S_00D034_IDLE(x) (((x) & 0x1) << 0) +#define G_00D034_IDLE(x) (((x) >> 0) & 0x1) +#define C_00D034_IDLE 0xFFFFFFFE +#define S_00D034_REG_IDLE(x) (((x) & 0x1) << 1) +#define G_00D034_REG_IDLE(x) (((x) >> 1) & 0x1) +#define C_00D034_REG_IDLE 0xFFFFFFFD +#define S_00D034_RB_EMPTY(x) (((x) & 0x1) << 2) +#define G_00D034_RB_EMPTY(x) (((x) >> 2) & 0x1) +#define C_00D034_RB_EMPTY 0xFFFFFFFB +#define S_00D034_RB_FULL(x) (((x) & 0x1) << 3) +#define G_00D034_RB_FULL(x) (((x) >> 3) & 0x1) +#define C_00D034_RB_FULL 0xFFFFFFF7 +#define S_00D034_RB_CMD_IDLE(x) (((x) & 0x1) << 4) +#define G_00D034_RB_CMD_IDLE(x) (((x) >> 4) & 0x1) +#define C_00D034_RB_CMD_IDLE 0xFFFFFFEF +#define S_00D034_RB_CMD_FULL(x) (((x) & 0x1) << 5) +#define G_00D034_RB_CMD_FULL(x) (((x) >> 5) & 0x1) +#define C_00D034_RB_CMD_FULL 0xFFFFFFDF +#define S_00D034_IB_CMD_IDLE(x) (((x) & 0x1) << 6) +#define G_00D034_IB_CMD_IDLE(x) (((x) >> 6) & 0x1) +#define C_00D034_IB_CMD_IDLE 0xFFFFFFBF +#define S_00D034_IB_CMD_FULL(x) (((x) & 0x1) << 7) +#define G_00D034_IB_CMD_FULL(x) (((x) >> 7) & 0x1) +#define C_00D034_IB_CMD_FULL 0xFFFFFF7F +#define S_00D034_BLOCK_IDLE(x) (((x) & 0x1) << 8) +#define G_00D034_BLOCK_IDLE(x) (((x) >> 8) & 0x1) +#define C_00D034_BLOCK_IDLE 0xFFFFFEFF +#define S_00D034_INSIDE_IB(x) (((x) & 0x1) << 9) +#define G_00D034_INSIDE_IB(x) (((x) >> 9) & 0x1) +#define C_00D034_INSIDE_IB 0xFFFFFDFF +#define S_00D034_EX_IDLE(x) (((x) & 0x1) << 10) +#define G_00D034_EX_IDLE(x) (((x) >> 10) & 0x1) +#define C_00D034_EX_IDLE 0xFFFFFBFF +#define S_00D034_EX_IDLE_POLL_TIMER_EXPIRE(x) (((x) & 0x1) << 11) +#define G_00D034_EX_IDLE_POLL_TIMER_EXPIRE(x) (((x) >> 11) & 0x1) +#define C_00D034_EX_IDLE_POLL_TIMER_EXPIRE 0xFFFFF7FF +#define S_00D034_PACKET_READY(x) (((x) & 0x1) << 12) +#define G_00D034_PACKET_READY(x) (((x) >> 12) & 0x1) +#define C_00D034_PACKET_READY 0xFFFFEFFF +#define S_00D034_MC_WR_IDLE(x) (((x) & 0x1) << 13) +#define G_00D034_MC_WR_IDLE(x) (((x) >> 13) & 0x1) +#define C_00D034_MC_WR_IDLE 0xFFFFDFFF +#define S_00D034_SRBM_IDLE(x) (((x) & 0x1) << 14) +#define G_00D034_SRBM_IDLE(x) (((x) >> 14) & 0x1) +#define C_00D034_SRBM_IDLE 0xFFFFBFFF +#define 
S_00D034_CONTEXT_EMPTY(x) (((x) & 0x1) << 15) +#define G_00D034_CONTEXT_EMPTY(x) (((x) >> 15) & 0x1) +#define C_00D034_CONTEXT_EMPTY 0xFFFF7FFF +#define S_00D034_DELTA_RPTR_FULL(x) (((x) & 0x1) << 16) +#define G_00D034_DELTA_RPTR_FULL(x) (((x) >> 16) & 0x1) +#define C_00D034_DELTA_RPTR_FULL 0xFFFEFFFF +#define S_00D034_RB_MC_RREQ_IDLE(x) (((x) & 0x1) << 17) +#define G_00D034_RB_MC_RREQ_IDLE(x) (((x) >> 17) & 0x1) +#define C_00D034_RB_MC_RREQ_IDLE 0xFFFDFFFF +#define S_00D034_IB_MC_RREQ_IDLE(x) (((x) & 0x1) << 18) +#define G_00D034_IB_MC_RREQ_IDLE(x) (((x) >> 18) & 0x1) +#define C_00D034_IB_MC_RREQ_IDLE 0xFFFBFFFF +#define S_00D034_MC_RD_IDLE(x) (((x) & 0x1) << 19) +#define G_00D034_MC_RD_IDLE(x) (((x) >> 19) & 0x1) +#define C_00D034_MC_RD_IDLE 0xFFF7FFFF +#define S_00D034_DELTA_RPTR_EMPTY(x) (((x) & 0x1) << 20) +#define G_00D034_DELTA_RPTR_EMPTY(x) (((x) >> 20) & 0x1) +#define C_00D034_DELTA_RPTR_EMPTY 0xFFEFFFFF +#define S_00D034_MC_RD_RET_STALL(x) (((x) & 0x1) << 21) +#define G_00D034_MC_RD_RET_STALL(x) (((x) >> 21) & 0x1) +#define C_00D034_MC_RD_RET_STALL 0xFFDFFFFF +#define S_00D034_MC_RD_NO_POLL_IDLE(x) (((x) & 0x1) << 22) +#define G_00D034_MC_RD_NO_POLL_IDLE(x) (((x) >> 22) & 0x1) +#define C_00D034_MC_RD_NO_POLL_IDLE 0xFFBFFFFF +#define S_00D034_PREV_CMD_IDLE(x) (((x) & 0x1) << 25) +#define G_00D034_PREV_CMD_IDLE(x) (((x) >> 25) & 0x1) +#define C_00D034_PREV_CMD_IDLE 0xFDFFFFFF +#define S_00D034_SEM_IDLE(x) (((x) & 0x1) << 26) +#define G_00D034_SEM_IDLE(x) (((x) >> 26) & 0x1) +#define C_00D034_SEM_IDLE 0xFBFFFFFF +#define S_00D034_SEM_REQ_STALL(x) (((x) & 0x1) << 27) +#define G_00D034_SEM_REQ_STALL(x) (((x) >> 27) & 0x1) +#define C_00D034_SEM_REQ_STALL 0xF7FFFFFF +#define S_00D034_SEM_RESP_STATE(x) (((x) & 0x03) << 28) +#define G_00D034_SEM_RESP_STATE(x) (((x) >> 28) & 0x03) +#define C_00D034_SEM_RESP_STATE 0xCFFFFFFF +#define S_00D034_INT_IDLE(x) (((x) & 0x1) << 30) +#define G_00D034_INT_IDLE(x) (((x) >> 30) & 0x1) +#define C_00D034_INT_IDLE 0xBFFFFFFF +#define S_00D034_INT_REQ_STALL(x) (((x) & 0x1) << 31) +#define G_00D034_INT_REQ_STALL(x) (((x) >> 31) & 0x1) +#define C_00D034_INT_REQ_STALL 0x7FFFFFFF +#define R_00D834_SDMA1_STATUS_REG 0x00D834 +#define R_008008_GRBM_STATUS2 0x008008 +#define S_008008_ME0PIPE1_CMDFIFO_AVAIL(x) (((x) & 0x0F) << 0) +#define G_008008_ME0PIPE1_CMDFIFO_AVAIL(x) (((x) >> 0) & 0x0F) +#define C_008008_ME0PIPE1_CMDFIFO_AVAIL 0xFFFFFFF0 +#define S_008008_ME0PIPE1_CF_RQ_PENDING(x) (((x) & 0x1) << 4) +#define G_008008_ME0PIPE1_CF_RQ_PENDING(x) (((x) >> 4) & 0x1) +#define C_008008_ME0PIPE1_CF_RQ_PENDING 0xFFFFFFEF +#define S_008008_ME0PIPE1_PF_RQ_PENDING(x) (((x) & 0x1) << 5) +#define G_008008_ME0PIPE1_PF_RQ_PENDING(x) (((x) >> 5) & 0x1) +#define C_008008_ME0PIPE1_PF_RQ_PENDING 0xFFFFFFDF +#define S_008008_ME1PIPE0_RQ_PENDING(x) (((x) & 0x1) << 6) +#define G_008008_ME1PIPE0_RQ_PENDING(x) (((x) >> 6) & 0x1) +#define C_008008_ME1PIPE0_RQ_PENDING 0xFFFFFFBF +#define S_008008_ME1PIPE1_RQ_PENDING(x) (((x) & 0x1) << 7) +#define G_008008_ME1PIPE1_RQ_PENDING(x) (((x) >> 7) & 0x1) +#define C_008008_ME1PIPE1_RQ_PENDING 0xFFFFFF7F +#define S_008008_ME1PIPE2_RQ_PENDING(x) (((x) & 0x1) << 8) +#define G_008008_ME1PIPE2_RQ_PENDING(x) (((x) >> 8) & 0x1) +#define C_008008_ME1PIPE2_RQ_PENDING 0xFFFFFEFF +#define S_008008_ME1PIPE3_RQ_PENDING(x) (((x) & 0x1) << 9) +#define G_008008_ME1PIPE3_RQ_PENDING(x) (((x) >> 9) & 0x1) +#define C_008008_ME1PIPE3_RQ_PENDING 0xFFFFFDFF +#define S_008008_ME2PIPE0_RQ_PENDING(x) (((x) & 0x1) << 10) +#define G_008008_ME2PIPE0_RQ_PENDING(x) (((x) >> 
10) & 0x1) +#define C_008008_ME2PIPE0_RQ_PENDING 0xFFFFFBFF +#define S_008008_ME2PIPE1_RQ_PENDING(x) (((x) & 0x1) << 11) +#define G_008008_ME2PIPE1_RQ_PENDING(x) (((x) >> 11) & 0x1) +#define C_008008_ME2PIPE1_RQ_PENDING 0xFFFFF7FF +#define S_008008_ME2PIPE2_RQ_PENDING(x) (((x) & 0x1) << 12) +#define G_008008_ME2PIPE2_RQ_PENDING(x) (((x) >> 12) & 0x1) +#define C_008008_ME2PIPE2_RQ_PENDING 0xFFFFEFFF +#define S_008008_ME2PIPE3_RQ_PENDING(x) (((x) & 0x1) << 13) +#define G_008008_ME2PIPE3_RQ_PENDING(x) (((x) >> 13) & 0x1) +#define C_008008_ME2PIPE3_RQ_PENDING 0xFFFFDFFF +#define S_008008_RLC_RQ_PENDING(x) (((x) & 0x1) << 14) +#define G_008008_RLC_RQ_PENDING(x) (((x) >> 14) & 0x1) +#define C_008008_RLC_RQ_PENDING 0xFFFFBFFF +#define S_008008_RLC_BUSY(x) (((x) & 0x1) << 24) +#define G_008008_RLC_BUSY(x) (((x) >> 24) & 0x1) +#define C_008008_RLC_BUSY 0xFEFFFFFF +#define S_008008_TC_BUSY(x) (((x) & 0x1) << 25) +#define G_008008_TC_BUSY(x) (((x) >> 25) & 0x1) +#define C_008008_TC_BUSY 0xFDFFFFFF +#define S_008008_TCC_CC_RESIDENT(x) (((x) & 0x1) << 26) +#define G_008008_TCC_CC_RESIDENT(x) (((x) >> 26) & 0x1) +#define C_008008_TCC_CC_RESIDENT 0xFBFFFFFF +#define S_008008_CPF_BUSY(x) (((x) & 0x1) << 28) +#define G_008008_CPF_BUSY(x) (((x) >> 28) & 0x1) +#define C_008008_CPF_BUSY 0xEFFFFFFF +#define S_008008_CPC_BUSY(x) (((x) & 0x1) << 29) +#define G_008008_CPC_BUSY(x) (((x) >> 29) & 0x1) +#define C_008008_CPC_BUSY 0xDFFFFFFF +#define S_008008_CPG_BUSY(x) (((x) & 0x1) << 30) +#define G_008008_CPG_BUSY(x) (((x) >> 30) & 0x1) +#define C_008008_CPG_BUSY 0xBFFFFFFF +#define R_008010_GRBM_STATUS 0x008010 +#define S_008010_ME0PIPE0_CMDFIFO_AVAIL(x) (((x) & 0x0F) << 0) +#define G_008010_ME0PIPE0_CMDFIFO_AVAIL(x) (((x) >> 0) & 0x0F) +#define C_008010_ME0PIPE0_CMDFIFO_AVAIL 0xFFFFFFF0 +#define S_008010_SRBM_RQ_PENDING(x) (((x) & 0x1) << 5) +#define G_008010_SRBM_RQ_PENDING(x) (((x) >> 5) & 0x1) +#define C_008010_SRBM_RQ_PENDING 0xFFFFFFDF +#define S_008010_ME0PIPE0_CF_RQ_PENDING(x) (((x) & 0x1) << 7) +#define G_008010_ME0PIPE0_CF_RQ_PENDING(x) (((x) >> 7) & 0x1) +#define C_008010_ME0PIPE0_CF_RQ_PENDING 0xFFFFFF7F +#define S_008010_ME0PIPE0_PF_RQ_PENDING(x) (((x) & 0x1) << 8) +#define G_008010_ME0PIPE0_PF_RQ_PENDING(x) (((x) >> 8) & 0x1) +#define C_008010_ME0PIPE0_PF_RQ_PENDING 0xFFFFFEFF +#define S_008010_GDS_DMA_RQ_PENDING(x) (((x) & 0x1) << 9) +#define G_008010_GDS_DMA_RQ_PENDING(x) (((x) >> 9) & 0x1) +#define C_008010_GDS_DMA_RQ_PENDING 0xFFFFFDFF +#define S_008010_DB_CLEAN(x) (((x) & 0x1) << 12) +#define G_008010_DB_CLEAN(x) (((x) >> 12) & 0x1) +#define C_008010_DB_CLEAN 0xFFFFEFFF +#define S_008010_CB_CLEAN(x) (((x) & 0x1) << 13) +#define G_008010_CB_CLEAN(x) (((x) >> 13) & 0x1) +#define C_008010_CB_CLEAN 0xFFFFDFFF +#define S_008010_TA_BUSY(x) (((x) & 0x1) << 14) +#define G_008010_TA_BUSY(x) (((x) >> 14) & 0x1) +#define C_008010_TA_BUSY 0xFFFFBFFF +#define S_008010_GDS_BUSY(x) (((x) & 0x1) << 15) +#define G_008010_GDS_BUSY(x) (((x) >> 15) & 0x1) +#define C_008010_GDS_BUSY 0xFFFF7FFF +#define S_008010_WD_BUSY_NO_DMA(x) (((x) & 0x1) << 16) +#define G_008010_WD_BUSY_NO_DMA(x) (((x) >> 16) & 0x1) +#define C_008010_WD_BUSY_NO_DMA 0xFFFEFFFF +#define S_008010_VGT_BUSY(x) (((x) & 0x1) << 17) +#define G_008010_VGT_BUSY(x) (((x) >> 17) & 0x1) +#define C_008010_VGT_BUSY 0xFFFDFFFF +#define S_008010_IA_BUSY_NO_DMA(x) (((x) & 0x1) << 18) +#define G_008010_IA_BUSY_NO_DMA(x) (((x) >> 18) & 0x1) +#define C_008010_IA_BUSY_NO_DMA 0xFFFBFFFF +#define S_008010_IA_BUSY(x) (((x) & 0x1) << 19) +#define G_008010_IA_BUSY(x) 
(((x) >> 19) & 0x1) +#define C_008010_IA_BUSY 0xFFF7FFFF +#define S_008010_SX_BUSY(x) (((x) & 0x1) << 20) +#define G_008010_SX_BUSY(x) (((x) >> 20) & 0x1) +#define C_008010_SX_BUSY 0xFFEFFFFF +#define S_008010_WD_BUSY(x) (((x) & 0x1) << 21) +#define G_008010_WD_BUSY(x) (((x) >> 21) & 0x1) +#define C_008010_WD_BUSY 0xFFDFFFFF +#define S_008010_SPI_BUSY(x) (((x) & 0x1) << 22) +#define G_008010_SPI_BUSY(x) (((x) >> 22) & 0x1) +#define C_008010_SPI_BUSY 0xFFBFFFFF +#define S_008010_BCI_BUSY(x) (((x) & 0x1) << 23) +#define G_008010_BCI_BUSY(x) (((x) >> 23) & 0x1) +#define C_008010_BCI_BUSY 0xFF7FFFFF +#define S_008010_SC_BUSY(x) (((x) & 0x1) << 24) +#define G_008010_SC_BUSY(x) (((x) >> 24) & 0x1) +#define C_008010_SC_BUSY 0xFEFFFFFF +#define S_008010_PA_BUSY(x) (((x) & 0x1) << 25) +#define G_008010_PA_BUSY(x) (((x) >> 25) & 0x1) +#define C_008010_PA_BUSY 0xFDFFFFFF +#define S_008010_DB_BUSY(x) (((x) & 0x1) << 26) +#define G_008010_DB_BUSY(x) (((x) >> 26) & 0x1) +#define C_008010_DB_BUSY 0xFBFFFFFF +#define S_008010_CP_COHERENCY_BUSY(x) (((x) & 0x1) << 28) +#define G_008010_CP_COHERENCY_BUSY(x) (((x) >> 28) & 0x1) +#define C_008010_CP_COHERENCY_BUSY 0xEFFFFFFF +#define S_008010_CP_BUSY(x) (((x) & 0x1) << 29) +#define G_008010_CP_BUSY(x) (((x) >> 29) & 0x1) +#define C_008010_CP_BUSY 0xDFFFFFFF +#define S_008010_CB_BUSY(x) (((x) & 0x1) << 30) +#define G_008010_CB_BUSY(x) (((x) >> 30) & 0x1) +#define C_008010_CB_BUSY 0xBFFFFFFF +#define S_008010_GUI_ACTIVE(x) (((x) & 0x1) << 31) +#define G_008010_GUI_ACTIVE(x) (((x) >> 31) & 0x1) +#define C_008010_GUI_ACTIVE 0x7FFFFFFF #define GRBM_GFX_INDEX 0x802C #define INSTANCE_INDEX(x) ((x) << 0) #define SH_INDEX(x) ((x) << 8) @@ -276,12 +668,155 @@ #define C_0085F0_SH_ICACHE_ACTION_ENA 0xDFFFFFFF #define R_0085F4_CP_COHER_SIZE 0x0085F4 #define R_0085F8_CP_COHER_BASE 0x0085F8 - +#define R_008014_GRBM_STATUS_SE0 0x008014 +#define S_008014_DB_CLEAN(x) (((x) & 0x1) << 1) +#define G_008014_DB_CLEAN(x) (((x) >> 1) & 0x1) +#define C_008014_DB_CLEAN 0xFFFFFFFD +#define S_008014_CB_CLEAN(x) (((x) & 0x1) << 2) +#define G_008014_CB_CLEAN(x) (((x) >> 2) & 0x1) +#define C_008014_CB_CLEAN 0xFFFFFFFB +#define S_008014_BCI_BUSY(x) (((x) & 0x1) << 22) +#define G_008014_BCI_BUSY(x) (((x) >> 22) & 0x1) +#define C_008014_BCI_BUSY 0xFFBFFFFF +#define S_008014_VGT_BUSY(x) (((x) & 0x1) << 23) +#define G_008014_VGT_BUSY(x) (((x) >> 23) & 0x1) +#define C_008014_VGT_BUSY 0xFF7FFFFF +#define S_008014_PA_BUSY(x) (((x) & 0x1) << 24) +#define G_008014_PA_BUSY(x) (((x) >> 24) & 0x1) +#define C_008014_PA_BUSY 0xFEFFFFFF +#define S_008014_TA_BUSY(x) (((x) & 0x1) << 25) +#define G_008014_TA_BUSY(x) (((x) >> 25) & 0x1) +#define C_008014_TA_BUSY 0xFDFFFFFF +#define S_008014_SX_BUSY(x) (((x) & 0x1) << 26) +#define G_008014_SX_BUSY(x) (((x) >> 26) & 0x1) +#define C_008014_SX_BUSY 0xFBFFFFFF +#define S_008014_SPI_BUSY(x) (((x) & 0x1) << 27) +#define G_008014_SPI_BUSY(x) (((x) >> 27) & 0x1) +#define C_008014_SPI_BUSY 0xF7FFFFFF +#define S_008014_SC_BUSY(x) (((x) & 0x1) << 29) +#define G_008014_SC_BUSY(x) (((x) >> 29) & 0x1) +#define C_008014_SC_BUSY 0xDFFFFFFF +#define S_008014_DB_BUSY(x) (((x) & 0x1) << 30) +#define G_008014_DB_BUSY(x) (((x) >> 30) & 0x1) +#define C_008014_DB_BUSY 0xBFFFFFFF +#define S_008014_CB_BUSY(x) (((x) & 0x1) << 31) +#define G_008014_CB_BUSY(x) (((x) >> 31) & 0x1) +#define C_008014_CB_BUSY 0x7FFFFFFF +#define R_008018_GRBM_STATUS_SE1 0x008018 +#define S_008018_DB_CLEAN(x) (((x) & 0x1) << 1) +#define G_008018_DB_CLEAN(x) (((x) >> 1) & 0x1) +#define C_008018_DB_CLEAN 
0xFFFFFFFD +#define S_008018_CB_CLEAN(x) (((x) & 0x1) << 2) +#define G_008018_CB_CLEAN(x) (((x) >> 2) & 0x1) +#define C_008018_CB_CLEAN 0xFFFFFFFB +#define S_008018_BCI_BUSY(x) (((x) & 0x1) << 22) +#define G_008018_BCI_BUSY(x) (((x) >> 22) & 0x1) +#define C_008018_BCI_BUSY 0xFFBFFFFF +#define S_008018_VGT_BUSY(x) (((x) & 0x1) << 23) +#define G_008018_VGT_BUSY(x) (((x) >> 23) & 0x1) +#define C_008018_VGT_BUSY 0xFF7FFFFF +#define S_008018_PA_BUSY(x) (((x) & 0x1) << 24) +#define G_008018_PA_BUSY(x) (((x) >> 24) & 0x1) +#define C_008018_PA_BUSY 0xFEFFFFFF +#define S_008018_TA_BUSY(x) (((x) & 0x1) << 25) +#define G_008018_TA_BUSY(x) (((x) >> 25) & 0x1) +#define C_008018_TA_BUSY 0xFDFFFFFF +#define S_008018_SX_BUSY(x) (((x) & 0x1) << 26) +#define G_008018_SX_BUSY(x) (((x) >> 26) & 0x1) +#define C_008018_SX_BUSY 0xFBFFFFFF +#define S_008018_SPI_BUSY(x) (((x) & 0x1) << 27) +#define G_008018_SPI_BUSY(x) (((x) >> 27) & 0x1) +#define C_008018_SPI_BUSY 0xF7FFFFFF +#define S_008018_SC_BUSY(x) (((x) & 0x1) << 29) +#define G_008018_SC_BUSY(x) (((x) >> 29) & 0x1) +#define C_008018_SC_BUSY 0xDFFFFFFF +#define S_008018_DB_BUSY(x) (((x) & 0x1) << 30) +#define G_008018_DB_BUSY(x) (((x) >> 30) & 0x1) +#define C_008018_DB_BUSY 0xBFFFFFFF +#define S_008018_CB_BUSY(x) (((x) & 0x1) << 31) +#define G_008018_CB_BUSY(x) (((x) >> 31) & 0x1) +#define C_008018_CB_BUSY 0x7FFFFFFF +#define R_008038_GRBM_STATUS_SE2 0x008038 +#define S_008038_DB_CLEAN(x) (((x) & 0x1) << 1) +#define G_008038_DB_CLEAN(x) (((x) >> 1) & 0x1) +#define C_008038_DB_CLEAN 0xFFFFFFFD +#define S_008038_CB_CLEAN(x) (((x) & 0x1) << 2) +#define G_008038_CB_CLEAN(x) (((x) >> 2) & 0x1) +#define C_008038_CB_CLEAN 0xFFFFFFFB +#define S_008038_BCI_BUSY(x) (((x) & 0x1) << 22) +#define G_008038_BCI_BUSY(x) (((x) >> 22) & 0x1) +#define C_008038_BCI_BUSY 0xFFBFFFFF +#define S_008038_VGT_BUSY(x) (((x) & 0x1) << 23) +#define G_008038_VGT_BUSY(x) (((x) >> 23) & 0x1) +#define C_008038_VGT_BUSY 0xFF7FFFFF +#define S_008038_PA_BUSY(x) (((x) & 0x1) << 24) +#define G_008038_PA_BUSY(x) (((x) >> 24) & 0x1) +#define C_008038_PA_BUSY 0xFEFFFFFF +#define S_008038_TA_BUSY(x) (((x) & 0x1) << 25) +#define G_008038_TA_BUSY(x) (((x) >> 25) & 0x1) +#define C_008038_TA_BUSY 0xFDFFFFFF +#define S_008038_SX_BUSY(x) (((x) & 0x1) << 26) +#define G_008038_SX_BUSY(x) (((x) >> 26) & 0x1) +#define C_008038_SX_BUSY 0xFBFFFFFF +#define S_008038_SPI_BUSY(x) (((x) & 0x1) << 27) +#define G_008038_SPI_BUSY(x) (((x) >> 27) & 0x1) +#define C_008038_SPI_BUSY 0xF7FFFFFF +#define S_008038_SC_BUSY(x) (((x) & 0x1) << 29) +#define G_008038_SC_BUSY(x) (((x) >> 29) & 0x1) +#define C_008038_SC_BUSY 0xDFFFFFFF +#define S_008038_DB_BUSY(x) (((x) & 0x1) << 30) +#define G_008038_DB_BUSY(x) (((x) >> 30) & 0x1) +#define C_008038_DB_BUSY 0xBFFFFFFF +#define S_008038_CB_BUSY(x) (((x) & 0x1) << 31) +#define G_008038_CB_BUSY(x) (((x) >> 31) & 0x1) +#define C_008038_CB_BUSY 0x7FFFFFFF +#define R_00803C_GRBM_STATUS_SE3 0x00803C +#define S_00803C_DB_CLEAN(x) (((x) & 0x1) << 1) +#define G_00803C_DB_CLEAN(x) (((x) >> 1) & 0x1) +#define C_00803C_DB_CLEAN 0xFFFFFFFD +#define S_00803C_CB_CLEAN(x) (((x) & 0x1) << 2) +#define G_00803C_CB_CLEAN(x) (((x) >> 2) & 0x1) +#define C_00803C_CB_CLEAN 0xFFFFFFFB +#define S_00803C_BCI_BUSY(x) (((x) & 0x1) << 22) +#define G_00803C_BCI_BUSY(x) (((x) >> 22) & 0x1) +#define C_00803C_BCI_BUSY 0xFFBFFFFF +#define S_00803C_VGT_BUSY(x) (((x) & 0x1) << 23) +#define G_00803C_VGT_BUSY(x) (((x) >> 23) & 0x1) +#define C_00803C_VGT_BUSY 0xFF7FFFFF +#define S_00803C_PA_BUSY(x) (((x) & 0x1) << 24) 
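/* Editor's note: GRBM_STATUS_SE0..SE3 repeat one field layout per shader
 * engine, so an idle check can loop over the instances and reuse the SE0
 * getters, since the bit positions are identical in all four registers.
 * A sketch under that assumption (read_reg() and num_se are hypothetical):
 *
 *   static const unsigned grbm_status_se[] = {
 *       R_008014_GRBM_STATUS_SE0, R_008018_GRBM_STATUS_SE1,
 *       R_008038_GRBM_STATUS_SE2, R_00803C_GRBM_STATUS_SE3,
 *   };
 *
 *   bool busy = false;
 *   for (unsigned se = 0; se < num_se; se++) {
 *       uint32_t v = read_reg(grbm_status_se[se]);   // hypothetical read
 *       busy |= G_008014_SPI_BUSY(v) || G_008014_SC_BUSY(v);
 *   }
 */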
+#define G_00803C_PA_BUSY(x) (((x) >> 24) & 0x1) +#define C_00803C_PA_BUSY 0xFEFFFFFF +#define S_00803C_TA_BUSY(x) (((x) & 0x1) << 25) +#define G_00803C_TA_BUSY(x) (((x) >> 25) & 0x1) +#define C_00803C_TA_BUSY 0xFDFFFFFF +#define S_00803C_SX_BUSY(x) (((x) & 0x1) << 26) +#define G_00803C_SX_BUSY(x) (((x) >> 26) & 0x1) +#define C_00803C_SX_BUSY 0xFBFFFFFF +#define S_00803C_SPI_BUSY(x) (((x) & 0x1) << 27) +#define G_00803C_SPI_BUSY(x) (((x) >> 27) & 0x1) +#define C_00803C_SPI_BUSY 0xF7FFFFFF +#define S_00803C_SC_BUSY(x) (((x) & 0x1) << 29) +#define G_00803C_SC_BUSY(x) (((x) >> 29) & 0x1) +#define C_00803C_SC_BUSY 0xDFFFFFFF +#define S_00803C_DB_BUSY(x) (((x) & 0x1) << 30) +#define G_00803C_DB_BUSY(x) (((x) >> 30) & 0x1) +#define C_00803C_DB_BUSY 0xBFFFFFFF +#define S_00803C_CB_BUSY(x) (((x) & 0x1) << 31) +#define G_00803C_CB_BUSY(x) (((x) >> 31) & 0x1) +#define C_00803C_CB_BUSY 0x7FFFFFFF /* CIK */ +#define R_0300FC_CP_STRMOUT_CNTL 0x0300FC +#define S_0300FC_OFFSET_UPDATE_DONE(x) (((x) & 0x1) << 0) +#define G_0300FC_OFFSET_UPDATE_DONE(x) (((x) >> 0) & 0x1) +#define C_0300FC_OFFSET_UPDATE_DONE 0xFFFFFFFE #define R_0301E4_CP_COHER_BASE_HI 0x0301E4 #define S_0301E4_COHER_BASE_HI_256B(x) (((x) & 0xFF) << 0) #define G_0301E4_COHER_BASE_HI_256B(x) (((x) >> 0) & 0xFF) #define C_0301E4_COHER_BASE_HI_256B 0xFFFFFF00 +#define R_0301EC_CP_COHER_START_DELAY 0x0301EC +#define S_0301EC_START_DELAY_COUNT(x) (((x) & 0x3F) << 0) +#define G_0301EC_START_DELAY_COUNT(x) (((x) >> 0) & 0x3F) +#define C_0301EC_START_DELAY_COUNT 0xFFFFFFC0 #define R_0301F0_CP_COHER_CNTL 0x0301F0 #define S_0301F0_DEST_BASE_0_ENA(x) (((x) & 0x1) << 0) #define G_0301F0_DEST_BASE_0_ENA(x) (((x) >> 0) & 0x1) @@ -289,6 +824,14 @@ #define S_0301F0_DEST_BASE_1_ENA(x) (((x) & 0x1) << 1) #define G_0301F0_DEST_BASE_1_ENA(x) (((x) >> 1) & 0x1) #define C_0301F0_DEST_BASE_1_ENA 0xFFFFFFFD +/* VI */ +#define S_0301F0_TC_SD_ACTION_ENA(x) (((x) & 0x1) << 2) +#define G_0301F0_TC_SD_ACTION_ENA(x) (((x) >> 2) & 0x1) +#define C_0301F0_TC_SD_ACTION_ENA 0xFFFFFFFB +#define S_0301F0_TC_NC_ACTION_ENA(x) (((x) & 0x1) << 3) +#define G_0301F0_TC_NC_ACTION_ENA(x) (((x) >> 3) & 0x1) +#define C_0301F0_TC_NC_ACTION_ENA 0xFFFFFFF7 +/* */ #define S_0301F0_CB0_DEST_BASE_ENA(x) (((x) & 0x1) << 6) #define G_0301F0_CB0_DEST_BASE_ENA(x) (((x) >> 6) & 0x1) #define C_0301F0_CB0_DEST_BASE_ENA 0xFFFFFFBF @@ -319,7 +862,7 @@ #define S_0301F0_TCL1_VOL_ACTION_ENA(x) (((x) & 0x1) << 15) #define G_0301F0_TCL1_VOL_ACTION_ENA(x) (((x) >> 15) & 0x1) #define C_0301F0_TCL1_VOL_ACTION_ENA 0xFFFF7FFF -#define S_0301F0_TC_VOL_ACTION_ENA(x) (((x) & 0x1) << 16) +#define S_0301F0_TC_VOL_ACTION_ENA(x) (((x) & 0x1) << 16) /* not on VI */ #define G_0301F0_TC_VOL_ACTION_ENA(x) (((x) >> 16) & 0x1) #define C_0301F0_TC_VOL_ACTION_ENA 0xFFFEFFFF #define S_0301F0_TC_WB_ACTION_ENA(x) (((x) & 0x1) << 18) @@ -352,8 +895,389 @@ #define S_0301F0_SH_ICACHE_ACTION_ENA(x) (((x) & 0x1) << 29) #define G_0301F0_SH_ICACHE_ACTION_ENA(x) (((x) >> 29) & 0x1) #define C_0301F0_SH_ICACHE_ACTION_ENA 0xDFFFFFFF +/* VI */ +#define S_0301F0_SH_KCACHE_WB_ACTION_ENA(x) (((x) & 0x1) << 30) +#define G_0301F0_SH_KCACHE_WB_ACTION_ENA(x) (((x) >> 30) & 0x1) +#define C_0301F0_SH_KCACHE_WB_ACTION_ENA 0xBFFFFFFF +#define S_0301F0_SH_SD_ACTION_ENA(x) (((x) & 0x1) << 31) +#define G_0301F0_SH_SD_ACTION_ENA(x) (((x) >> 31) & 0x1) +#define C_0301F0_SH_SD_ACTION_ENA 0x7FFFFFFF +/* */ #define R_0301F4_CP_COHER_SIZE 0x0301F4 #define R_0301F8_CP_COHER_BASE 0x0301F8 +#define R_0301FC_CP_COHER_STATUS 0x0301FC +#define 
S_0301FC_MATCHING_GFX_CNTX(x) (((x) & 0xFF) << 0) +#define G_0301FC_MATCHING_GFX_CNTX(x) (((x) >> 0) & 0xFF) +#define C_0301FC_MATCHING_GFX_CNTX 0xFFFFFF00 +#define S_0301FC_MEID(x) (((x) & 0x03) << 24) +#define G_0301FC_MEID(x) (((x) >> 24) & 0x03) +#define C_0301FC_MEID 0xFCFFFFFF +#define S_0301FC_PHASE1_STATUS(x) (((x) & 0x1) << 30) +#define G_0301FC_PHASE1_STATUS(x) (((x) >> 30) & 0x1) +#define C_0301FC_PHASE1_STATUS 0xBFFFFFFF +#define S_0301FC_STATUS(x) (((x) & 0x1) << 31) +#define G_0301FC_STATUS(x) (((x) >> 31) & 0x1) +#define C_0301FC_STATUS 0x7FFFFFFF +#define R_008210_CP_CPC_STATUS 0x008210 +#define S_008210_MEC1_BUSY(x) (((x) & 0x1) << 0) +#define G_008210_MEC1_BUSY(x) (((x) >> 0) & 0x1) +#define C_008210_MEC1_BUSY 0xFFFFFFFE +#define S_008210_MEC2_BUSY(x) (((x) & 0x1) << 1) +#define G_008210_MEC2_BUSY(x) (((x) >> 1) & 0x1) +#define C_008210_MEC2_BUSY 0xFFFFFFFD +#define S_008210_DC0_BUSY(x) (((x) & 0x1) << 2) +#define G_008210_DC0_BUSY(x) (((x) >> 2) & 0x1) +#define C_008210_DC0_BUSY 0xFFFFFFFB +#define S_008210_DC1_BUSY(x) (((x) & 0x1) << 3) +#define G_008210_DC1_BUSY(x) (((x) >> 3) & 0x1) +#define C_008210_DC1_BUSY 0xFFFFFFF7 +#define S_008210_RCIU1_BUSY(x) (((x) & 0x1) << 4) +#define G_008210_RCIU1_BUSY(x) (((x) >> 4) & 0x1) +#define C_008210_RCIU1_BUSY 0xFFFFFFEF +#define S_008210_RCIU2_BUSY(x) (((x) & 0x1) << 5) +#define G_008210_RCIU2_BUSY(x) (((x) >> 5) & 0x1) +#define C_008210_RCIU2_BUSY 0xFFFFFFDF +#define S_008210_ROQ1_BUSY(x) (((x) & 0x1) << 6) +#define G_008210_ROQ1_BUSY(x) (((x) >> 6) & 0x1) +#define C_008210_ROQ1_BUSY 0xFFFFFFBF +#define S_008210_ROQ2_BUSY(x) (((x) & 0x1) << 7) +#define G_008210_ROQ2_BUSY(x) (((x) >> 7) & 0x1) +#define C_008210_ROQ2_BUSY 0xFFFFFF7F +#define S_008210_TCIU_BUSY(x) (((x) & 0x1) << 10) +#define G_008210_TCIU_BUSY(x) (((x) >> 10) & 0x1) +#define C_008210_TCIU_BUSY 0xFFFFFBFF +#define S_008210_SCRATCH_RAM_BUSY(x) (((x) & 0x1) << 11) +#define G_008210_SCRATCH_RAM_BUSY(x) (((x) >> 11) & 0x1) +#define C_008210_SCRATCH_RAM_BUSY 0xFFFFF7FF +#define S_008210_QU_BUSY(x) (((x) & 0x1) << 12) +#define G_008210_QU_BUSY(x) (((x) >> 12) & 0x1) +#define C_008210_QU_BUSY 0xFFFFEFFF +#define S_008210_ATCL2IU_BUSY(x) (((x) & 0x1) << 13) +#define G_008210_ATCL2IU_BUSY(x) (((x) >> 13) & 0x1) +#define C_008210_ATCL2IU_BUSY 0xFFFFDFFF +#define S_008210_CPG_CPC_BUSY(x) (((x) & 0x1) << 29) +#define G_008210_CPG_CPC_BUSY(x) (((x) >> 29) & 0x1) +#define C_008210_CPG_CPC_BUSY 0xDFFFFFFF +#define S_008210_CPF_CPC_BUSY(x) (((x) & 0x1) << 30) +#define G_008210_CPF_CPC_BUSY(x) (((x) >> 30) & 0x1) +#define C_008210_CPF_CPC_BUSY 0xBFFFFFFF +#define S_008210_CPC_BUSY(x) (((x) & 0x1) << 31) +#define G_008210_CPC_BUSY(x) (((x) >> 31) & 0x1) +#define C_008210_CPC_BUSY 0x7FFFFFFF +#define R_008214_CP_CPC_BUSY_STAT 0x008214 +#define S_008214_MEC1_LOAD_BUSY(x) (((x) & 0x1) << 0) +#define G_008214_MEC1_LOAD_BUSY(x) (((x) >> 0) & 0x1) +#define C_008214_MEC1_LOAD_BUSY 0xFFFFFFFE +#define S_008214_MEC1_SEMAPOHRE_BUSY(x) (((x) & 0x1) << 1) +#define G_008214_MEC1_SEMAPOHRE_BUSY(x) (((x) >> 1) & 0x1) +#define C_008214_MEC1_SEMAPOHRE_BUSY 0xFFFFFFFD +#define S_008214_MEC1_MUTEX_BUSY(x) (((x) & 0x1) << 2) +#define G_008214_MEC1_MUTEX_BUSY(x) (((x) >> 2) & 0x1) +#define C_008214_MEC1_MUTEX_BUSY 0xFFFFFFFB +#define S_008214_MEC1_MESSAGE_BUSY(x) (((x) & 0x1) << 3) +#define G_008214_MEC1_MESSAGE_BUSY(x) (((x) >> 3) & 0x1) +#define C_008214_MEC1_MESSAGE_BUSY 0xFFFFFFF7 +#define S_008214_MEC1_EOP_QUEUE_BUSY(x) (((x) & 0x1) << 4) +#define G_008214_MEC1_EOP_QUEUE_BUSY(x) (((x) >> 4) & 0x1) 
+#define C_008214_MEC1_EOP_QUEUE_BUSY 0xFFFFFFEF +#define S_008214_MEC1_IQ_QUEUE_BUSY(x) (((x) & 0x1) << 5) +#define G_008214_MEC1_IQ_QUEUE_BUSY(x) (((x) >> 5) & 0x1) +#define C_008214_MEC1_IQ_QUEUE_BUSY 0xFFFFFFDF +#define S_008214_MEC1_IB_QUEUE_BUSY(x) (((x) & 0x1) << 6) +#define G_008214_MEC1_IB_QUEUE_BUSY(x) (((x) >> 6) & 0x1) +#define C_008214_MEC1_IB_QUEUE_BUSY 0xFFFFFFBF +#define S_008214_MEC1_TC_BUSY(x) (((x) & 0x1) << 7) +#define G_008214_MEC1_TC_BUSY(x) (((x) >> 7) & 0x1) +#define C_008214_MEC1_TC_BUSY 0xFFFFFF7F +#define S_008214_MEC1_DMA_BUSY(x) (((x) & 0x1) << 8) +#define G_008214_MEC1_DMA_BUSY(x) (((x) >> 8) & 0x1) +#define C_008214_MEC1_DMA_BUSY 0xFFFFFEFF +#define S_008214_MEC1_PARTIAL_FLUSH_BUSY(x) (((x) & 0x1) << 9) +#define G_008214_MEC1_PARTIAL_FLUSH_BUSY(x) (((x) >> 9) & 0x1) +#define C_008214_MEC1_PARTIAL_FLUSH_BUSY 0xFFFFFDFF +#define S_008214_MEC1_PIPE0_BUSY(x) (((x) & 0x1) << 10) +#define G_008214_MEC1_PIPE0_BUSY(x) (((x) >> 10) & 0x1) +#define C_008214_MEC1_PIPE0_BUSY 0xFFFFFBFF +#define S_008214_MEC1_PIPE1_BUSY(x) (((x) & 0x1) << 11) +#define G_008214_MEC1_PIPE1_BUSY(x) (((x) >> 11) & 0x1) +#define C_008214_MEC1_PIPE1_BUSY 0xFFFFF7FF +#define S_008214_MEC1_PIPE2_BUSY(x) (((x) & 0x1) << 12) +#define G_008214_MEC1_PIPE2_BUSY(x) (((x) >> 12) & 0x1) +#define C_008214_MEC1_PIPE2_BUSY 0xFFFFEFFF +#define S_008214_MEC1_PIPE3_BUSY(x) (((x) & 0x1) << 13) +#define G_008214_MEC1_PIPE3_BUSY(x) (((x) >> 13) & 0x1) +#define C_008214_MEC1_PIPE3_BUSY 0xFFFFDFFF +#define S_008214_MEC2_LOAD_BUSY(x) (((x) & 0x1) << 16) +#define G_008214_MEC2_LOAD_BUSY(x) (((x) >> 16) & 0x1) +#define C_008214_MEC2_LOAD_BUSY 0xFFFEFFFF +#define S_008214_MEC2_SEMAPOHRE_BUSY(x) (((x) & 0x1) << 17) +#define G_008214_MEC2_SEMAPOHRE_BUSY(x) (((x) >> 17) & 0x1) +#define C_008214_MEC2_SEMAPOHRE_BUSY 0xFFFDFFFF +#define S_008214_MEC2_MUTEX_BUSY(x) (((x) & 0x1) << 18) +#define G_008214_MEC2_MUTEX_BUSY(x) (((x) >> 18) & 0x1) +#define C_008214_MEC2_MUTEX_BUSY 0xFFFBFFFF +#define S_008214_MEC2_MESSAGE_BUSY(x) (((x) & 0x1) << 19) +#define G_008214_MEC2_MESSAGE_BUSY(x) (((x) >> 19) & 0x1) +#define C_008214_MEC2_MESSAGE_BUSY 0xFFF7FFFF +#define S_008214_MEC2_EOP_QUEUE_BUSY(x) (((x) & 0x1) << 20) +#define G_008214_MEC2_EOP_QUEUE_BUSY(x) (((x) >> 20) & 0x1) +#define C_008214_MEC2_EOP_QUEUE_BUSY 0xFFEFFFFF +#define S_008214_MEC2_IQ_QUEUE_BUSY(x) (((x) & 0x1) << 21) +#define G_008214_MEC2_IQ_QUEUE_BUSY(x) (((x) >> 21) & 0x1) +#define C_008214_MEC2_IQ_QUEUE_BUSY 0xFFDFFFFF +#define S_008214_MEC2_IB_QUEUE_BUSY(x) (((x) & 0x1) << 22) +#define G_008214_MEC2_IB_QUEUE_BUSY(x) (((x) >> 22) & 0x1) +#define C_008214_MEC2_IB_QUEUE_BUSY 0xFFBFFFFF +#define S_008214_MEC2_TC_BUSY(x) (((x) & 0x1) << 23) +#define G_008214_MEC2_TC_BUSY(x) (((x) >> 23) & 0x1) +#define C_008214_MEC2_TC_BUSY 0xFF7FFFFF +#define S_008214_MEC2_DMA_BUSY(x) (((x) & 0x1) << 24) +#define G_008214_MEC2_DMA_BUSY(x) (((x) >> 24) & 0x1) +#define C_008214_MEC2_DMA_BUSY 0xFEFFFFFF +#define S_008214_MEC2_PARTIAL_FLUSH_BUSY(x) (((x) & 0x1) << 25) +#define G_008214_MEC2_PARTIAL_FLUSH_BUSY(x) (((x) >> 25) & 0x1) +#define C_008214_MEC2_PARTIAL_FLUSH_BUSY 0xFDFFFFFF +#define S_008214_MEC2_PIPE0_BUSY(x) (((x) & 0x1) << 26) +#define G_008214_MEC2_PIPE0_BUSY(x) (((x) >> 26) & 0x1) +#define C_008214_MEC2_PIPE0_BUSY 0xFBFFFFFF +#define S_008214_MEC2_PIPE1_BUSY(x) (((x) & 0x1) << 27) +#define G_008214_MEC2_PIPE1_BUSY(x) (((x) >> 27) & 0x1) +#define C_008214_MEC2_PIPE1_BUSY 0xF7FFFFFF +#define S_008214_MEC2_PIPE2_BUSY(x) (((x) & 0x1) << 28) +#define G_008214_MEC2_PIPE2_BUSY(x) 
(((x) >> 28) & 0x1) +#define C_008214_MEC2_PIPE2_BUSY 0xEFFFFFFF +#define S_008214_MEC2_PIPE3_BUSY(x) (((x) & 0x1) << 29) +#define G_008214_MEC2_PIPE3_BUSY(x) (((x) >> 29) & 0x1) +#define C_008214_MEC2_PIPE3_BUSY 0xDFFFFFFF +#define R_008218_CP_CPC_STALLED_STAT1 0x008218 +#define S_008218_RCIU_TX_FREE_STALL(x) (((x) & 0x1) << 3) +#define G_008218_RCIU_TX_FREE_STALL(x) (((x) >> 3) & 0x1) +#define C_008218_RCIU_TX_FREE_STALL 0xFFFFFFF7 +#define S_008218_RCIU_PRIV_VIOLATION(x) (((x) & 0x1) << 4) +#define G_008218_RCIU_PRIV_VIOLATION(x) (((x) >> 4) & 0x1) +#define C_008218_RCIU_PRIV_VIOLATION 0xFFFFFFEF +#define S_008218_TCIU_TX_FREE_STALL(x) (((x) & 0x1) << 6) +#define G_008218_TCIU_TX_FREE_STALL(x) (((x) >> 6) & 0x1) +#define C_008218_TCIU_TX_FREE_STALL 0xFFFFFFBF +#define S_008218_MEC1_DECODING_PACKET(x) (((x) & 0x1) << 8) +#define G_008218_MEC1_DECODING_PACKET(x) (((x) >> 8) & 0x1) +#define C_008218_MEC1_DECODING_PACKET 0xFFFFFEFF +#define S_008218_MEC1_WAIT_ON_RCIU(x) (((x) & 0x1) << 9) +#define G_008218_MEC1_WAIT_ON_RCIU(x) (((x) >> 9) & 0x1) +#define C_008218_MEC1_WAIT_ON_RCIU 0xFFFFFDFF +#define S_008218_MEC1_WAIT_ON_RCIU_READ(x) (((x) & 0x1) << 10) +#define G_008218_MEC1_WAIT_ON_RCIU_READ(x) (((x) >> 10) & 0x1) +#define C_008218_MEC1_WAIT_ON_RCIU_READ 0xFFFFFBFF +#define S_008218_MEC1_WAIT_ON_ROQ_DATA(x) (((x) & 0x1) << 13) +#define G_008218_MEC1_WAIT_ON_ROQ_DATA(x) (((x) >> 13) & 0x1) +#define C_008218_MEC1_WAIT_ON_ROQ_DATA 0xFFFFDFFF +#define S_008218_MEC2_DECODING_PACKET(x) (((x) & 0x1) << 16) +#define G_008218_MEC2_DECODING_PACKET(x) (((x) >> 16) & 0x1) +#define C_008218_MEC2_DECODING_PACKET 0xFFFEFFFF +#define S_008218_MEC2_WAIT_ON_RCIU(x) (((x) & 0x1) << 17) +#define G_008218_MEC2_WAIT_ON_RCIU(x) (((x) >> 17) & 0x1) +#define C_008218_MEC2_WAIT_ON_RCIU 0xFFFDFFFF +#define S_008218_MEC2_WAIT_ON_RCIU_READ(x) (((x) & 0x1) << 18) +#define G_008218_MEC2_WAIT_ON_RCIU_READ(x) (((x) >> 18) & 0x1) +#define C_008218_MEC2_WAIT_ON_RCIU_READ 0xFFFBFFFF +#define S_008218_MEC2_WAIT_ON_ROQ_DATA(x) (((x) & 0x1) << 21) +#define G_008218_MEC2_WAIT_ON_ROQ_DATA(x) (((x) >> 21) & 0x1) +#define C_008218_MEC2_WAIT_ON_ROQ_DATA 0xFFDFFFFF +#define S_008218_ATCL2IU_WAITING_ON_FREE(x) (((x) & 0x1) << 22) +#define G_008218_ATCL2IU_WAITING_ON_FREE(x) (((x) >> 22) & 0x1) +#define C_008218_ATCL2IU_WAITING_ON_FREE 0xFFBFFFFF +#define S_008218_ATCL2IU_WAITING_ON_TAGS(x) (((x) & 0x1) << 23) +#define G_008218_ATCL2IU_WAITING_ON_TAGS(x) (((x) >> 23) & 0x1) +#define C_008218_ATCL2IU_WAITING_ON_TAGS 0xFF7FFFFF +#define S_008218_ATCL1_WAITING_ON_TRANS(x) (((x) & 0x1) << 24) +#define G_008218_ATCL1_WAITING_ON_TRANS(x) (((x) >> 24) & 0x1) +#define C_008218_ATCL1_WAITING_ON_TRANS 0xFEFFFFFF +#define R_00821C_CP_CPF_STATUS 0x00821C +#define S_00821C_POST_WPTR_GFX_BUSY(x) (((x) & 0x1) << 0) +#define G_00821C_POST_WPTR_GFX_BUSY(x) (((x) >> 0) & 0x1) +#define C_00821C_POST_WPTR_GFX_BUSY 0xFFFFFFFE +#define S_00821C_CSF_BUSY(x) (((x) & 0x1) << 1) +#define G_00821C_CSF_BUSY(x) (((x) >> 1) & 0x1) +#define C_00821C_CSF_BUSY 0xFFFFFFFD +#define S_00821C_ROQ_ALIGN_BUSY(x) (((x) & 0x1) << 4) +#define G_00821C_ROQ_ALIGN_BUSY(x) (((x) >> 4) & 0x1) +#define C_00821C_ROQ_ALIGN_BUSY 0xFFFFFFEF +#define S_00821C_ROQ_RING_BUSY(x) (((x) & 0x1) << 5) +#define G_00821C_ROQ_RING_BUSY(x) (((x) >> 5) & 0x1) +#define C_00821C_ROQ_RING_BUSY 0xFFFFFFDF +#define S_00821C_ROQ_INDIRECT1_BUSY(x) (((x) & 0x1) << 6) +#define G_00821C_ROQ_INDIRECT1_BUSY(x) (((x) >> 6) & 0x1) +#define C_00821C_ROQ_INDIRECT1_BUSY 0xFFFFFFBF +#define 
S_00821C_ROQ_INDIRECT2_BUSY(x) (((x) & 0x1) << 7) +#define G_00821C_ROQ_INDIRECT2_BUSY(x) (((x) >> 7) & 0x1) +#define C_00821C_ROQ_INDIRECT2_BUSY 0xFFFFFF7F +#define S_00821C_ROQ_STATE_BUSY(x) (((x) & 0x1) << 8) +#define G_00821C_ROQ_STATE_BUSY(x) (((x) >> 8) & 0x1) +#define C_00821C_ROQ_STATE_BUSY 0xFFFFFEFF +#define S_00821C_ROQ_CE_RING_BUSY(x) (((x) & 0x1) << 9) +#define G_00821C_ROQ_CE_RING_BUSY(x) (((x) >> 9) & 0x1) +#define C_00821C_ROQ_CE_RING_BUSY 0xFFFFFDFF +#define S_00821C_ROQ_CE_INDIRECT1_BUSY(x) (((x) & 0x1) << 10) +#define G_00821C_ROQ_CE_INDIRECT1_BUSY(x) (((x) >> 10) & 0x1) +#define C_00821C_ROQ_CE_INDIRECT1_BUSY 0xFFFFFBFF +#define S_00821C_ROQ_CE_INDIRECT2_BUSY(x) (((x) & 0x1) << 11) +#define G_00821C_ROQ_CE_INDIRECT2_BUSY(x) (((x) >> 11) & 0x1) +#define C_00821C_ROQ_CE_INDIRECT2_BUSY 0xFFFFF7FF +#define S_00821C_SEMAPHORE_BUSY(x) (((x) & 0x1) << 12) +#define G_00821C_SEMAPHORE_BUSY(x) (((x) >> 12) & 0x1) +#define C_00821C_SEMAPHORE_BUSY 0xFFFFEFFF +#define S_00821C_INTERRUPT_BUSY(x) (((x) & 0x1) << 13) +#define G_00821C_INTERRUPT_BUSY(x) (((x) >> 13) & 0x1) +#define C_00821C_INTERRUPT_BUSY 0xFFFFDFFF +#define S_00821C_TCIU_BUSY(x) (((x) & 0x1) << 14) +#define G_00821C_TCIU_BUSY(x) (((x) >> 14) & 0x1) +#define C_00821C_TCIU_BUSY 0xFFFFBFFF +#define S_00821C_HQD_BUSY(x) (((x) & 0x1) << 15) +#define G_00821C_HQD_BUSY(x) (((x) >> 15) & 0x1) +#define C_00821C_HQD_BUSY 0xFFFF7FFF +#define S_00821C_PRT_BUSY(x) (((x) & 0x1) << 16) +#define G_00821C_PRT_BUSY(x) (((x) >> 16) & 0x1) +#define C_00821C_PRT_BUSY 0xFFFEFFFF +#define S_00821C_ATCL2IU_BUSY(x) (((x) & 0x1) << 17) +#define G_00821C_ATCL2IU_BUSY(x) (((x) >> 17) & 0x1) +#define C_00821C_ATCL2IU_BUSY 0xFFFDFFFF +#define S_00821C_CPF_GFX_BUSY(x) (((x) & 0x1) << 26) +#define G_00821C_CPF_GFX_BUSY(x) (((x) >> 26) & 0x1) +#define C_00821C_CPF_GFX_BUSY 0xFBFFFFFF +#define S_00821C_CPF_CMP_BUSY(x) (((x) & 0x1) << 27) +#define G_00821C_CPF_CMP_BUSY(x) (((x) >> 27) & 0x1) +#define C_00821C_CPF_CMP_BUSY 0xF7FFFFFF +#define S_00821C_GRBM_CPF_STAT_BUSY(x) (((x) & 0x03) << 28) +#define G_00821C_GRBM_CPF_STAT_BUSY(x) (((x) >> 28) & 0x03) +#define C_00821C_GRBM_CPF_STAT_BUSY 0xCFFFFFFF +#define S_00821C_CPC_CPF_BUSY(x) (((x) & 0x1) << 30) +#define G_00821C_CPC_CPF_BUSY(x) (((x) >> 30) & 0x1) +#define C_00821C_CPC_CPF_BUSY 0xBFFFFFFF +#define S_00821C_CPF_BUSY(x) (((x) & 0x1) << 31) +#define G_00821C_CPF_BUSY(x) (((x) >> 31) & 0x1) +#define C_00821C_CPF_BUSY 0x7FFFFFFF +#define R_008220_CP_CPF_BUSY_STAT 0x008220 +#define S_008220_REG_BUS_FIFO_BUSY(x) (((x) & 0x1) << 0) +#define G_008220_REG_BUS_FIFO_BUSY(x) (((x) >> 0) & 0x1) +#define C_008220_REG_BUS_FIFO_BUSY 0xFFFFFFFE +#define S_008220_CSF_RING_BUSY(x) (((x) & 0x1) << 1) +#define G_008220_CSF_RING_BUSY(x) (((x) >> 1) & 0x1) +#define C_008220_CSF_RING_BUSY 0xFFFFFFFD +#define S_008220_CSF_INDIRECT1_BUSY(x) (((x) & 0x1) << 2) +#define G_008220_CSF_INDIRECT1_BUSY(x) (((x) >> 2) & 0x1) +#define C_008220_CSF_INDIRECT1_BUSY 0xFFFFFFFB +#define S_008220_CSF_INDIRECT2_BUSY(x) (((x) & 0x1) << 3) +#define G_008220_CSF_INDIRECT2_BUSY(x) (((x) >> 3) & 0x1) +#define C_008220_CSF_INDIRECT2_BUSY 0xFFFFFFF7 +#define S_008220_CSF_STATE_BUSY(x) (((x) & 0x1) << 4) +#define G_008220_CSF_STATE_BUSY(x) (((x) >> 4) & 0x1) +#define C_008220_CSF_STATE_BUSY 0xFFFFFFEF +#define S_008220_CSF_CE_INDR1_BUSY(x) (((x) & 0x1) << 5) +#define G_008220_CSF_CE_INDR1_BUSY(x) (((x) >> 5) & 0x1) +#define C_008220_CSF_CE_INDR1_BUSY 0xFFFFFFDF +#define S_008220_CSF_CE_INDR2_BUSY(x) (((x) & 0x1) << 6) +#define 
G_008220_CSF_CE_INDR2_BUSY(x) (((x) >> 6) & 0x1) +#define C_008220_CSF_CE_INDR2_BUSY 0xFFFFFFBF +#define S_008220_CSF_ARBITER_BUSY(x) (((x) & 0x1) << 7) +#define G_008220_CSF_ARBITER_BUSY(x) (((x) >> 7) & 0x1) +#define C_008220_CSF_ARBITER_BUSY 0xFFFFFF7F +#define S_008220_CSF_INPUT_BUSY(x) (((x) & 0x1) << 8) +#define G_008220_CSF_INPUT_BUSY(x) (((x) >> 8) & 0x1) +#define C_008220_CSF_INPUT_BUSY 0xFFFFFEFF +#define S_008220_OUTSTANDING_READ_TAGS(x) (((x) & 0x1) << 9) +#define G_008220_OUTSTANDING_READ_TAGS(x) (((x) >> 9) & 0x1) +#define C_008220_OUTSTANDING_READ_TAGS 0xFFFFFDFF +#define S_008220_HPD_PROCESSING_EOP_BUSY(x) (((x) & 0x1) << 11) +#define G_008220_HPD_PROCESSING_EOP_BUSY(x) (((x) >> 11) & 0x1) +#define C_008220_HPD_PROCESSING_EOP_BUSY 0xFFFFF7FF +#define S_008220_HQD_DISPATCH_BUSY(x) (((x) & 0x1) << 12) +#define G_008220_HQD_DISPATCH_BUSY(x) (((x) >> 12) & 0x1) +#define C_008220_HQD_DISPATCH_BUSY 0xFFFFEFFF +#define S_008220_HQD_IQ_TIMER_BUSY(x) (((x) & 0x1) << 13) +#define G_008220_HQD_IQ_TIMER_BUSY(x) (((x) >> 13) & 0x1) +#define C_008220_HQD_IQ_TIMER_BUSY 0xFFFFDFFF +#define S_008220_HQD_DMA_OFFLOAD_BUSY(x) (((x) & 0x1) << 14) +#define G_008220_HQD_DMA_OFFLOAD_BUSY(x) (((x) >> 14) & 0x1) +#define C_008220_HQD_DMA_OFFLOAD_BUSY 0xFFFFBFFF +#define S_008220_HQD_WAIT_SEMAPHORE_BUSY(x) (((x) & 0x1) << 15) +#define G_008220_HQD_WAIT_SEMAPHORE_BUSY(x) (((x) >> 15) & 0x1) +#define C_008220_HQD_WAIT_SEMAPHORE_BUSY 0xFFFF7FFF +#define S_008220_HQD_SIGNAL_SEMAPHORE_BUSY(x) (((x) & 0x1) << 16) +#define G_008220_HQD_SIGNAL_SEMAPHORE_BUSY(x) (((x) >> 16) & 0x1) +#define C_008220_HQD_SIGNAL_SEMAPHORE_BUSY 0xFFFEFFFF +#define S_008220_HQD_MESSAGE_BUSY(x) (((x) & 0x1) << 17) +#define G_008220_HQD_MESSAGE_BUSY(x) (((x) >> 17) & 0x1) +#define C_008220_HQD_MESSAGE_BUSY 0xFFFDFFFF +#define S_008220_HQD_PQ_FETCHER_BUSY(x) (((x) & 0x1) << 18) +#define G_008220_HQD_PQ_FETCHER_BUSY(x) (((x) >> 18) & 0x1) +#define C_008220_HQD_PQ_FETCHER_BUSY 0xFFFBFFFF +#define S_008220_HQD_IB_FETCHER_BUSY(x) (((x) & 0x1) << 19) +#define G_008220_HQD_IB_FETCHER_BUSY(x) (((x) >> 19) & 0x1) +#define C_008220_HQD_IB_FETCHER_BUSY 0xFFF7FFFF +#define S_008220_HQD_IQ_FETCHER_BUSY(x) (((x) & 0x1) << 20) +#define G_008220_HQD_IQ_FETCHER_BUSY(x) (((x) >> 20) & 0x1) +#define C_008220_HQD_IQ_FETCHER_BUSY 0xFFEFFFFF +#define S_008220_HQD_EOP_FETCHER_BUSY(x) (((x) & 0x1) << 21) +#define G_008220_HQD_EOP_FETCHER_BUSY(x) (((x) >> 21) & 0x1) +#define C_008220_HQD_EOP_FETCHER_BUSY 0xFFDFFFFF +#define S_008220_HQD_CONSUMED_RPTR_BUSY(x) (((x) & 0x1) << 22) +#define G_008220_HQD_CONSUMED_RPTR_BUSY(x) (((x) >> 22) & 0x1) +#define C_008220_HQD_CONSUMED_RPTR_BUSY 0xFFBFFFFF +#define S_008220_HQD_FETCHER_ARB_BUSY(x) (((x) & 0x1) << 23) +#define G_008220_HQD_FETCHER_ARB_BUSY(x) (((x) >> 23) & 0x1) +#define C_008220_HQD_FETCHER_ARB_BUSY 0xFF7FFFFF +#define S_008220_HQD_ROQ_ALIGN_BUSY(x) (((x) & 0x1) << 24) +#define G_008220_HQD_ROQ_ALIGN_BUSY(x) (((x) >> 24) & 0x1) +#define C_008220_HQD_ROQ_ALIGN_BUSY 0xFEFFFFFF +#define S_008220_HQD_ROQ_EOP_BUSY(x) (((x) & 0x1) << 25) +#define G_008220_HQD_ROQ_EOP_BUSY(x) (((x) >> 25) & 0x1) +#define C_008220_HQD_ROQ_EOP_BUSY 0xFDFFFFFF +#define S_008220_HQD_ROQ_IQ_BUSY(x) (((x) & 0x1) << 26) +#define G_008220_HQD_ROQ_IQ_BUSY(x) (((x) >> 26) & 0x1) +#define C_008220_HQD_ROQ_IQ_BUSY 0xFBFFFFFF +#define S_008220_HQD_ROQ_PQ_BUSY(x) (((x) & 0x1) << 27) +#define G_008220_HQD_ROQ_PQ_BUSY(x) (((x) >> 27) & 0x1) +#define C_008220_HQD_ROQ_PQ_BUSY 0xF7FFFFFF +#define S_008220_HQD_ROQ_IB_BUSY(x) (((x) & 0x1) << 28) 
+#define G_008220_HQD_ROQ_IB_BUSY(x) (((x) >> 28) & 0x1) +#define C_008220_HQD_ROQ_IB_BUSY 0xEFFFFFFF +#define S_008220_HQD_WPTR_POLL_BUSY(x) (((x) & 0x1) << 29) +#define G_008220_HQD_WPTR_POLL_BUSY(x) (((x) >> 29) & 0x1) +#define C_008220_HQD_WPTR_POLL_BUSY 0xDFFFFFFF +#define S_008220_HQD_PQ_BUSY(x) (((x) & 0x1) << 30) +#define G_008220_HQD_PQ_BUSY(x) (((x) >> 30) & 0x1) +#define C_008220_HQD_PQ_BUSY 0xBFFFFFFF +#define S_008220_HQD_IB_BUSY(x) (((x) & 0x1) << 31) +#define G_008220_HQD_IB_BUSY(x) (((x) >> 31) & 0x1) +#define C_008220_HQD_IB_BUSY 0x7FFFFFFF +#define R_008224_CP_CPF_STALLED_STAT1 0x008224 +#define S_008224_RING_FETCHING_DATA(x) (((x) & 0x1) << 0) +#define G_008224_RING_FETCHING_DATA(x) (((x) >> 0) & 0x1) +#define C_008224_RING_FETCHING_DATA 0xFFFFFFFE +#define S_008224_INDR1_FETCHING_DATA(x) (((x) & 0x1) << 1) +#define G_008224_INDR1_FETCHING_DATA(x) (((x) >> 1) & 0x1) +#define C_008224_INDR1_FETCHING_DATA 0xFFFFFFFD +#define S_008224_INDR2_FETCHING_DATA(x) (((x) & 0x1) << 2) +#define G_008224_INDR2_FETCHING_DATA(x) (((x) >> 2) & 0x1) +#define C_008224_INDR2_FETCHING_DATA 0xFFFFFFFB +#define S_008224_STATE_FETCHING_DATA(x) (((x) & 0x1) << 3) +#define G_008224_STATE_FETCHING_DATA(x) (((x) >> 3) & 0x1) +#define C_008224_STATE_FETCHING_DATA 0xFFFFFFF7 +#define S_008224_TCIU_WAITING_ON_FREE(x) (((x) & 0x1) << 5) +#define G_008224_TCIU_WAITING_ON_FREE(x) (((x) >> 5) & 0x1) +#define C_008224_TCIU_WAITING_ON_FREE 0xFFFFFFDF +#define S_008224_TCIU_WAITING_ON_TAGS(x) (((x) & 0x1) << 6) +#define G_008224_TCIU_WAITING_ON_TAGS(x) (((x) >> 6) & 0x1) +#define C_008224_TCIU_WAITING_ON_TAGS 0xFFFFFFBF +#define S_008224_ATCL2IU_WAITING_ON_FREE(x) (((x) & 0x1) << 7) +#define G_008224_ATCL2IU_WAITING_ON_FREE(x) (((x) >> 7) & 0x1) +#define C_008224_ATCL2IU_WAITING_ON_FREE 0xFFFFFF7F +#define S_008224_ATCL2IU_WAITING_ON_TAGS(x) (((x) & 0x1) << 8) +#define G_008224_ATCL2IU_WAITING_ON_TAGS(x) (((x) >> 8) & 0x1) +#define C_008224_ATCL2IU_WAITING_ON_TAGS 0xFFFFFEFF +#define S_008224_ATCL1_WAITING_ON_TRANS(x) (((x) & 0x1) << 9) +#define G_008224_ATCL1_WAITING_ON_TRANS(x) (((x) >> 9) & 0x1) +#define C_008224_ATCL1_WAITING_ON_TRANS 0xFFFFFDFF #define R_030230_CP_COHER_SIZE_HI 0x030230 #define S_030230_COHER_SIZE_HI_256B(x) (((x) & 0xFF) << 0) #define G_030230_COHER_SIZE_HI_256B(x) (((x) >> 0) & 0xFF) @@ -375,10 +1299,6 @@ #define C_0088C4_ES_LIMIT 0xFFE0FFFF #define R_0088C8_VGT_ESGS_RING_SIZE 0x0088C8 #define R_0088CC_VGT_GSVS_RING_SIZE 0x0088CC -/* CIK */ -#define R_030900_VGT_ESGS_RING_SIZE 0x030900 -#define R_030904_VGT_GSVS_RING_SIZE 0x030904 -/* */ #define R_0088D4_VGT_GS_VERTEX_REUSE 0x0088D4 #define S_0088D4_VERT_REUSE(x) (((x) & 0x1F) << 0) #define G_0088D4_VERT_REUSE(x) (((x) >> 0) & 0x1F) @@ -461,7 +1381,293 @@ #define S_008B10_CURRENT_COUNT(x) (((x) & 0xFF) << 8) #define G_008B10_CURRENT_COUNT(x) (((x) >> 8) & 0xFF) #define C_008B10_CURRENT_COUNT 0xFFFF00FF +#define R_008670_CP_STALLED_STAT3 0x008670 +#define S_008670_CE_TO_CSF_NOT_RDY_TO_RCV(x) (((x) & 0x1) << 0) +#define G_008670_CE_TO_CSF_NOT_RDY_TO_RCV(x) (((x) >> 0) & 0x1) +#define C_008670_CE_TO_CSF_NOT_RDY_TO_RCV 0xFFFFFFFE +#define S_008670_CE_TO_RAM_INIT_FETCHER_NOT_RDY_TO_RCV(x) (((x) & 0x1) << 1) +#define G_008670_CE_TO_RAM_INIT_FETCHER_NOT_RDY_TO_RCV(x) (((x) >> 1) & 0x1) +#define C_008670_CE_TO_RAM_INIT_FETCHER_NOT_RDY_TO_RCV 0xFFFFFFFD +#define S_008670_CE_WAITING_ON_DATA_FROM_RAM_INIT_FETCHER(x) (((x) & 0x1) << 2) +#define G_008670_CE_WAITING_ON_DATA_FROM_RAM_INIT_FETCHER(x) (((x) >> 2) & 0x1) +#define 
C_008670_CE_WAITING_ON_DATA_FROM_RAM_INIT_FETCHER 0xFFFFFFFB +#define S_008670_CE_TO_RAM_INIT_NOT_RDY(x) (((x) & 0x1) << 3) +#define G_008670_CE_TO_RAM_INIT_NOT_RDY(x) (((x) >> 3) & 0x1) +#define C_008670_CE_TO_RAM_INIT_NOT_RDY 0xFFFFFFF7 +#define S_008670_CE_TO_RAM_DUMP_NOT_RDY(x) (((x) & 0x1) << 4) +#define G_008670_CE_TO_RAM_DUMP_NOT_RDY(x) (((x) >> 4) & 0x1) +#define C_008670_CE_TO_RAM_DUMP_NOT_RDY 0xFFFFFFEF +#define S_008670_CE_TO_RAM_WRITE_NOT_RDY(x) (((x) & 0x1) << 5) +#define G_008670_CE_TO_RAM_WRITE_NOT_RDY(x) (((x) >> 5) & 0x1) +#define C_008670_CE_TO_RAM_WRITE_NOT_RDY 0xFFFFFFDF +#define S_008670_CE_TO_INC_FIFO_NOT_RDY_TO_RCV(x) (((x) & 0x1) << 6) +#define G_008670_CE_TO_INC_FIFO_NOT_RDY_TO_RCV(x) (((x) >> 6) & 0x1) +#define C_008670_CE_TO_INC_FIFO_NOT_RDY_TO_RCV 0xFFFFFFBF +#define S_008670_CE_TO_WR_FIFO_NOT_RDY_TO_RCV(x) (((x) & 0x1) << 7) +#define G_008670_CE_TO_WR_FIFO_NOT_RDY_TO_RCV(x) (((x) >> 7) & 0x1) +#define C_008670_CE_TO_WR_FIFO_NOT_RDY_TO_RCV 0xFFFFFF7F +#define S_008670_CE_WAITING_ON_BUFFER_DATA(x) (((x) & 0x1) << 10) +#define G_008670_CE_WAITING_ON_BUFFER_DATA(x) (((x) >> 10) & 0x1) +#define C_008670_CE_WAITING_ON_BUFFER_DATA 0xFFFFFBFF +#define S_008670_CE_WAITING_ON_CE_BUFFER_FLAG(x) (((x) & 0x1) << 11) +#define G_008670_CE_WAITING_ON_CE_BUFFER_FLAG(x) (((x) >> 11) & 0x1) +#define C_008670_CE_WAITING_ON_CE_BUFFER_FLAG 0xFFFFF7FF +#define S_008670_CE_WAITING_ON_DE_COUNTER(x) (((x) & 0x1) << 12) +#define G_008670_CE_WAITING_ON_DE_COUNTER(x) (((x) >> 12) & 0x1) +#define C_008670_CE_WAITING_ON_DE_COUNTER 0xFFFFEFFF +#define S_008670_CE_WAITING_ON_DE_COUNTER_UNDERFLOW(x) (((x) & 0x1) << 13) +#define G_008670_CE_WAITING_ON_DE_COUNTER_UNDERFLOW(x) (((x) >> 13) & 0x1) +#define C_008670_CE_WAITING_ON_DE_COUNTER_UNDERFLOW 0xFFFFDFFF +#define S_008670_TCIU_WAITING_ON_FREE(x) (((x) & 0x1) << 14) +#define G_008670_TCIU_WAITING_ON_FREE(x) (((x) >> 14) & 0x1) +#define C_008670_TCIU_WAITING_ON_FREE 0xFFFFBFFF +#define S_008670_TCIU_WAITING_ON_TAGS(x) (((x) & 0x1) << 15) +#define G_008670_TCIU_WAITING_ON_TAGS(x) (((x) >> 15) & 0x1) +#define C_008670_TCIU_WAITING_ON_TAGS 0xFFFF7FFF +#define S_008670_CE_STALLED_ON_TC_WR_CONFIRM(x) (((x) & 0x1) << 16) +#define G_008670_CE_STALLED_ON_TC_WR_CONFIRM(x) (((x) >> 16) & 0x1) +#define C_008670_CE_STALLED_ON_TC_WR_CONFIRM 0xFFFEFFFF +#define S_008670_CE_STALLED_ON_ATOMIC_RTN_DATA(x) (((x) & 0x1) << 17) +#define G_008670_CE_STALLED_ON_ATOMIC_RTN_DATA(x) (((x) >> 17) & 0x1) +#define C_008670_CE_STALLED_ON_ATOMIC_RTN_DATA 0xFFFDFFFF +#define S_008670_ATCL2IU_WAITING_ON_FREE(x) (((x) & 0x1) << 18) +#define G_008670_ATCL2IU_WAITING_ON_FREE(x) (((x) >> 18) & 0x1) +#define C_008670_ATCL2IU_WAITING_ON_FREE 0xFFFBFFFF +#define S_008670_ATCL2IU_WAITING_ON_TAGS(x) (((x) & 0x1) << 19) +#define G_008670_ATCL2IU_WAITING_ON_TAGS(x) (((x) >> 19) & 0x1) +#define C_008670_ATCL2IU_WAITING_ON_TAGS 0xFFF7FFFF +#define S_008670_ATCL1_WAITING_ON_TRANS(x) (((x) & 0x1) << 20) +#define G_008670_ATCL1_WAITING_ON_TRANS(x) (((x) >> 20) & 0x1) +#define C_008670_ATCL1_WAITING_ON_TRANS 0xFFEFFFFF +#define R_008674_CP_STALLED_STAT1 0x008674 +#define S_008674_RBIU_TO_DMA_NOT_RDY_TO_RCV(x) (((x) & 0x1) << 0) +#define G_008674_RBIU_TO_DMA_NOT_RDY_TO_RCV(x) (((x) >> 0) & 0x1) +#define C_008674_RBIU_TO_DMA_NOT_RDY_TO_RCV 0xFFFFFFFE +#define S_008674_RBIU_TO_SEM_NOT_RDY_TO_RCV(x) (((x) & 0x1) << 2) +#define G_008674_RBIU_TO_SEM_NOT_RDY_TO_RCV(x) (((x) >> 2) & 0x1) +#define C_008674_RBIU_TO_SEM_NOT_RDY_TO_RCV 0xFFFFFFFB +#define S_008674_RBIU_TO_MEMWR_NOT_RDY_TO_RCV(x) (((x) & 
0x1) << 4) +#define G_008674_RBIU_TO_MEMWR_NOT_RDY_TO_RCV(x) (((x) >> 4) & 0x1) +#define C_008674_RBIU_TO_MEMWR_NOT_RDY_TO_RCV 0xFFFFFFEF +#define S_008674_ME_HAS_ACTIVE_CE_BUFFER_FLAG(x) (((x) & 0x1) << 10) +#define G_008674_ME_HAS_ACTIVE_CE_BUFFER_FLAG(x) (((x) >> 10) & 0x1) +#define C_008674_ME_HAS_ACTIVE_CE_BUFFER_FLAG 0xFFFFFBFF +#define S_008674_ME_HAS_ACTIVE_DE_BUFFER_FLAG(x) (((x) & 0x1) << 11) +#define G_008674_ME_HAS_ACTIVE_DE_BUFFER_FLAG(x) (((x) >> 11) & 0x1) +#define C_008674_ME_HAS_ACTIVE_DE_BUFFER_FLAG 0xFFFFF7FF +#define S_008674_ME_STALLED_ON_TC_WR_CONFIRM(x) (((x) & 0x1) << 12) +#define G_008674_ME_STALLED_ON_TC_WR_CONFIRM(x) (((x) >> 12) & 0x1) +#define C_008674_ME_STALLED_ON_TC_WR_CONFIRM 0xFFFFEFFF +#define S_008674_ME_STALLED_ON_ATOMIC_RTN_DATA(x) (((x) & 0x1) << 13) +#define G_008674_ME_STALLED_ON_ATOMIC_RTN_DATA(x) (((x) >> 13) & 0x1) +#define C_008674_ME_STALLED_ON_ATOMIC_RTN_DATA 0xFFFFDFFF +#define S_008674_ME_WAITING_ON_TC_READ_DATA(x) (((x) & 0x1) << 14) +#define G_008674_ME_WAITING_ON_TC_READ_DATA(x) (((x) >> 14) & 0x1) +#define C_008674_ME_WAITING_ON_TC_READ_DATA 0xFFFFBFFF +#define S_008674_ME_WAITING_ON_REG_READ_DATA(x) (((x) & 0x1) << 15) +#define G_008674_ME_WAITING_ON_REG_READ_DATA(x) (((x) >> 15) & 0x1) +#define C_008674_ME_WAITING_ON_REG_READ_DATA 0xFFFF7FFF +#define S_008674_RCIU_WAITING_ON_GDS_FREE(x) (((x) & 0x1) << 23) +#define G_008674_RCIU_WAITING_ON_GDS_FREE(x) (((x) >> 23) & 0x1) +#define C_008674_RCIU_WAITING_ON_GDS_FREE 0xFF7FFFFF +#define S_008674_RCIU_WAITING_ON_GRBM_FREE(x) (((x) & 0x1) << 24) +#define G_008674_RCIU_WAITING_ON_GRBM_FREE(x) (((x) >> 24) & 0x1) +#define C_008674_RCIU_WAITING_ON_GRBM_FREE 0xFEFFFFFF +#define S_008674_RCIU_WAITING_ON_VGT_FREE(x) (((x) & 0x1) << 25) +#define G_008674_RCIU_WAITING_ON_VGT_FREE(x) (((x) >> 25) & 0x1) +#define C_008674_RCIU_WAITING_ON_VGT_FREE 0xFDFFFFFF +#define S_008674_RCIU_STALLED_ON_ME_READ(x) (((x) & 0x1) << 26) +#define G_008674_RCIU_STALLED_ON_ME_READ(x) (((x) >> 26) & 0x1) +#define C_008674_RCIU_STALLED_ON_ME_READ 0xFBFFFFFF +#define S_008674_RCIU_STALLED_ON_DMA_READ(x) (((x) & 0x1) << 27) +#define G_008674_RCIU_STALLED_ON_DMA_READ(x) (((x) >> 27) & 0x1) +#define C_008674_RCIU_STALLED_ON_DMA_READ 0xF7FFFFFF +#define S_008674_RCIU_STALLED_ON_APPEND_READ(x) (((x) & 0x1) << 28) +#define G_008674_RCIU_STALLED_ON_APPEND_READ(x) (((x) >> 28) & 0x1) +#define C_008674_RCIU_STALLED_ON_APPEND_READ 0xEFFFFFFF +#define S_008674_RCIU_HALTED_BY_REG_VIOLATION(x) (((x) & 0x1) << 29) +#define G_008674_RCIU_HALTED_BY_REG_VIOLATION(x) (((x) >> 29) & 0x1) +#define C_008674_RCIU_HALTED_BY_REG_VIOLATION 0xDFFFFFFF +#define R_008678_CP_STALLED_STAT2 0x008678 +#define S_008678_PFP_TO_CSF_NOT_RDY_TO_RCV(x) (((x) & 0x1) << 0) +#define G_008678_PFP_TO_CSF_NOT_RDY_TO_RCV(x) (((x) >> 0) & 0x1) +#define C_008678_PFP_TO_CSF_NOT_RDY_TO_RCV 0xFFFFFFFE +#define S_008678_PFP_TO_MEQ_NOT_RDY_TO_RCV(x) (((x) & 0x1) << 1) +#define G_008678_PFP_TO_MEQ_NOT_RDY_TO_RCV(x) (((x) >> 1) & 0x1) +#define C_008678_PFP_TO_MEQ_NOT_RDY_TO_RCV 0xFFFFFFFD +#define S_008678_PFP_TO_RCIU_NOT_RDY_TO_RCV(x) (((x) & 0x1) << 2) +#define G_008678_PFP_TO_RCIU_NOT_RDY_TO_RCV(x) (((x) >> 2) & 0x1) +#define C_008678_PFP_TO_RCIU_NOT_RDY_TO_RCV 0xFFFFFFFB +#define S_008678_PFP_TO_VGT_WRITES_PENDING(x) (((x) & 0x1) << 4) +#define G_008678_PFP_TO_VGT_WRITES_PENDING(x) (((x) >> 4) & 0x1) +#define C_008678_PFP_TO_VGT_WRITES_PENDING 0xFFFFFFEF +#define S_008678_PFP_RCIU_READ_PENDING(x) (((x) & 0x1) << 5) +#define G_008678_PFP_RCIU_READ_PENDING(x) (((x) >> 5) & 
0x1) +#define C_008678_PFP_RCIU_READ_PENDING 0xFFFFFFDF +#define S_008678_PFP_WAITING_ON_BUFFER_DATA(x) (((x) & 0x1) << 8) +#define G_008678_PFP_WAITING_ON_BUFFER_DATA(x) (((x) >> 8) & 0x1) +#define C_008678_PFP_WAITING_ON_BUFFER_DATA 0xFFFFFEFF +#define S_008678_ME_WAIT_ON_CE_COUNTER(x) (((x) & 0x1) << 9) +#define G_008678_ME_WAIT_ON_CE_COUNTER(x) (((x) >> 9) & 0x1) +#define C_008678_ME_WAIT_ON_CE_COUNTER 0xFFFFFDFF +#define S_008678_ME_WAIT_ON_AVAIL_BUFFER(x) (((x) & 0x1) << 10) +#define G_008678_ME_WAIT_ON_AVAIL_BUFFER(x) (((x) >> 10) & 0x1) +#define C_008678_ME_WAIT_ON_AVAIL_BUFFER 0xFFFFFBFF +#define S_008678_GFX_CNTX_NOT_AVAIL_TO_ME(x) (((x) & 0x1) << 11) +#define G_008678_GFX_CNTX_NOT_AVAIL_TO_ME(x) (((x) >> 11) & 0x1) +#define C_008678_GFX_CNTX_NOT_AVAIL_TO_ME 0xFFFFF7FF +#define S_008678_ME_RCIU_NOT_RDY_TO_RCV(x) (((x) & 0x1) << 12) +#define G_008678_ME_RCIU_NOT_RDY_TO_RCV(x) (((x) >> 12) & 0x1) +#define C_008678_ME_RCIU_NOT_RDY_TO_RCV 0xFFFFEFFF +#define S_008678_ME_TO_CONST_NOT_RDY_TO_RCV(x) (((x) & 0x1) << 13) +#define G_008678_ME_TO_CONST_NOT_RDY_TO_RCV(x) (((x) >> 13) & 0x1) +#define C_008678_ME_TO_CONST_NOT_RDY_TO_RCV 0xFFFFDFFF +#define S_008678_ME_WAITING_DATA_FROM_PFP(x) (((x) & 0x1) << 14) +#define G_008678_ME_WAITING_DATA_FROM_PFP(x) (((x) >> 14) & 0x1) +#define C_008678_ME_WAITING_DATA_FROM_PFP 0xFFFFBFFF +#define S_008678_ME_WAITING_ON_PARTIAL_FLUSH(x) (((x) & 0x1) << 15) +#define G_008678_ME_WAITING_ON_PARTIAL_FLUSH(x) (((x) >> 15) & 0x1) +#define C_008678_ME_WAITING_ON_PARTIAL_FLUSH 0xFFFF7FFF +#define S_008678_MEQ_TO_ME_NOT_RDY_TO_RCV(x) (((x) & 0x1) << 16) +#define G_008678_MEQ_TO_ME_NOT_RDY_TO_RCV(x) (((x) >> 16) & 0x1) +#define C_008678_MEQ_TO_ME_NOT_RDY_TO_RCV 0xFFFEFFFF +#define S_008678_STQ_TO_ME_NOT_RDY_TO_RCV(x) (((x) & 0x1) << 17) +#define G_008678_STQ_TO_ME_NOT_RDY_TO_RCV(x) (((x) >> 17) & 0x1) +#define C_008678_STQ_TO_ME_NOT_RDY_TO_RCV 0xFFFDFFFF +#define S_008678_ME_WAITING_DATA_FROM_STQ(x) (((x) & 0x1) << 18) +#define G_008678_ME_WAITING_DATA_FROM_STQ(x) (((x) >> 18) & 0x1) +#define C_008678_ME_WAITING_DATA_FROM_STQ 0xFFFBFFFF +#define S_008678_PFP_STALLED_ON_TC_WR_CONFIRM(x) (((x) & 0x1) << 19) +#define G_008678_PFP_STALLED_ON_TC_WR_CONFIRM(x) (((x) >> 19) & 0x1) +#define C_008678_PFP_STALLED_ON_TC_WR_CONFIRM 0xFFF7FFFF +#define S_008678_PFP_STALLED_ON_ATOMIC_RTN_DATA(x) (((x) & 0x1) << 20) +#define G_008678_PFP_STALLED_ON_ATOMIC_RTN_DATA(x) (((x) >> 20) & 0x1) +#define C_008678_PFP_STALLED_ON_ATOMIC_RTN_DATA 0xFFEFFFFF +#define S_008678_EOPD_FIFO_NEEDS_SC_EOP_DONE(x) (((x) & 0x1) << 21) +#define G_008678_EOPD_FIFO_NEEDS_SC_EOP_DONE(x) (((x) >> 21) & 0x1) +#define C_008678_EOPD_FIFO_NEEDS_SC_EOP_DONE 0xFFDFFFFF +#define S_008678_EOPD_FIFO_NEEDS_WR_CONFIRM(x) (((x) & 0x1) << 22) +#define G_008678_EOPD_FIFO_NEEDS_WR_CONFIRM(x) (((x) >> 22) & 0x1) +#define C_008678_EOPD_FIFO_NEEDS_WR_CONFIRM 0xFFBFFFFF +#define S_008678_STRMO_WR_OF_PRIM_DATA_PENDING(x) (((x) & 0x1) << 23) +#define G_008678_STRMO_WR_OF_PRIM_DATA_PENDING(x) (((x) >> 23) & 0x1) +#define C_008678_STRMO_WR_OF_PRIM_DATA_PENDING 0xFF7FFFFF +#define S_008678_PIPE_STATS_WR_DATA_PENDING(x) (((x) & 0x1) << 24) +#define G_008678_PIPE_STATS_WR_DATA_PENDING(x) (((x) >> 24) & 0x1) +#define C_008678_PIPE_STATS_WR_DATA_PENDING 0xFEFFFFFF +#define S_008678_APPEND_RDY_WAIT_ON_CS_DONE(x) (((x) & 0x1) << 25) +#define G_008678_APPEND_RDY_WAIT_ON_CS_DONE(x) (((x) >> 25) & 0x1) +#define C_008678_APPEND_RDY_WAIT_ON_CS_DONE 0xFDFFFFFF +#define S_008678_APPEND_RDY_WAIT_ON_PS_DONE(x) (((x) & 0x1) << 26) +#define 
G_008678_APPEND_RDY_WAIT_ON_PS_DONE(x) (((x) >> 26) & 0x1) +#define C_008678_APPEND_RDY_WAIT_ON_PS_DONE 0xFBFFFFFF +#define S_008678_APPEND_WAIT_ON_WR_CONFIRM(x) (((x) & 0x1) << 27) +#define G_008678_APPEND_WAIT_ON_WR_CONFIRM(x) (((x) >> 27) & 0x1) +#define C_008678_APPEND_WAIT_ON_WR_CONFIRM 0xF7FFFFFF +#define S_008678_APPEND_ACTIVE_PARTITION(x) (((x) & 0x1) << 28) +#define G_008678_APPEND_ACTIVE_PARTITION(x) (((x) >> 28) & 0x1) +#define C_008678_APPEND_ACTIVE_PARTITION 0xEFFFFFFF +#define S_008678_APPEND_WAITING_TO_SEND_MEMWRITE(x) (((x) & 0x1) << 29) +#define G_008678_APPEND_WAITING_TO_SEND_MEMWRITE(x) (((x) >> 29) & 0x1) +#define C_008678_APPEND_WAITING_TO_SEND_MEMWRITE 0xDFFFFFFF +#define S_008678_SURF_SYNC_NEEDS_IDLE_CNTXS(x) (((x) & 0x1) << 30) +#define G_008678_SURF_SYNC_NEEDS_IDLE_CNTXS(x) (((x) >> 30) & 0x1) +#define C_008678_SURF_SYNC_NEEDS_IDLE_CNTXS 0xBFFFFFFF +#define S_008678_SURF_SYNC_NEEDS_ALL_CLEAN(x) (((x) & 0x1) << 31) +#define G_008678_SURF_SYNC_NEEDS_ALL_CLEAN(x) (((x) >> 31) & 0x1) +#define C_008678_SURF_SYNC_NEEDS_ALL_CLEAN 0x7FFFFFFF +#define R_008680_CP_STAT 0x008680 +#define S_008680_ROQ_RING_BUSY(x) (((x) & 0x1) << 9) +#define G_008680_ROQ_RING_BUSY(x) (((x) >> 9) & 0x1) +#define C_008680_ROQ_RING_BUSY 0xFFFFFDFF +#define S_008680_ROQ_INDIRECT1_BUSY(x) (((x) & 0x1) << 10) +#define G_008680_ROQ_INDIRECT1_BUSY(x) (((x) >> 10) & 0x1) +#define C_008680_ROQ_INDIRECT1_BUSY 0xFFFFFBFF +#define S_008680_ROQ_INDIRECT2_BUSY(x) (((x) & 0x1) << 11) +#define G_008680_ROQ_INDIRECT2_BUSY(x) (((x) >> 11) & 0x1) +#define C_008680_ROQ_INDIRECT2_BUSY 0xFFFFF7FF +#define S_008680_ROQ_STATE_BUSY(x) (((x) & 0x1) << 12) +#define G_008680_ROQ_STATE_BUSY(x) (((x) >> 12) & 0x1) +#define C_008680_ROQ_STATE_BUSY 0xFFFFEFFF +#define S_008680_DC_BUSY(x) (((x) & 0x1) << 13) +#define G_008680_DC_BUSY(x) (((x) >> 13) & 0x1) +#define C_008680_DC_BUSY 0xFFFFDFFF +#define S_008680_ATCL2IU_BUSY(x) (((x) & 0x1) << 14) +#define G_008680_ATCL2IU_BUSY(x) (((x) >> 14) & 0x1) +#define C_008680_ATCL2IU_BUSY 0xFFFFBFFF +#define S_008680_PFP_BUSY(x) (((x) & 0x1) << 15) +#define G_008680_PFP_BUSY(x) (((x) >> 15) & 0x1) +#define C_008680_PFP_BUSY 0xFFFF7FFF +#define S_008680_MEQ_BUSY(x) (((x) & 0x1) << 16) +#define G_008680_MEQ_BUSY(x) (((x) >> 16) & 0x1) +#define C_008680_MEQ_BUSY 0xFFFEFFFF +#define S_008680_ME_BUSY(x) (((x) & 0x1) << 17) +#define G_008680_ME_BUSY(x) (((x) >> 17) & 0x1) +#define C_008680_ME_BUSY 0xFFFDFFFF +#define S_008680_QUERY_BUSY(x) (((x) & 0x1) << 18) +#define G_008680_QUERY_BUSY(x) (((x) >> 18) & 0x1) +#define C_008680_QUERY_BUSY 0xFFFBFFFF +#define S_008680_SEMAPHORE_BUSY(x) (((x) & 0x1) << 19) +#define G_008680_SEMAPHORE_BUSY(x) (((x) >> 19) & 0x1) +#define C_008680_SEMAPHORE_BUSY 0xFFF7FFFF +#define S_008680_INTERRUPT_BUSY(x) (((x) & 0x1) << 20) +#define G_008680_INTERRUPT_BUSY(x) (((x) >> 20) & 0x1) +#define C_008680_INTERRUPT_BUSY 0xFFEFFFFF +#define S_008680_SURFACE_SYNC_BUSY(x) (((x) & 0x1) << 21) +#define G_008680_SURFACE_SYNC_BUSY(x) (((x) >> 21) & 0x1) +#define C_008680_SURFACE_SYNC_BUSY 0xFFDFFFFF +#define S_008680_DMA_BUSY(x) (((x) & 0x1) << 22) +#define G_008680_DMA_BUSY(x) (((x) >> 22) & 0x1) +#define C_008680_DMA_BUSY 0xFFBFFFFF +#define S_008680_RCIU_BUSY(x) (((x) & 0x1) << 23) +#define G_008680_RCIU_BUSY(x) (((x) >> 23) & 0x1) +#define C_008680_RCIU_BUSY 0xFF7FFFFF +#define S_008680_SCRATCH_RAM_BUSY(x) (((x) & 0x1) << 24) +#define G_008680_SCRATCH_RAM_BUSY(x) (((x) >> 24) & 0x1) +#define C_008680_SCRATCH_RAM_BUSY 0xFEFFFFFF +#define S_008680_CPC_CPG_BUSY(x) (((x) & 
0x1) << 25) +#define G_008680_CPC_CPG_BUSY(x) (((x) >> 25) & 0x1) +#define C_008680_CPC_CPG_BUSY 0xFDFFFFFF +#define S_008680_CE_BUSY(x) (((x) & 0x1) << 26) +#define G_008680_CE_BUSY(x) (((x) >> 26) & 0x1) +#define C_008680_CE_BUSY 0xFBFFFFFF +#define S_008680_TCIU_BUSY(x) (((x) & 0x1) << 27) +#define G_008680_TCIU_BUSY(x) (((x) >> 27) & 0x1) +#define C_008680_TCIU_BUSY 0xF7FFFFFF +#define S_008680_ROQ_CE_RING_BUSY(x) (((x) & 0x1) << 28) +#define G_008680_ROQ_CE_RING_BUSY(x) (((x) >> 28) & 0x1) +#define C_008680_ROQ_CE_RING_BUSY 0xEFFFFFFF +#define S_008680_ROQ_CE_INDIRECT1_BUSY(x) (((x) & 0x1) << 29) +#define G_008680_ROQ_CE_INDIRECT1_BUSY(x) (((x) >> 29) & 0x1) +#define C_008680_ROQ_CE_INDIRECT1_BUSY 0xDFFFFFFF +#define S_008680_ROQ_CE_INDIRECT2_BUSY(x) (((x) & 0x1) << 30) +#define G_008680_ROQ_CE_INDIRECT2_BUSY(x) (((x) >> 30) & 0x1) +#define C_008680_ROQ_CE_INDIRECT2_BUSY 0xBFFFFFFF +#define S_008680_CP_BUSY(x) (((x) & 0x1) << 31) +#define G_008680_CP_BUSY(x) (((x) >> 31) & 0x1) +#define C_008680_CP_BUSY 0x7FFFFFFF /* CIK */ +#define R_030800_GRBM_GFX_INDEX 0x030800 +#define S_030800_INSTANCE_INDEX(x) (((x) & 0xFF) << 0) +#define G_030800_INSTANCE_INDEX(x) (((x) >> 0) & 0xFF) +#define C_030800_INSTANCE_INDEX 0xFFFFFF00 +#define S_030800_SH_INDEX(x) (((x) & 0xFF) << 8) +#define G_030800_SH_INDEX(x) (((x) >> 8) & 0xFF) +#define C_030800_SH_INDEX 0xFFFF00FF +#define S_030800_SE_INDEX(x) (((x) & 0xFF) << 16) +#define G_030800_SE_INDEX(x) (((x) >> 16) & 0xFF) +#define C_030800_SE_INDEX 0xFF00FFFF +#define S_030800_SH_BROADCAST_WRITES(x) (((x) & 0x1) << 29) +#define G_030800_SH_BROADCAST_WRITES(x) (((x) >> 29) & 0x1) +#define C_030800_SH_BROADCAST_WRITES 0xDFFFFFFF +#define S_030800_INSTANCE_BROADCAST_WRITES(x) (((x) & 0x1) << 30) +#define G_030800_INSTANCE_BROADCAST_WRITES(x) (((x) >> 30) & 0x1) +#define C_030800_INSTANCE_BROADCAST_WRITES 0xBFFFFFFF +#define S_030800_SE_BROADCAST_WRITES(x) (((x) & 0x1) << 31) +#define G_030800_SE_BROADCAST_WRITES(x) (((x) >> 31) & 0x1) +#define C_030800_SE_BROADCAST_WRITES 0x7FFFFFFF +#define R_030900_VGT_ESGS_RING_SIZE 0x030900 +#define R_030904_VGT_GSVS_RING_SIZE 0x030904 #define R_030908_VGT_PRIMITIVE_TYPE 0x030908 #define S_030908_PRIM_TYPE(x) (((x) & 0x3F) << 0) #define G_030908_PRIM_TYPE(x) (((x) >> 0) & 0x3F) @@ -530,6 +1736,34 @@ #define S_030A04_CURRENT_COUNT(x) (((x) & 0xFF) << 8) #define G_030A04_CURRENT_COUNT(x) (((x) >> 8) & 0xFF) #define C_030A04_CURRENT_COUNT 0xFFFF00FF +#define R_030A10_PA_SC_SCREEN_EXTENT_MIN_0 0x030A10 +#define S_030A10_X(x) (((x) & 0xFFFF) << 0) +#define G_030A10_X(x) (((x) >> 0) & 0xFFFF) +#define C_030A10_X 0xFFFF0000 +#define S_030A10_Y(x) (((x) & 0xFFFF) << 16) +#define G_030A10_Y(x) (((x) >> 16) & 0xFFFF) +#define C_030A10_Y 0x0000FFFF +#define R_030A14_PA_SC_SCREEN_EXTENT_MAX_0 0x030A14 +#define S_030A14_X(x) (((x) & 0xFFFF) << 0) +#define G_030A14_X(x) (((x) >> 0) & 0xFFFF) +#define C_030A14_X 0xFFFF0000 +#define S_030A14_Y(x) (((x) & 0xFFFF) << 16) +#define G_030A14_Y(x) (((x) >> 16) & 0xFFFF) +#define C_030A14_Y 0x0000FFFF +#define R_030A18_PA_SC_SCREEN_EXTENT_MIN_1 0x030A18 +#define S_030A18_X(x) (((x) & 0xFFFF) << 0) +#define G_030A18_X(x) (((x) >> 0) & 0xFFFF) +#define C_030A18_X 0xFFFF0000 +#define S_030A18_Y(x) (((x) & 0xFFFF) << 16) +#define G_030A18_Y(x) (((x) >> 16) & 0xFFFF) +#define C_030A18_Y 0x0000FFFF +#define R_030A2C_PA_SC_SCREEN_EXTENT_MAX_1 0x030A2C +#define S_030A2C_X(x) (((x) & 0xFFFF) << 0) +#define G_030A2C_X(x) (((x) >> 0) & 0xFFFF) +#define C_030A2C_X 0xFFFF0000 +#define S_030A2C_Y(x) (((x) 
& 0xFFFF) << 16) +#define G_030A2C_Y(x) (((x) >> 16) & 0xFFFF) +#define C_030A2C_Y 0x0000FFFF /* */ #define R_008BF0_PA_SC_ENHANCE 0x008BF0 #define S_008BF0_ENABLE_PA_SC_OUT_OF_ORDER(x) (((x) & 0x1) << 0) @@ -608,6 +1842,32 @@ #define V_008DFC_SQ_VGPR 0x00 /* */ #define R_008DFC_SQ_INST 0x008DFC +#define R_030D20_SQC_CACHES 0x030D20 +#define S_030D20_TARGET_INST(x) (((x) & 0x1) << 0) +#define G_030D20_TARGET_INST(x) (((x) >> 0) & 0x1) +#define C_030D20_TARGET_INST 0xFFFFFFFE +#define S_030D20_TARGET_DATA(x) (((x) & 0x1) << 1) +#define G_030D20_TARGET_DATA(x) (((x) >> 1) & 0x1) +#define C_030D20_TARGET_DATA 0xFFFFFFFD +#define S_030D20_INVALIDATE(x) (((x) & 0x1) << 2) +#define G_030D20_INVALIDATE(x) (((x) >> 2) & 0x1) +#define C_030D20_INVALIDATE 0xFFFFFFFB +#define S_030D20_WRITEBACK(x) (((x) & 0x1) << 3) +#define G_030D20_WRITEBACK(x) (((x) >> 3) & 0x1) +#define C_030D20_WRITEBACK 0xFFFFFFF7 +#define S_030D20_VOL(x) (((x) & 0x1) << 4) +#define G_030D20_VOL(x) (((x) >> 4) & 0x1) +#define C_030D20_VOL 0xFFFFFFEF +#define S_030D20_COMPLETE(x) (((x) & 0x1) << 16) +#define G_030D20_COMPLETE(x) (((x) >> 16) & 0x1) +#define C_030D20_COMPLETE 0xFFFEFFFF +#define R_030D24_SQC_WRITEBACK 0x030D24 +#define S_030D24_DWB(x) (((x) & 0x1) << 0) +#define G_030D24_DWB(x) (((x) >> 0) & 0x1) +#define C_030D24_DWB 0xFFFFFFFE +#define S_030D24_DIRTY(x) (((x) & 0x1) << 1) +#define G_030D24_DIRTY(x) (((x) >> 1) & 0x1) +#define C_030D24_DIRTY 0xFFFFFFFD #define R_008DFC_SQ_VOP1 0x008DFC #define S_008DFC_SRC0(x) (((x) & 0x1FF) << 0) #define G_008DFC_SRC0(x) (((x) >> 0) & 0x1FF) @@ -3740,7 +5000,17 @@ #define C_008DFC_ENCODING 0x03FFFFFF #define V_008DFC_SQ_ENC_MUBUF_FIELD 0x38 #endif +#define R_030E00_TA_CS_BC_BASE_ADDR 0x030E00 +#define R_030E04_TA_CS_BC_BASE_ADDR_HI 0x030E04 +#define S_030E04_ADDRESS(x) (((x) & 0xFF) << 0) +#define G_030E04_ADDRESS(x) (((x) >> 0) & 0xFF) +#define C_030E04_ADDRESS 0xFFFFFF00 +#define R_030F00_DB_OCCLUSION_COUNT0_LOW 0x030F00 #define R_008F00_SQ_BUF_RSRC_WORD0 0x008F00 +#define R_030F04_DB_OCCLUSION_COUNT0_HI 0x030F04 +#define S_030F04_COUNT_HI(x) (((x) & 0x7FFFFFFF) << 0) +#define G_030F04_COUNT_HI(x) (((x) >> 0) & 0x7FFFFFFF) +#define C_030F04_COUNT_HI 0x80000000 #define R_008F04_SQ_BUF_RSRC_WORD1 0x008F04 #define S_008F04_BASE_ADDRESS_HI(x) (((x) & 0xFFFF) << 0) #define G_008F04_BASE_ADDRESS_HI(x) (((x) >> 0) & 0xFFFF) @@ -3754,7 +5024,12 @@ #define S_008F04_SWIZZLE_ENABLE(x) (((x) & 0x1) << 31) #define G_008F04_SWIZZLE_ENABLE(x) (((x) >> 31) & 0x1) #define C_008F04_SWIZZLE_ENABLE 0x7FFFFFFF +#define R_030F08_DB_OCCLUSION_COUNT1_LOW 0x030F08 #define R_008F08_SQ_BUF_RSRC_WORD2 0x008F08 +#define R_030F0C_DB_OCCLUSION_COUNT1_HI 0x030F0C +#define S_030F0C_COUNT_HI(x) (((x) & 0x7FFFFFFF) << 0) +#define G_030F0C_COUNT_HI(x) (((x) >> 0) & 0x7FFFFFFF) +#define C_030F0C_COUNT_HI 0x80000000 #define R_008F0C_SQ_BUF_RSRC_WORD3 0x008F0C #define S_008F0C_DST_SEL_X(x) (((x) & 0x07) << 0) #define G_008F0C_DST_SEL_X(x) (((x) >> 0) & 0x07) @@ -3862,7 +5137,12 @@ #define V_008F0C_SQ_RSRC_BUF_RSVD_1 0x01 #define V_008F0C_SQ_RSRC_BUF_RSVD_2 0x02 #define V_008F0C_SQ_RSRC_BUF_RSVD_3 0x03 +#define R_030F10_DB_OCCLUSION_COUNT2_LOW 0x030F10 #define R_008F10_SQ_IMG_RSRC_WORD0 0x008F10 +#define R_030F14_DB_OCCLUSION_COUNT2_HI 0x030F14 +#define S_030F14_COUNT_HI(x) (((x) & 0x7FFFFFFF) << 0) +#define G_030F14_COUNT_HI(x) (((x) >> 0) & 0x7FFFFFFF) +#define C_030F14_COUNT_HI 0x80000000 #define R_008F14_SQ_IMG_RSRC_WORD1 0x008F14 #define S_008F14_BASE_ADDRESS_HI(x) (((x) & 0xFF) << 0) #define 
G_008F14_BASE_ADDRESS_HI(x) (((x) >> 0) & 0xFF) @@ -3961,6 +5241,7 @@ #define G_008F14_MTYPE(x) (((x) >> 30) & 0x03) #define C_008F14_MTYPE 0x3FFFFFFF /* */ +#define R_030F18_DB_OCCLUSION_COUNT3_LOW 0x030F18 #define R_008F18_SQ_IMG_RSRC_WORD2 0x008F18 #define S_008F18_WIDTH(x) (((x) & 0x3FFF) << 0) #define G_008F18_WIDTH(x) (((x) >> 0) & 0x3FFF) @@ -3974,6 +5255,10 @@ #define S_008F18_INTERLACED(x) (((x) & 0x1) << 31) #define G_008F18_INTERLACED(x) (((x) >> 31) & 0x1) #define C_008F18_INTERLACED 0x7FFFFFFF +#define R_030F1C_DB_OCCLUSION_COUNT3_HI 0x030F1C +#define S_030F1C_COUNT_HI(x) (((x) & 0x7FFFFFFF) << 0) +#define G_030F1C_COUNT_HI(x) (((x) >> 0) & 0x7FFFFFFF) +#define C_030F1C_COUNT_HI 0x80000000 #define R_008F1C_SQ_IMG_RSRC_WORD3 0x008F1C #define S_008F1C_DST_SEL_X(x) (((x) & 0x07) << 0) #define G_008F1C_DST_SEL_X(x) (((x) >> 0) & 0x07) @@ -4084,6 +5369,23 @@ #define G_008F28_LOD_HDW_CNT_EN(x) (((x) >> 20) & 0x1) #define C_008F28_LOD_HDW_CNT_EN 0xFFEFFFFF /* */ +/* VI */ +#define S_008F28_COMPRESSION_EN(x) (((x) & 0x1) << 21) +#define G_008F28_COMPRESSION_EN(x) (((x) >> 21) & 0x1) +#define C_008F28_COMPRESSION_EN 0xFFDFFFFF +#define S_008F28_ALPHA_IS_ON_MSB(x) (((x) & 0x1) << 22) +#define G_008F28_ALPHA_IS_ON_MSB(x) (((x) >> 22) & 0x1) +#define C_008F28_ALPHA_IS_ON_MSB 0xFFBFFFFF +#define S_008F28_COLOR_TRANSFORM(x) (((x) & 0x1) << 23) +#define G_008F28_COLOR_TRANSFORM(x) (((x) >> 23) & 0x1) +#define C_008F28_COLOR_TRANSFORM 0xFF7FFFFF +#define S_008F28_LOST_ALPHA_BITS(x) (((x) & 0x0F) << 24) +#define G_008F28_LOST_ALPHA_BITS(x) (((x) >> 24) & 0x0F) +#define C_008F28_LOST_ALPHA_BITS 0xF0FFFFFF +#define S_008F28_LOST_COLOR_BITS(x) (((x) & 0x0F) << 28) +#define G_008F28_LOST_COLOR_BITS(x) (((x) >> 28) & 0x0F) +#define C_008F28_LOST_COLOR_BITS 0x0FFFFFFF +/* */ #define R_008F2C_SQ_IMG_RSRC_WORD7 0x008F2C #define R_008F30_SQ_IMG_SAMP_WORD0 0x008F30 #define S_008F30_CLAMP_X(x) (((x) & 0x07) << 0) @@ -4148,6 +5450,11 @@ #define S_008F30_FILTER_MODE(x) (((x) & 0x03) << 29) #define G_008F30_FILTER_MODE(x) (((x) >> 29) & 0x03) #define C_008F30_FILTER_MODE 0x9FFFFFFF +/* VI */ +#define S_008F30_COMPAT_MODE(x) (((x) & 0x1) << 31) +#define G_008F30_COMPAT_MODE(x) (((x) >> 31) & 0x1) +#define C_008F30_COMPAT_MODE 0x7FFFFFFF +/* */ #define R_008F34_SQ_IMG_SAMP_WORD1 0x008F34 #define S_008F34_MIN_LOD(x) (((x) & 0xFFF) << 0) #define G_008F34_MIN_LOD(x) (((x) >> 0) & 0xFFF) @@ -4313,6 +5620,11 @@ #define G_008F44_OFFSET(x) (((x) >> 0) & 0xFFFFFF) #define C_008F44_OFFSET 0xFF000000 /* */ +#define R_030FF8_DB_ZPASS_COUNT_LOW 0x030FF8 +#define R_030FFC_DB_ZPASS_COUNT_HI 0x030FFC +#define S_030FFC_COUNT_HI(x) (((x) & 0x7FFFFFFF) << 0) +#define G_030FFC_COUNT_HI(x) (((x) >> 0) & 0x7FFFFFFF) +#define C_030FFC_COUNT_HI 0x80000000 #define R_009100_SPI_CONFIG_CNTL 0x009100 #define S_009100_GPR_WRITE_PRIORITY(x) (((x) & 0x1FFFFF) << 0) #define G_009100_GPR_WRITE_PRIORITY(x) (((x) >> 0) & 0x1FFFFF) @@ -4437,6 +5749,34 @@ #define S_009858_MSAA16_Y(x) (((x) & 0x03) << 18) #define G_009858_MSAA16_Y(x) (((x) >> 18) & 0x03) #define C_009858_MSAA16_Y 0xFFF3FFFF +#define R_0098F8_GB_ADDR_CONFIG 0x0098F8 +#define S_0098F8_NUM_PIPES(x) (((x) & 0x07) << 0) +#define G_0098F8_NUM_PIPES(x) (((x) >> 0) & 0x07) +#define C_0098F8_NUM_PIPES 0xFFFFFFF8 +#define S_0098F8_PIPE_INTERLEAVE_SIZE(x) (((x) & 0x07) << 4) +#define G_0098F8_PIPE_INTERLEAVE_SIZE(x) (((x) >> 4) & 0x07) +#define C_0098F8_PIPE_INTERLEAVE_SIZE 0xFFFFFF8F +#define S_0098F8_BANK_INTERLEAVE_SIZE(x) (((x) & 0x07) << 8) +#define G_0098F8_BANK_INTERLEAVE_SIZE(x) 
(((x) >> 8) & 0x07) +#define C_0098F8_BANK_INTERLEAVE_SIZE 0xFFFFF8FF +#define S_0098F8_NUM_SHADER_ENGINES(x) (((x) & 0x03) << 12) +#define G_0098F8_NUM_SHADER_ENGINES(x) (((x) >> 12) & 0x03) +#define C_0098F8_NUM_SHADER_ENGINES 0xFFFFCFFF +#define S_0098F8_SHADER_ENGINE_TILE_SIZE(x) (((x) & 0x07) << 16) +#define G_0098F8_SHADER_ENGINE_TILE_SIZE(x) (((x) >> 16) & 0x07) +#define C_0098F8_SHADER_ENGINE_TILE_SIZE 0xFFF8FFFF +#define S_0098F8_NUM_GPUS(x) (((x) & 0x07) << 20) +#define G_0098F8_NUM_GPUS(x) (((x) >> 20) & 0x07) +#define C_0098F8_NUM_GPUS 0xFF8FFFFF +#define S_0098F8_MULTI_GPU_TILE_SIZE(x) (((x) & 0x03) << 24) +#define G_0098F8_MULTI_GPU_TILE_SIZE(x) (((x) >> 24) & 0x03) +#define C_0098F8_MULTI_GPU_TILE_SIZE 0xFCFFFFFF +#define S_0098F8_ROW_SIZE(x) (((x) & 0x03) << 28) +#define G_0098F8_ROW_SIZE(x) (((x) >> 28) & 0x03) +#define C_0098F8_ROW_SIZE 0xCFFFFFFF +#define S_0098F8_NUM_LOWER_PIPES(x) (((x) & 0x1) << 30) +#define G_0098F8_NUM_LOWER_PIPES(x) (((x) >> 30) & 0x1) +#define C_0098F8_NUM_LOWER_PIPES 0xBFFFFFFF #define R_009910_GB_TILE_MODE0 0x009910 #define S_009910_MICRO_TILE_MODE(x) (((x) & 0x03) << 0) #define G_009910_MICRO_TILE_MODE(x) (((x) >> 0) & 0x03) @@ -4515,14 +5855,88 @@ #define V_009910_ADDR_SURF_4_BANK 0x01 #define V_009910_ADDR_SURF_8_BANK 0x02 #define V_009910_ADDR_SURF_16_BANK 0x03 -/* CIK */ #define S_009910_MICRO_TILE_MODE_NEW(x) (((x) & 0x07) << 22) #define G_009910_MICRO_TILE_MODE_NEW(x) (((x) >> 22) & 0x07) -#define C_009910_MICRO_TILE_MODE_NEW(x) 0xFE3FFFFF +#define C_009910_MICRO_TILE_MODE_NEW 0xFE3FFFFF #define V_009910_ADDR_SURF_DISPLAY_MICRO_TILING 0x00 #define V_009910_ADDR_SURF_THIN_MICRO_TILING 0x01 #define V_009910_ADDR_SURF_DEPTH_MICRO_TILING 0x02 #define V_009910_ADDR_SURF_ROTATED_MICRO_TILING 0x03 +#define S_009910_SAMPLE_SPLIT(x) (((x) & 0x03) << 25) +#define G_009910_SAMPLE_SPLIT(x) (((x) >> 25) & 0x03) +#define C_009910_SAMPLE_SPLIT 0xF9FFFFFF +#define R_009914_GB_TILE_MODE1 0x009914 +#define R_009918_GB_TILE_MODE2 0x009918 +#define R_00991C_GB_TILE_MODE3 0x00991C +#define R_009920_GB_TILE_MODE4 0x009920 +#define R_009924_GB_TILE_MODE5 0x009924 +#define R_009928_GB_TILE_MODE6 0x009928 +#define R_00992C_GB_TILE_MODE7 0x00992C +#define R_009930_GB_TILE_MODE8 0x009930 +#define R_009934_GB_TILE_MODE9 0x009934 +#define R_009938_GB_TILE_MODE10 0x009938 +#define R_00993C_GB_TILE_MODE11 0x00993C +#define R_009940_GB_TILE_MODE12 0x009940 +#define R_009944_GB_TILE_MODE13 0x009944 +#define R_009948_GB_TILE_MODE14 0x009948 +#define R_00994C_GB_TILE_MODE15 0x00994C +#define R_009950_GB_TILE_MODE16 0x009950 +#define R_009954_GB_TILE_MODE17 0x009954 +#define R_009958_GB_TILE_MODE18 0x009958 +#define R_00995C_GB_TILE_MODE19 0x00995C +#define R_009960_GB_TILE_MODE20 0x009960 +#define R_009964_GB_TILE_MODE21 0x009964 +#define R_009968_GB_TILE_MODE22 0x009968 +#define R_00996C_GB_TILE_MODE23 0x00996C +#define R_009970_GB_TILE_MODE24 0x009970 +#define R_009974_GB_TILE_MODE25 0x009974 +#define R_009978_GB_TILE_MODE26 0x009978 +#define R_00997C_GB_TILE_MODE27 0x00997C +#define R_009980_GB_TILE_MODE28 0x009980 +#define R_009984_GB_TILE_MODE29 0x009984 +#define R_009988_GB_TILE_MODE30 0x009988 +#define R_00998C_GB_TILE_MODE31 0x00998C +/* CIK */ +#define R_009990_GB_MACROTILE_MODE0 0x009990 +#define S_009990_BANK_WIDTH(x) (((x) & 0x03) << 0) +#define G_009990_BANK_WIDTH(x) (((x) >> 0) & 0x03) +#define C_009990_BANK_WIDTH 0xFFFFFFFC +#define S_009990_BANK_HEIGHT(x) (((x) & 0x03) << 2) +#define G_009990_BANK_HEIGHT(x) (((x) >> 2) & 0x03) +#define C_009990_BANK_HEIGHT 
0xFFFFFFF3 +#define S_009990_MACRO_TILE_ASPECT(x) (((x) & 0x03) << 4) +#define G_009990_MACRO_TILE_ASPECT(x) (((x) >> 4) & 0x03) +#define C_009990_MACRO_TILE_ASPECT 0xFFFFFFCF +#define S_009990_NUM_BANKS(x) (((x) & 0x03) << 6) +#define G_009990_NUM_BANKS(x) (((x) >> 6) & 0x03) +#define C_009990_NUM_BANKS 0xFFFFFF3F +#define R_009994_GB_MACROTILE_MODE1 0x009994 +#define R_009998_GB_MACROTILE_MODE2 0x009998 +#define R_00999C_GB_MACROTILE_MODE3 0x00999C +#define R_0099A0_GB_MACROTILE_MODE4 0x0099A0 +#define R_0099A4_GB_MACROTILE_MODE5 0x0099A4 +#define R_0099A8_GB_MACROTILE_MODE6 0x0099A8 +#define R_0099AC_GB_MACROTILE_MODE7 0x0099AC +#define R_0099B0_GB_MACROTILE_MODE8 0x0099B0 +#define R_0099B4_GB_MACROTILE_MODE9 0x0099B4 +#define R_0099B8_GB_MACROTILE_MODE10 0x0099B8 +#define R_0099BC_GB_MACROTILE_MODE11 0x0099BC +#define R_0099C0_GB_MACROTILE_MODE12 0x0099C0 +#define R_0099C4_GB_MACROTILE_MODE13 0x0099C4 +#define R_0099C8_GB_MACROTILE_MODE14 0x0099C8 +#define R_0099CC_GB_MACROTILE_MODE15 0x0099CC +/* */ +#define R_00B000_SPI_SHADER_TBA_LO_PS 0x00B000 +#define R_00B004_SPI_SHADER_TBA_HI_PS 0x00B004 +#define S_00B004_MEM_BASE(x) (((x) & 0xFF) << 0) +#define G_00B004_MEM_BASE(x) (((x) >> 0) & 0xFF) +#define C_00B004_MEM_BASE 0xFFFFFF00 +#define R_00B008_SPI_SHADER_TMA_LO_PS 0x00B008 +#define R_00B00C_SPI_SHADER_TMA_HI_PS 0x00B00C +#define S_00B00C_MEM_BASE(x) (((x) & 0xFF) << 0) +#define G_00B00C_MEM_BASE(x) (((x) >> 0) & 0xFF) +#define C_00B00C_MEM_BASE 0xFFFFFF00 +/* CIK */ #define R_00B01C_SPI_SHADER_PGM_RSRC3_PS 0x00B01C #define S_00B01C_CU_EN(x) (((x) & 0xFFFF) << 0) #define G_00B01C_CU_EN(x) (((x) >> 0) & 0xFFFF) @@ -4582,6 +5996,9 @@ #define S_00B02C_USER_SGPR(x) (((x) & 0x1F) << 1) #define G_00B02C_USER_SGPR(x) (((x) >> 1) & 0x1F) #define C_00B02C_USER_SGPR 0xFFFFFFC1 +#define S_00B02C_TRAP_PRESENT(x) (((x) & 0x1) << 6) +#define G_00B02C_TRAP_PRESENT(x) (((x) >> 6) & 0x1) +#define C_00B02C_TRAP_PRESENT 0xFFFFFFBF #define S_00B02C_WAVE_CNT_EN(x) (((x) & 0x1) << 7) #define G_00B02C_WAVE_CNT_EN(x) (((x) >> 7) & 0x1) #define C_00B02C_WAVE_CNT_EN 0xFFFFFF7F @@ -4591,6 +6008,9 @@ #define S_00B02C_EXCP_EN(x) (((x) & 0x7F) << 16) /* mask is 0x1FF on CIK */ #define G_00B02C_EXCP_EN(x) (((x) >> 16) & 0x7F) /* mask is 0x1FF on CIK */ #define C_00B02C_EXCP_EN 0xFF80FFFF /* mask is 0x1FF on CIK */ +#define S_00B02C_EXCP_EN_CIK(x) (((x) & 0x1FF) << 16) +#define G_00B02C_EXCP_EN_CIK(x) (((x) >> 16) & 0x1FF) +#define C_00B02C_EXCP_EN_CIK 0xFE00FFFF #define R_00B030_SPI_SHADER_USER_DATA_PS_0 0x00B030 #define R_00B034_SPI_SHADER_USER_DATA_PS_1 0x00B034 #define R_00B038_SPI_SHADER_USER_DATA_PS_2 0x00B038 @@ -4607,6 +6027,16 @@ #define R_00B064_SPI_SHADER_USER_DATA_PS_13 0x00B064 #define R_00B068_SPI_SHADER_USER_DATA_PS_14 0x00B068 #define R_00B06C_SPI_SHADER_USER_DATA_PS_15 0x00B06C +#define R_00B100_SPI_SHADER_TBA_LO_VS 0x00B100 +#define R_00B104_SPI_SHADER_TBA_HI_VS 0x00B104 +#define S_00B104_MEM_BASE(x) (((x) & 0xFF) << 0) +#define G_00B104_MEM_BASE(x) (((x) >> 0) & 0xFF) +#define C_00B104_MEM_BASE 0xFFFFFF00 +#define R_00B108_SPI_SHADER_TMA_LO_VS 0x00B108 +#define R_00B10C_SPI_SHADER_TMA_HI_VS 0x00B10C +#define S_00B10C_MEM_BASE(x) (((x) & 0xFF) << 0) +#define G_00B10C_MEM_BASE(x) (((x) >> 0) & 0xFF) +#define C_00B10C_MEM_BASE 0xFFFFFF00 /* CIK */ #define R_00B118_SPI_SHADER_PGM_RSRC3_VS 0x00B118 #define S_00B118_CU_EN(x) (((x) & 0xFFFF) << 0) @@ -4674,6 +6104,9 @@ #define S_00B12C_USER_SGPR(x) (((x) & 0x1F) << 1) #define G_00B12C_USER_SGPR(x) (((x) >> 1) & 0x1F) #define C_00B12C_USER_SGPR 
0xFFFFFFC1 +#define S_00B12C_TRAP_PRESENT(x) (((x) & 0x1) << 6) +#define G_00B12C_TRAP_PRESENT(x) (((x) >> 6) & 0x1) +#define C_00B12C_TRAP_PRESENT 0xFFFFFFBF #define S_00B12C_OC_LDS_EN(x) (((x) & 0x1) << 7) #define G_00B12C_OC_LDS_EN(x) (((x) >> 7) & 0x1) #define C_00B12C_OC_LDS_EN 0xFFFFFF7F @@ -4695,6 +6128,14 @@ #define S_00B12C_EXCP_EN(x) (((x) & 0x7F) << 13) /* mask is 0x1FF on CIK */ #define G_00B12C_EXCP_EN(x) (((x) >> 13) & 0x7F) /* mask is 0x1FF on CIK */ #define C_00B12C_EXCP_EN 0xFFF01FFF /* mask is 0x1FF on CIK */ +#define S_00B12C_EXCP_EN_CIK(x) (((x) & 0x1FF) << 13) +#define G_00B12C_EXCP_EN_CIK(x) (((x) >> 13) & 0x1FF) +#define C_00B12C_EXCP_EN_CIK 0xFFC01FFF +/* VI */ +#define S_00B12C_DISPATCH_DRAW_EN(x) (((x) & 0x1) << 24) +#define G_00B12C_DISPATCH_DRAW_EN(x) (((x) >> 24) & 0x1) +#define C_00B12C_DISPATCH_DRAW_EN 0xFEFFFFFF +/* */ #define R_00B130_SPI_SHADER_USER_DATA_VS_0 0x00B130 #define R_00B134_SPI_SHADER_USER_DATA_VS_1 0x00B134 #define R_00B138_SPI_SHADER_USER_DATA_VS_2 0x00B138 @@ -4711,6 +6152,16 @@ #define R_00B164_SPI_SHADER_USER_DATA_VS_13 0x00B164 #define R_00B168_SPI_SHADER_USER_DATA_VS_14 0x00B168 #define R_00B16C_SPI_SHADER_USER_DATA_VS_15 0x00B16C +#define R_00B200_SPI_SHADER_TBA_LO_GS 0x00B200 +#define R_00B204_SPI_SHADER_TBA_HI_GS 0x00B204 +#define S_00B204_MEM_BASE(x) (((x) & 0xFF) << 0) +#define G_00B204_MEM_BASE(x) (((x) >> 0) & 0xFF) +#define C_00B204_MEM_BASE 0xFFFFFF00 +#define R_00B208_SPI_SHADER_TMA_LO_GS 0x00B208 +#define R_00B20C_SPI_SHADER_TMA_HI_GS 0x00B20C +#define S_00B20C_MEM_BASE(x) (((x) & 0xFF) << 0) +#define G_00B20C_MEM_BASE(x) (((x) >> 0) & 0xFF) +#define C_00B20C_MEM_BASE 0xFFFFFF00 /* CIK */ #define R_00B21C_SPI_SHADER_PGM_RSRC3_GS 0x00B21C #define S_00B21C_CU_EN(x) (((x) & 0xFFFF) << 0) @@ -4723,6 +6174,11 @@ #define G_00B21C_LOCK_LOW_THRESHOLD(x) (((x) >> 22) & 0x0F) #define C_00B21C_LOCK_LOW_THRESHOLD 0xFC3FFFFF /* */ +/* VI */ +#define S_00B21C_GROUP_FIFO_DEPTH(x) (((x) & 0x3F) << 26) +#define G_00B21C_GROUP_FIFO_DEPTH(x) (((x) >> 26) & 0x3F) +#define C_00B21C_GROUP_FIFO_DEPTH 0x03FFFFFF +/* */ #define R_00B220_SPI_SHADER_PGM_LO_GS 0x00B220 #define R_00B224_SPI_SHADER_PGM_HI_GS 0x00B224 #define S_00B224_MEM_BASE(x) (((x) & 0xFF) << 0) @@ -4771,10 +6227,41 @@ #define S_00B22C_USER_SGPR(x) (((x) & 0x1F) << 1) #define G_00B22C_USER_SGPR(x) (((x) >> 1) & 0x1F) #define C_00B22C_USER_SGPR 0xFFFFFFC1 +#define S_00B22C_TRAP_PRESENT(x) (((x) & 0x1) << 6) +#define G_00B22C_TRAP_PRESENT(x) (((x) >> 6) & 0x1) +#define C_00B22C_TRAP_PRESENT 0xFFFFFFBF #define S_00B22C_EXCP_EN(x) (((x) & 0x7F) << 7) /* mask is 0x1FF on CIK */ #define G_00B22C_EXCP_EN(x) (((x) >> 7) & 0x7F) /* mask is 0x1FF on CIK */ #define C_00B22C_EXCP_EN 0xFFFFC07F /* mask is 0x1FF on CIK */ +#define S_00B22C_EXCP_EN_CIK(x) (((x) & 0x1FF) << 7) +#define G_00B22C_EXCP_EN_CIK(x) (((x) >> 7) & 0x1FF) +#define C_00B22C_EXCP_EN_CIK 0xFFFF007F #define R_00B230_SPI_SHADER_USER_DATA_GS_0 0x00B230 +#define R_00B234_SPI_SHADER_USER_DATA_GS_1 0x00B234 +#define R_00B238_SPI_SHADER_USER_DATA_GS_2 0x00B238 +#define R_00B23C_SPI_SHADER_USER_DATA_GS_3 0x00B23C +#define R_00B240_SPI_SHADER_USER_DATA_GS_4 0x00B240 +#define R_00B244_SPI_SHADER_USER_DATA_GS_5 0x00B244 +#define R_00B248_SPI_SHADER_USER_DATA_GS_6 0x00B248 +#define R_00B24C_SPI_SHADER_USER_DATA_GS_7 0x00B24C +#define R_00B250_SPI_SHADER_USER_DATA_GS_8 0x00B250 +#define R_00B254_SPI_SHADER_USER_DATA_GS_9 0x00B254 +#define R_00B258_SPI_SHADER_USER_DATA_GS_10 0x00B258 +#define R_00B25C_SPI_SHADER_USER_DATA_GS_11 0x00B25C +#define 
R_00B260_SPI_SHADER_USER_DATA_GS_12 0x00B260 +#define R_00B264_SPI_SHADER_USER_DATA_GS_13 0x00B264 +#define R_00B268_SPI_SHADER_USER_DATA_GS_14 0x00B268 +#define R_00B26C_SPI_SHADER_USER_DATA_GS_15 0x00B26C +#define R_00B300_SPI_SHADER_TBA_LO_ES 0x00B300 +#define R_00B304_SPI_SHADER_TBA_HI_ES 0x00B304 +#define S_00B304_MEM_BASE(x) (((x) & 0xFF) << 0) +#define G_00B304_MEM_BASE(x) (((x) >> 0) & 0xFF) +#define C_00B304_MEM_BASE 0xFFFFFF00 +#define R_00B308_SPI_SHADER_TMA_LO_ES 0x00B308 +#define R_00B30C_SPI_SHADER_TMA_HI_ES 0x00B30C +#define S_00B30C_MEM_BASE(x) (((x) & 0xFF) << 0) +#define G_00B30C_MEM_BASE(x) (((x) >> 0) & 0xFF) +#define C_00B30C_MEM_BASE 0xFFFFFF00 /* CIK */ #define R_00B31C_SPI_SHADER_PGM_RSRC3_ES 0x00B31C #define S_00B31C_CU_EN(x) (((x) & 0xFFFF) << 0) @@ -4787,6 +6274,11 @@ #define G_00B31C_LOCK_LOW_THRESHOLD(x) (((x) >> 22) & 0x0F) #define C_00B31C_LOCK_LOW_THRESHOLD 0xFC3FFFFF /* */ +/* VI */ +#define S_00B31C_GROUP_FIFO_DEPTH(x) (((x) & 0x3F) << 26) +#define G_00B31C_GROUP_FIFO_DEPTH(x) (((x) >> 26) & 0x3F) +#define C_00B31C_GROUP_FIFO_DEPTH 0x03FFFFFF +/* */ #define R_00B320_SPI_SHADER_PGM_LO_ES 0x00B320 #define R_00B324_SPI_SHADER_PGM_HI_ES 0x00B324 #define S_00B324_MEM_BASE(x) (((x) & 0xFF) << 0) @@ -4838,6 +6330,9 @@ #define S_00B32C_USER_SGPR(x) (((x) & 0x1F) << 1) #define G_00B32C_USER_SGPR(x) (((x) >> 1) & 0x1F) #define C_00B32C_USER_SGPR 0xFFFFFFC1 +#define S_00B32C_TRAP_PRESENT(x) (((x) & 0x1) << 6) +#define G_00B32C_TRAP_PRESENT(x) (((x) >> 6) & 0x1) +#define C_00B32C_TRAP_PRESENT 0xFFFFFFBF #define S_00B32C_OC_LDS_EN(x) (((x) & 0x1) << 7) #define G_00B32C_OC_LDS_EN(x) (((x) >> 7) & 0x1) #define C_00B32C_OC_LDS_EN 0xFFFFFF7F @@ -4848,6 +6343,31 @@ #define G_00B32C_LDS_SIZE(x) (((x) >> 20) & 0x1FF) /* CIK, for on-chip GS */ #define C_00B32C_LDS_SIZE 0xE00FFFFF /* CIK, for on-chip GS */ #define R_00B330_SPI_SHADER_USER_DATA_ES_0 0x00B330 +#define R_00B334_SPI_SHADER_USER_DATA_ES_1 0x00B334 +#define R_00B338_SPI_SHADER_USER_DATA_ES_2 0x00B338 +#define R_00B33C_SPI_SHADER_USER_DATA_ES_3 0x00B33C +#define R_00B340_SPI_SHADER_USER_DATA_ES_4 0x00B340 +#define R_00B344_SPI_SHADER_USER_DATA_ES_5 0x00B344 +#define R_00B348_SPI_SHADER_USER_DATA_ES_6 0x00B348 +#define R_00B34C_SPI_SHADER_USER_DATA_ES_7 0x00B34C +#define R_00B350_SPI_SHADER_USER_DATA_ES_8 0x00B350 +#define R_00B354_SPI_SHADER_USER_DATA_ES_9 0x00B354 +#define R_00B358_SPI_SHADER_USER_DATA_ES_10 0x00B358 +#define R_00B35C_SPI_SHADER_USER_DATA_ES_11 0x00B35C +#define R_00B360_SPI_SHADER_USER_DATA_ES_12 0x00B360 +#define R_00B364_SPI_SHADER_USER_DATA_ES_13 0x00B364 +#define R_00B368_SPI_SHADER_USER_DATA_ES_14 0x00B368 +#define R_00B36C_SPI_SHADER_USER_DATA_ES_15 0x00B36C +#define R_00B400_SPI_SHADER_TBA_LO_HS 0x00B400 +#define R_00B404_SPI_SHADER_TBA_HI_HS 0x00B404 +#define S_00B404_MEM_BASE(x) (((x) & 0xFF) << 0) +#define G_00B404_MEM_BASE(x) (((x) >> 0) & 0xFF) +#define C_00B404_MEM_BASE 0xFFFFFF00 +#define R_00B408_SPI_SHADER_TMA_LO_HS 0x00B408 +#define R_00B40C_SPI_SHADER_TMA_HI_HS 0x00B40C +#define S_00B40C_MEM_BASE(x) (((x) & 0xFF) << 0) +#define G_00B40C_MEM_BASE(x) (((x) >> 0) & 0xFF) +#define C_00B40C_MEM_BASE 0xFFFFFF00 /* CIK */ #define R_00B41C_SPI_SHADER_PGM_RSRC3_HS 0x00B41C #define S_00B41C_WAVE_LIMIT(x) (((x) & 0x3F) << 0) @@ -4857,6 +6377,11 @@ #define G_00B41C_LOCK_LOW_THRESHOLD(x) (((x) >> 6) & 0x0F) #define C_00B41C_LOCK_LOW_THRESHOLD 0xFFFFFC3F /* */ +/* VI */ +#define S_00B41C_GROUP_FIFO_DEPTH(x) (((x) & 0x3F) << 10) +#define G_00B41C_GROUP_FIFO_DEPTH(x) (((x) >> 10) & 0x3F) +#define 
C_00B41C_GROUP_FIFO_DEPTH 0xFFFF03FF +/* */ #define R_00B420_SPI_SHADER_PGM_LO_HS 0x00B420 #define R_00B424_SPI_SHADER_PGM_HI_HS 0x00B424 #define S_00B424_MEM_BASE(x) (((x) & 0xFF) << 0) @@ -4902,6 +6427,9 @@ #define S_00B42C_USER_SGPR(x) (((x) & 0x1F) << 1) #define G_00B42C_USER_SGPR(x) (((x) >> 1) & 0x1F) #define C_00B42C_USER_SGPR 0xFFFFFFC1 +#define S_00B42C_TRAP_PRESENT(x) (((x) & 0x1) << 6) +#define G_00B42C_TRAP_PRESENT(x) (((x) >> 6) & 0x1) +#define C_00B42C_TRAP_PRESENT 0xFFFFFFBF #define S_00B42C_OC_LDS_EN(x) (((x) & 0x1) << 7) #define G_00B42C_OC_LDS_EN(x) (((x) >> 7) & 0x1) #define C_00B42C_OC_LDS_EN 0xFFFFFF7F @@ -4912,6 +6440,31 @@ #define G_00B42C_EXCP_EN(x) (((x) >> 9) & 0x7F) /* mask is 0x1FF on CIK */ #define C_00B42C_EXCP_EN 0xFFFF01FF /* mask is 0x1FF on CIK */ #define R_00B430_SPI_SHADER_USER_DATA_HS_0 0x00B430 +#define R_00B434_SPI_SHADER_USER_DATA_HS_1 0x00B434 +#define R_00B438_SPI_SHADER_USER_DATA_HS_2 0x00B438 +#define R_00B43C_SPI_SHADER_USER_DATA_HS_3 0x00B43C +#define R_00B440_SPI_SHADER_USER_DATA_HS_4 0x00B440 +#define R_00B444_SPI_SHADER_USER_DATA_HS_5 0x00B444 +#define R_00B448_SPI_SHADER_USER_DATA_HS_6 0x00B448 +#define R_00B44C_SPI_SHADER_USER_DATA_HS_7 0x00B44C +#define R_00B450_SPI_SHADER_USER_DATA_HS_8 0x00B450 +#define R_00B454_SPI_SHADER_USER_DATA_HS_9 0x00B454 +#define R_00B458_SPI_SHADER_USER_DATA_HS_10 0x00B458 +#define R_00B45C_SPI_SHADER_USER_DATA_HS_11 0x00B45C +#define R_00B460_SPI_SHADER_USER_DATA_HS_12 0x00B460 +#define R_00B464_SPI_SHADER_USER_DATA_HS_13 0x00B464 +#define R_00B468_SPI_SHADER_USER_DATA_HS_14 0x00B468 +#define R_00B46C_SPI_SHADER_USER_DATA_HS_15 0x00B46C +#define R_00B500_SPI_SHADER_TBA_LO_LS 0x00B500 +#define R_00B504_SPI_SHADER_TBA_HI_LS 0x00B504 +#define S_00B504_MEM_BASE(x) (((x) & 0xFF) << 0) +#define G_00B504_MEM_BASE(x) (((x) >> 0) & 0xFF) +#define C_00B504_MEM_BASE 0xFFFFFF00 +#define R_00B508_SPI_SHADER_TMA_LO_LS 0x00B508 +#define R_00B50C_SPI_SHADER_TMA_HI_LS 0x00B50C +#define S_00B50C_MEM_BASE(x) (((x) & 0xFF) << 0) +#define G_00B50C_MEM_BASE(x) (((x) >> 0) & 0xFF) +#define C_00B50C_MEM_BASE 0xFFFFFF00 /* CIK */ #define R_00B51C_SPI_SHADER_PGM_RSRC3_LS 0x00B51C #define S_00B51C_CU_EN(x) (((x) & 0xFFFF) << 0) @@ -4924,6 +6477,11 @@ #define G_00B51C_LOCK_LOW_THRESHOLD(x) (((x) >> 22) & 0x0F) #define C_00B51C_LOCK_LOW_THRESHOLD 0xFC3FFFFF /* */ +/* VI */ +#define S_00B51C_GROUP_FIFO_DEPTH(x) (((x) & 0x3F) << 26) +#define G_00B51C_GROUP_FIFO_DEPTH(x) (((x) >> 26) & 0x3F) +#define C_00B51C_GROUP_FIFO_DEPTH 0x03FFFFFF +/* */ #define R_00B520_SPI_SHADER_PGM_LO_LS 0x00B520 #define R_00B524_SPI_SHADER_PGM_HI_LS 0x00B524 #define S_00B524_MEM_BASE(x) (((x) & 0xFF) << 0) @@ -4972,6 +6530,9 @@ #define S_00B52C_USER_SGPR(x) (((x) & 0x1F) << 1) #define G_00B52C_USER_SGPR(x) (((x) >> 1) & 0x1F) #define C_00B52C_USER_SGPR 0xFFFFFFC1 +#define S_00B52C_TRAP_PRESENT(x) (((x) & 0x1) << 6) +#define G_00B52C_TRAP_PRESENT(x) (((x) >> 6) & 0x1) +#define C_00B52C_TRAP_PRESENT 0xFFFFFFBF #define S_00B52C_LDS_SIZE(x) (((x) & 0x1FF) << 7) #define G_00B52C_LDS_SIZE(x) (((x) >> 7) & 0x1FF) #define C_00B52C_LDS_SIZE 0xFFFF007F @@ -4979,6 +6540,21 @@ #define G_00B52C_EXCP_EN(x) (((x) >> 16) & 0x7F) /* mask is 0x1FF on CIK */ #define C_00B52C_EXCP_EN 0xFF80FFFF /* mask is 0x1FF on CIK */ #define R_00B530_SPI_SHADER_USER_DATA_LS_0 0x00B530 +#define R_00B534_SPI_SHADER_USER_DATA_LS_1 0x00B534 +#define R_00B538_SPI_SHADER_USER_DATA_LS_2 0x00B538 +#define R_00B53C_SPI_SHADER_USER_DATA_LS_3 0x00B53C +#define R_00B540_SPI_SHADER_USER_DATA_LS_4 0x00B540 
+#define R_00B544_SPI_SHADER_USER_DATA_LS_5 0x00B544 +#define R_00B548_SPI_SHADER_USER_DATA_LS_6 0x00B548 +#define R_00B54C_SPI_SHADER_USER_DATA_LS_7 0x00B54C +#define R_00B550_SPI_SHADER_USER_DATA_LS_8 0x00B550 +#define R_00B554_SPI_SHADER_USER_DATA_LS_9 0x00B554 +#define R_00B558_SPI_SHADER_USER_DATA_LS_10 0x00B558 +#define R_00B55C_SPI_SHADER_USER_DATA_LS_11 0x00B55C +#define R_00B560_SPI_SHADER_USER_DATA_LS_12 0x00B560 +#define R_00B564_SPI_SHADER_USER_DATA_LS_13 0x00B564 +#define R_00B568_SPI_SHADER_USER_DATA_LS_14 0x00B568 +#define R_00B56C_SPI_SHADER_USER_DATA_LS_15 0x00B56C #define R_00B800_COMPUTE_DISPATCH_INITIATOR 0x00B800 #define S_00B800_COMPUTE_SHADER_EN(x) (((x) & 0x1) << 0) #define G_00B800_COMPUTE_SHADER_EN(x) (((x) >> 0) & 0x1) @@ -5049,6 +6625,16 @@ #define S_00B82C_MAX_WAVE_ID(x) (((x) & 0xFFF) << 0) #define G_00B82C_MAX_WAVE_ID(x) (((x) >> 0) & 0xFFF) #define C_00B82C_MAX_WAVE_ID 0xFFFFF000 +/* CIK */ +#define R_00B828_COMPUTE_PIPELINESTAT_ENABLE 0x00B828 +#define S_00B828_PIPELINESTAT_ENABLE(x) (((x) & 0x1) << 0) +#define G_00B828_PIPELINESTAT_ENABLE(x) (((x) >> 0) & 0x1) +#define C_00B828_PIPELINESTAT_ENABLE 0xFFFFFFFE +#define R_00B82C_COMPUTE_PERFCOUNT_ENABLE 0x00B82C +#define S_00B82C_PERFCOUNT_ENABLE(x) (((x) & 0x1) << 0) +#define G_00B82C_PERFCOUNT_ENABLE(x) (((x) >> 0) & 0x1) +#define C_00B82C_PERFCOUNT_ENABLE 0xFFFFFFFE +/* */ #define R_00B830_COMPUTE_PGM_LO 0x00B830 #define R_00B834_COMPUTE_PGM_HI 0x00B834 #define S_00B834_DATA(x) (((x) & 0xFF) << 0) @@ -5059,6 +6645,16 @@ #define G_00B834_INST_ATC(x) (((x) >> 8) & 0x1) #define C_00B834_INST_ATC 0xFFFFFEFF /* */ +#define R_00B838_COMPUTE_TBA_LO 0x00B838 +#define R_00B83C_COMPUTE_TBA_HI 0x00B83C +#define S_00B83C_DATA(x) (((x) & 0xFF) << 0) +#define G_00B83C_DATA(x) (((x) >> 0) & 0xFF) +#define C_00B83C_DATA 0xFFFFFF00 +#define R_00B840_COMPUTE_TMA_LO 0x00B840 +#define R_00B844_COMPUTE_TMA_HI 0x00B844 +#define S_00B844_DATA(x) (((x) & 0xFF) << 0) +#define G_00B844_DATA(x) (((x) >> 0) & 0xFF) +#define C_00B844_DATA 0xFFFFFF00 #define R_00B848_COMPUTE_PGM_RSRC1 0x00B848 #define S_00B848_VGPRS(x) (((x) & 0x3F) << 0) #define G_00B848_VGPRS(x) (((x) >> 0) & 0x3F) @@ -5099,6 +6695,9 @@ #define S_00B84C_USER_SGPR(x) (((x) & 0x1F) << 1) #define G_00B84C_USER_SGPR(x) (((x) >> 1) & 0x1F) #define C_00B84C_USER_SGPR 0xFFFFFFC1 +#define S_00B84C_TRAP_PRESENT(x) (((x) & 0x1) << 6) +#define G_00B84C_TRAP_PRESENT(x) (((x) >> 6) & 0x1) +#define C_00B84C_TRAP_PRESENT 0xFFFFFFBF #define S_00B84C_TGID_X_EN(x) (((x) & 0x1) << 7) #define G_00B84C_TGID_X_EN(x) (((x) >> 7) & 0x1) #define C_00B84C_TGID_X_EN 0xFFFFFF7F @@ -5125,6 +6724,10 @@ #define S_00B84C_EXCP_EN(x) (((x) & 0x7F) << 24) #define G_00B84C_EXCP_EN(x) (((x) >> 24) & 0x7F) #define C_00B84C_EXCP_EN 0x80FFFFFF +#define R_00B850_COMPUTE_VMID 0x00B850 +#define S_00B850_DATA(x) (((x) & 0x0F) << 0) +#define G_00B850_DATA(x) (((x) >> 0) & 0x0F) +#define C_00B850_DATA 0xFFFFFFF0 #define R_00B854_COMPUTE_RESOURCE_LIMITS 0x00B854 #define S_00B854_WAVES_PER_SH(x) (((x) & 0x3F) << 0) /* mask is 0x3FF on CIK */ #define G_00B854_WAVES_PER_SH(x) (((x) >> 0) & 0x3F) /* mask is 0x3FF on CIK */ @@ -5167,7 +6770,84 @@ #define S_00B860_WAVESIZE(x) (((x) & 0x1FFF) << 12) #define G_00B860_WAVESIZE(x) (((x) >> 12) & 0x1FFF) #define C_00B860_WAVESIZE 0xFE000FFF +/* CIK */ +#define R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2 0x00B864 +#define S_00B864_SH0_CU_EN(x) (((x) & 0xFFFF) << 0) +#define G_00B864_SH0_CU_EN(x) (((x) >> 0) & 0xFFFF) +#define C_00B864_SH0_CU_EN 0xFFFF0000 +#define 
S_00B864_SH1_CU_EN(x) (((x) & 0xFFFF) << 16) +#define G_00B864_SH1_CU_EN(x) (((x) >> 16) & 0xFFFF) +#define C_00B864_SH1_CU_EN 0x0000FFFF +#define R_00B868_COMPUTE_STATIC_THREAD_MGMT_SE3 0x00B868 +#define S_00B868_SH0_CU_EN(x) (((x) & 0xFFFF) << 0) +#define G_00B868_SH0_CU_EN(x) (((x) >> 0) & 0xFFFF) +#define C_00B868_SH0_CU_EN 0xFFFF0000 +#define S_00B868_SH1_CU_EN(x) (((x) & 0xFFFF) << 16) +#define G_00B868_SH1_CU_EN(x) (((x) >> 16) & 0xFFFF) +#define C_00B868_SH1_CU_EN 0x0000FFFF +#define R_00B86C_COMPUTE_RESTART_X 0x00B86C +#define R_00B870_COMPUTE_RESTART_Y 0x00B870 +#define R_00B874_COMPUTE_RESTART_Z 0x00B874 +#define R_00B87C_COMPUTE_MISC_RESERVED 0x00B87C +#define S_00B87C_SEND_SEID(x) (((x) & 0x03) << 0) +#define G_00B87C_SEND_SEID(x) (((x) >> 0) & 0x03) +#define C_00B87C_SEND_SEID 0xFFFFFFFC +#define S_00B87C_RESERVED2(x) (((x) & 0x1) << 2) +#define G_00B87C_RESERVED2(x) (((x) >> 2) & 0x1) +#define C_00B87C_RESERVED2 0xFFFFFFFB +#define S_00B87C_RESERVED3(x) (((x) & 0x1) << 3) +#define G_00B87C_RESERVED3(x) (((x) >> 3) & 0x1) +#define C_00B87C_RESERVED3 0xFFFFFFF7 +#define S_00B87C_RESERVED4(x) (((x) & 0x1) << 4) +#define G_00B87C_RESERVED4(x) (((x) >> 4) & 0x1) +#define C_00B87C_RESERVED4 0xFFFFFFEF +/* VI */ +#define S_00B87C_WAVE_ID_BASE(x) (((x) & 0xFFF) << 5) +#define G_00B87C_WAVE_ID_BASE(x) (((x) >> 5) & 0xFFF) +#define C_00B87C_WAVE_ID_BASE 0xFFFE001F +#define R_00B880_COMPUTE_DISPATCH_ID 0x00B880 +#define R_00B884_COMPUTE_THREADGROUP_ID 0x00B884 +#define R_00B888_COMPUTE_RELAUNCH 0x00B888 +#define S_00B888_PAYLOAD(x) (((x) & 0x3FFFFFFF) << 0) +#define G_00B888_PAYLOAD(x) (((x) >> 0) & 0x3FFFFFFF) +#define C_00B888_PAYLOAD 0xC0000000 +#define S_00B888_IS_EVENT(x) (((x) & 0x1) << 30) +#define G_00B888_IS_EVENT(x) (((x) >> 30) & 0x1) +#define C_00B888_IS_EVENT 0xBFFFFFFF +#define S_00B888_IS_STATE(x) (((x) & 0x1) << 31) +#define G_00B888_IS_STATE(x) (((x) >> 31) & 0x1) +#define C_00B888_IS_STATE 0x7FFFFFFF +#define R_00B88C_COMPUTE_WAVE_RESTORE_ADDR_LO 0x00B88C +#define R_00B890_COMPUTE_WAVE_RESTORE_ADDR_HI 0x00B890 +#define S_00B890_ADDR(x) (((x) & 0xFFFF) << 0) +#define G_00B890_ADDR(x) (((x) >> 0) & 0xFFFF) +#define C_00B890_ADDR 0xFFFF0000 +#define R_00B894_COMPUTE_WAVE_RESTORE_CONTROL 0x00B894 +#define S_00B894_ATC(x) (((x) & 0x1) << 0) +#define G_00B894_ATC(x) (((x) >> 0) & 0x1) +#define C_00B894_ATC 0xFFFFFFFE +#define S_00B894_MTYPE(x) (((x) & 0x03) << 1) +#define G_00B894_MTYPE(x) (((x) >> 1) & 0x03) +#define C_00B894_MTYPE 0xFFFFFFF9 +/* */ +/* */ #define R_00B900_COMPUTE_USER_DATA_0 0x00B900 +#define R_00B904_COMPUTE_USER_DATA_1 0x00B904 +#define R_00B908_COMPUTE_USER_DATA_2 0x00B908 +#define R_00B90C_COMPUTE_USER_DATA_3 0x00B90C +#define R_00B910_COMPUTE_USER_DATA_4 0x00B910 +#define R_00B914_COMPUTE_USER_DATA_5 0x00B914 +#define R_00B918_COMPUTE_USER_DATA_6 0x00B918 +#define R_00B91C_COMPUTE_USER_DATA_7 0x00B91C +#define R_00B920_COMPUTE_USER_DATA_8 0x00B920 +#define R_00B924_COMPUTE_USER_DATA_9 0x00B924 +#define R_00B928_COMPUTE_USER_DATA_10 0x00B928 +#define R_00B92C_COMPUTE_USER_DATA_11 0x00B92C +#define R_00B930_COMPUTE_USER_DATA_12 0x00B930 +#define R_00B934_COMPUTE_USER_DATA_13 0x00B934 +#define R_00B938_COMPUTE_USER_DATA_14 0x00B938 +#define R_00B93C_COMPUTE_USER_DATA_15 0x00B93C +#define R_00B9FC_COMPUTE_NOWHERE 0x00B9FC #define R_028000_DB_RENDER_CONTROL 0x028000 #define S_028000_DEPTH_CLEAR_ENABLE(x) (((x) & 0x1) << 0) #define G_028000_DEPTH_CLEAR_ENABLE(x) (((x) >> 0) & 0x1) @@ -5196,6 +6876,11 @@ #define S_028000_COPY_SAMPLE(x) (((x) & 0x0F) << 8) 
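(A second hypothetical aside: every field in this header comes as an S_/G_/C_ triple, such as the TRAP_PRESENT bits added above, where S_ shifts a value into place, G_ extracts it, and C_ is the AND-mask that clears it. A sketch of packing COMPUTE_PGM_RSRC2 with these helpers; the function and argument names are invented, not from the patch.)

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Illustrative sketch only: compose a register word by OR-ing S_ macros,
 * read a field back with G_, and clear one with its C_ mask. */
static uint32_t pack_compute_pgm_rsrc2(unsigned user_sgprs, bool trap)
{
   uint32_t rsrc2 = S_00B84C_USER_SGPR(user_sgprs) |
                    S_00B84C_TGID_X_EN(1);

   /* Read-modify-write idiom: C_00B84C_TRAP_PRESENT == 0xFFFFFFBF clears
    * bit 6 before the new value is OR-ed in. */
   rsrc2 = (rsrc2 & C_00B84C_TRAP_PRESENT) | S_00B84C_TRAP_PRESENT(trap);

   assert(G_00B84C_USER_SGPR(rsrc2) == (user_sgprs & 0x1F));
   return rsrc2;
}

(End of aside; the patch resumes with the DB_RENDER_CONTROL hunk.)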
#define G_028000_COPY_SAMPLE(x) (((x) >> 8) & 0x0F) #define C_028000_COPY_SAMPLE 0xFFFFF0FF +/* VI */ +#define S_028000_DECOMPRESS_ENABLE(x) (((x) & 0x1) << 12) +#define G_028000_DECOMPRESS_ENABLE(x) (((x) >> 12) & 0x1) +#define C_028000_DECOMPRESS_ENABLE 0xFFFFEFFF +/* */ #define R_028004_DB_COUNT_CONTROL 0x028004 #define S_028004_ZPASS_INCREMENT_DISABLE(x) (((x) & 0x1) << 0) #define G_028004_ZPASS_INCREMENT_DISABLE(x) (((x) >> 0) & 0x1) @@ -5474,9 +7159,6 @@ #define S_028040_NUM_SAMPLES(x) (((x) & 0x03) << 2) #define G_028040_NUM_SAMPLES(x) (((x) >> 2) & 0x03) #define C_028040_NUM_SAMPLES 0xFFFFFFF3 -#define S_028040_TILE_MODE_INDEX(x) (((x) & 0x07) << 20) /* not on CIK */ -#define G_028040_TILE_MODE_INDEX(x) (((x) >> 20) & 0x07) /* not on CIK */ -#define C_028040_TILE_MODE_INDEX 0xFF8FFFFF /* not on CIK */ /* CIK */ #define S_028040_TILE_SPLIT(x) (((x) & 0x07) << 13) #define G_028040_TILE_SPLIT(x) (((x) >> 13) & 0x07) @@ -5489,6 +7171,14 @@ #define V_028040_ADDR_SURF_TILE_SPLIT_2KB 0x05 #define V_028040_ADDR_SURF_TILE_SPLIT_4KB 0x06 /* */ +#define S_028040_TILE_MODE_INDEX(x) (((x) & 0x07) << 20) /* not on CIK */ +#define G_028040_TILE_MODE_INDEX(x) (((x) >> 20) & 0x07) /* not on CIK */ +#define C_028040_TILE_MODE_INDEX 0xFF8FFFFF /* not on CIK */ +/* VI */ +#define S_028040_DECOMPRESS_ON_N_ZPLANES(x) (((x) & 0x0F) << 23) +#define G_028040_DECOMPRESS_ON_N_ZPLANES(x) (((x) >> 23) & 0x0F) +#define C_028040_DECOMPRESS_ON_N_ZPLANES 0xF87FFFFF +/* */ #define S_028040_ALLOW_EXPCLEAR(x) (((x) & 0x1) << 27) #define G_028040_ALLOW_EXPCLEAR(x) (((x) >> 27) & 0x1) #define C_028040_ALLOW_EXPCLEAR 0xF7FFFFFF @@ -5498,6 +7188,11 @@ #define S_028040_TILE_SURFACE_ENABLE(x) (((x) & 0x1) << 29) #define G_028040_TILE_SURFACE_ENABLE(x) (((x) >> 29) & 0x1) #define C_028040_TILE_SURFACE_ENABLE 0xDFFFFFFF +/* VI */ +#define S_028040_CLEAR_DISALLOWED(x) (((x) & 0x1) << 30) +#define G_028040_CLEAR_DISALLOWED(x) (((x) >> 30) & 0x1) +#define C_028040_CLEAR_DISALLOWED 0xBFFFFFFF +/* */ #define S_028040_ZRANGE_PRECISION(x) (((x) & 0x1) << 31) #define G_028040_ZRANGE_PRECISION(x) (((x) >> 31) & 0x1) #define C_028040_ZRANGE_PRECISION 0x7FFFFFFF @@ -5507,9 +7202,6 @@ #define C_028044_FORMAT 0xFFFFFFFE #define V_028044_STENCIL_INVALID 0x00 #define V_028044_STENCIL_8 0x01 -#define S_028044_TILE_MODE_INDEX(x) (((x) & 0x07) << 20) /* not on CIK */ -#define G_028044_TILE_MODE_INDEX(x) (((x) >> 20) & 0x07) /* not on CIK */ -#define C_028044_TILE_MODE_INDEX 0xFF8FFFFF /* not on CIK */ /* CIK */ #define S_028044_TILE_SPLIT(x) (((x) & 0x07) << 13) #define G_028044_TILE_SPLIT(x) (((x) >> 13) & 0x07) @@ -5522,12 +7214,20 @@ #define V_028044_ADDR_SURF_TILE_SPLIT_2KB 0x05 #define V_028044_ADDR_SURF_TILE_SPLIT_4KB 0x06 /* */ +#define S_028044_TILE_MODE_INDEX(x) (((x) & 0x07) << 20) /* not on CIK */ +#define G_028044_TILE_MODE_INDEX(x) (((x) >> 20) & 0x07) /* not on CIK */ +#define C_028044_TILE_MODE_INDEX 0xFF8FFFFF /* not on CIK */ #define S_028044_ALLOW_EXPCLEAR(x) (((x) & 0x1) << 27) #define G_028044_ALLOW_EXPCLEAR(x) (((x) >> 27) & 0x1) #define C_028044_ALLOW_EXPCLEAR 0xF7FFFFFF #define S_028044_TILE_STENCIL_DISABLE(x) (((x) & 0x1) << 29) #define G_028044_TILE_STENCIL_DISABLE(x) (((x) >> 29) & 0x1) #define C_028044_TILE_STENCIL_DISABLE 0xDFFFFFFF +/* VI */ +#define S_028044_CLEAR_DISALLOWED(x) (((x) & 0x1) << 30) +#define G_028044_CLEAR_DISALLOWED(x) (((x) >> 30) & 0x1) +#define C_028044_CLEAR_DISALLOWED 0xBFFFFFFF +/* */ #define R_028048_DB_Z_READ_BASE 0x028048 #define R_02804C_DB_STENCIL_READ_BASE 0x02804C #define 
R_028050_DB_Z_WRITE_BASE 0x028050 @@ -5549,7 +7249,13 @@ #define S_028084_ADDRESS(x) (((x) & 0xFF) << 0) #define G_028084_ADDRESS(x) (((x) >> 0) & 0xFF) #define C_028084_ADDRESS 0xFFFFFF00 -/* */ +#define R_0281E8_COHER_DEST_BASE_HI_0 0x0281E8 +#define R_0281EC_COHER_DEST_BASE_HI_1 0x0281EC +#define R_0281F0_COHER_DEST_BASE_HI_2 0x0281F0 +#define R_0281F4_COHER_DEST_BASE_HI_3 0x0281F4 +/* */ +#define R_0281F8_COHER_DEST_BASE_2 0x0281F8 +#define R_0281FC_COHER_DEST_BASE_3 0x0281FC #define R_028200_PA_SC_WINDOW_OFFSET 0x028200 #define S_028200_WINDOW_X_OFFSET(x) (((x) & 0xFFFF) << 0) #define G_028200_WINDOW_X_OFFSET(x) (((x) >> 0) & 0xFFFF) @@ -5694,6 +7400,8 @@ #define S_028244_BR_Y(x) (((x) & 0x7FFF) << 16) #define G_028244_BR_Y(x) (((x) >> 16) & 0x7FFF) #define C_028244_BR_Y 0x8000FFFF +#define R_028248_COHER_DEST_BASE_0 0x028248 +#define R_02824C_COHER_DEST_BASE_1 0x02824C #define R_028250_PA_SC_VPORT_SCISSOR_0_TL 0x028250 #define S_028250_TL_X(x) (((x) & 0x7FFF) << 0) #define G_028250_TL_X(x) (((x) >> 0) & 0x7FFF) @@ -5711,8 +7419,68 @@ #define S_028254_BR_Y(x) (((x) & 0x7FFF) << 16) #define G_028254_BR_Y(x) (((x) >> 16) & 0x7FFF) #define C_028254_BR_Y 0x8000FFFF +#define R_028258_PA_SC_VPORT_SCISSOR_1_TL 0x028258 +#define R_02825C_PA_SC_VPORT_SCISSOR_1_BR 0x02825C +#define R_028260_PA_SC_VPORT_SCISSOR_2_TL 0x028260 +#define R_028264_PA_SC_VPORT_SCISSOR_2_BR 0x028264 +#define R_028268_PA_SC_VPORT_SCISSOR_3_TL 0x028268 +#define R_02826C_PA_SC_VPORT_SCISSOR_3_BR 0x02826C +#define R_028270_PA_SC_VPORT_SCISSOR_4_TL 0x028270 +#define R_028274_PA_SC_VPORT_SCISSOR_4_BR 0x028274 +#define R_028278_PA_SC_VPORT_SCISSOR_5_TL 0x028278 +#define R_02827C_PA_SC_VPORT_SCISSOR_5_BR 0x02827C +#define R_028280_PA_SC_VPORT_SCISSOR_6_TL 0x028280 +#define R_028284_PA_SC_VPORT_SCISSOR_6_BR 0x028284 +#define R_028288_PA_SC_VPORT_SCISSOR_7_TL 0x028288 +#define R_02828C_PA_SC_VPORT_SCISSOR_7_BR 0x02828C +#define R_028290_PA_SC_VPORT_SCISSOR_8_TL 0x028290 +#define R_028294_PA_SC_VPORT_SCISSOR_8_BR 0x028294 +#define R_028298_PA_SC_VPORT_SCISSOR_9_TL 0x028298 +#define R_02829C_PA_SC_VPORT_SCISSOR_9_BR 0x02829C +#define R_0282A0_PA_SC_VPORT_SCISSOR_10_TL 0x0282A0 +#define R_0282A4_PA_SC_VPORT_SCISSOR_10_BR 0x0282A4 +#define R_0282A8_PA_SC_VPORT_SCISSOR_11_TL 0x0282A8 +#define R_0282AC_PA_SC_VPORT_SCISSOR_11_BR 0x0282AC +#define R_0282B0_PA_SC_VPORT_SCISSOR_12_TL 0x0282B0 +#define R_0282B4_PA_SC_VPORT_SCISSOR_12_BR 0x0282B4 +#define R_0282B8_PA_SC_VPORT_SCISSOR_13_TL 0x0282B8 +#define R_0282BC_PA_SC_VPORT_SCISSOR_13_BR 0x0282BC +#define R_0282C0_PA_SC_VPORT_SCISSOR_14_TL 0x0282C0 +#define R_0282C4_PA_SC_VPORT_SCISSOR_14_BR 0x0282C4 +#define R_0282C8_PA_SC_VPORT_SCISSOR_15_TL 0x0282C8 +#define R_0282CC_PA_SC_VPORT_SCISSOR_15_BR 0x0282CC #define R_0282D0_PA_SC_VPORT_ZMIN_0 0x0282D0 #define R_0282D4_PA_SC_VPORT_ZMAX_0 0x0282D4 +#define R_0282D8_PA_SC_VPORT_ZMIN_1 0x0282D8 +#define R_0282DC_PA_SC_VPORT_ZMAX_1 0x0282DC +#define R_0282E0_PA_SC_VPORT_ZMIN_2 0x0282E0 +#define R_0282E4_PA_SC_VPORT_ZMAX_2 0x0282E4 +#define R_0282E8_PA_SC_VPORT_ZMIN_3 0x0282E8 +#define R_0282EC_PA_SC_VPORT_ZMAX_3 0x0282EC +#define R_0282F0_PA_SC_VPORT_ZMIN_4 0x0282F0 +#define R_0282F4_PA_SC_VPORT_ZMAX_4 0x0282F4 +#define R_0282F8_PA_SC_VPORT_ZMIN_5 0x0282F8 +#define R_0282FC_PA_SC_VPORT_ZMAX_5 0x0282FC +#define R_028300_PA_SC_VPORT_ZMIN_6 0x028300 +#define R_028304_PA_SC_VPORT_ZMAX_6 0x028304 +#define R_028308_PA_SC_VPORT_ZMIN_7 0x028308 +#define R_02830C_PA_SC_VPORT_ZMAX_7 0x02830C +#define R_028310_PA_SC_VPORT_ZMIN_8 0x028310 +#define 
R_028314_PA_SC_VPORT_ZMAX_8 0x028314 +#define R_028318_PA_SC_VPORT_ZMIN_9 0x028318 +#define R_02831C_PA_SC_VPORT_ZMAX_9 0x02831C +#define R_028320_PA_SC_VPORT_ZMIN_10 0x028320 +#define R_028324_PA_SC_VPORT_ZMAX_10 0x028324 +#define R_028328_PA_SC_VPORT_ZMIN_11 0x028328 +#define R_02832C_PA_SC_VPORT_ZMAX_11 0x02832C +#define R_028330_PA_SC_VPORT_ZMIN_12 0x028330 +#define R_028334_PA_SC_VPORT_ZMAX_12 0x028334 +#define R_028338_PA_SC_VPORT_ZMIN_13 0x028338 +#define R_02833C_PA_SC_VPORT_ZMAX_13 0x02833C +#define R_028340_PA_SC_VPORT_ZMIN_14 0x028340 +#define R_028344_PA_SC_VPORT_ZMAX_14 0x028344 +#define R_028348_PA_SC_VPORT_ZMIN_15 0x028348 +#define R_02834C_PA_SC_VPORT_ZMAX_15 0x02834C #define R_028350_PA_SC_RASTER_CONFIG 0x028350 #define S_028350_RB_MAP_PKR0(x) (((x) & 0x03) << 0) #define G_028350_RB_MAP_PKR0(x) (((x) >> 0) & 0x03) @@ -5834,6 +7602,13 @@ #define V_028354_RASTER_CONFIG_SE_PAIR_YSEL_16_WIDE_TILE 0x01 #define V_028354_RASTER_CONFIG_SE_PAIR_YSEL_32_WIDE_TILE 0x02 #define V_028354_RASTER_CONFIG_SE_PAIR_YSEL_64_WIDE_TILE 0x03 +#define R_028358_PA_SC_SCREEN_EXTENT_CONTROL 0x028358 +#define S_028358_SLICE_EVEN_ENABLE(x) (((x) & 0x03) << 0) +#define G_028358_SLICE_EVEN_ENABLE(x) (((x) >> 0) & 0x03) +#define C_028358_SLICE_EVEN_ENABLE 0xFFFFFFFC +#define S_028358_SLICE_ODD_ENABLE(x) (((x) & 0x03) << 2) +#define G_028358_SLICE_ODD_ENABLE(x) (((x) >> 2) & 0x03) +#define C_028358_SLICE_ODD_ENABLE 0xFFFFFFF3 /* */ #define R_028400_VGT_MAX_VTX_INDX 0x028400 #define R_028404_VGT_MIN_VTX_INDX 0x028404 @@ -5843,6 +7618,18 @@ #define R_028418_CB_BLEND_GREEN 0x028418 #define R_02841C_CB_BLEND_BLUE 0x02841C #define R_028420_CB_BLEND_ALPHA 0x028420 +/* VI */ +#define R_028424_CB_DCC_CONTROL 0x028424 +#define S_028424_OVERWRITE_COMBINER_DISABLE(x) (((x) & 0x1) << 0) +#define G_028424_OVERWRITE_COMBINER_DISABLE(x) (((x) >> 0) & 0x1) +#define C_028424_OVERWRITE_COMBINER_DISABLE 0xFFFFFFFE +#define S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(x) (((x) & 0x1) << 1) +#define G_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(x) (((x) >> 1) & 0x1) +#define C_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE 0xFFFFFFFD +#define S_028424_OVERWRITE_COMBINER_WATERMARK(x) (((x) & 0x1F) << 2) +#define G_028424_OVERWRITE_COMBINER_WATERMARK(x) (((x) >> 2) & 0x1F) +#define C_028424_OVERWRITE_COMBINER_WATERMARK 0xFFFFFF83 +/* */ #define R_02842C_DB_STENCIL_CONTROL 0x02842C #define S_02842C_STENCILFAIL(x) (((x) & 0x0F) << 0) #define G_02842C_STENCILFAIL(x) (((x) >> 0) & 0x0F) @@ -5984,12 +7771,102 @@ #define S_028434_STENCILOPVAL_BF(x) (((x) & 0xFF) << 24) #define G_028434_STENCILOPVAL_BF(x) (((x) >> 24) & 0xFF) #define C_028434_STENCILOPVAL_BF 0x00FFFFFF -#define R_02843C_PA_CL_VPORT_XSCALE_0 0x02843C -#define R_028440_PA_CL_VPORT_XOFFSET_0 0x028440 -#define R_028444_PA_CL_VPORT_YSCALE_0 0x028444 -#define R_028448_PA_CL_VPORT_YOFFSET_0 0x028448 -#define R_02844C_PA_CL_VPORT_ZSCALE_0 0x02844C -#define R_028450_PA_CL_VPORT_ZOFFSET_0 0x028450 +#define R_02843C_PA_CL_VPORT_XSCALE 0x02843C +#define R_028440_PA_CL_VPORT_XOFFSET 0x028440 +#define R_028444_PA_CL_VPORT_YSCALE 0x028444 +#define R_028448_PA_CL_VPORT_YOFFSET 0x028448 +#define R_02844C_PA_CL_VPORT_ZSCALE 0x02844C +#define R_028450_PA_CL_VPORT_ZOFFSET 0x028450 +#define R_028454_PA_CL_VPORT_XSCALE_1 0x028454 +#define R_028458_PA_CL_VPORT_XOFFSET_1 0x028458 +#define R_02845C_PA_CL_VPORT_YSCALE_1 0x02845C +#define R_028460_PA_CL_VPORT_YOFFSET_1 0x028460 +#define R_028464_PA_CL_VPORT_ZSCALE_1 0x028464 +#define R_028468_PA_CL_VPORT_ZOFFSET_1 0x028468 +#define 
R_02846C_PA_CL_VPORT_XSCALE_2 0x02846C +#define R_028470_PA_CL_VPORT_XOFFSET_2 0x028470 +#define R_028474_PA_CL_VPORT_YSCALE_2 0x028474 +#define R_028478_PA_CL_VPORT_YOFFSET_2 0x028478 +#define R_02847C_PA_CL_VPORT_ZSCALE_2 0x02847C +#define R_028480_PA_CL_VPORT_ZOFFSET_2 0x028480 +#define R_028484_PA_CL_VPORT_XSCALE_3 0x028484 +#define R_028488_PA_CL_VPORT_XOFFSET_3 0x028488 +#define R_02848C_PA_CL_VPORT_YSCALE_3 0x02848C +#define R_028490_PA_CL_VPORT_YOFFSET_3 0x028490 +#define R_028494_PA_CL_VPORT_ZSCALE_3 0x028494 +#define R_028498_PA_CL_VPORT_ZOFFSET_3 0x028498 +#define R_02849C_PA_CL_VPORT_XSCALE_4 0x02849C +#define R_0284A0_PA_CL_VPORT_XOFFSET_4 0x0284A0 +#define R_0284A4_PA_CL_VPORT_YSCALE_4 0x0284A4 +#define R_0284A8_PA_CL_VPORT_YOFFSET_4 0x0284A8 +#define R_0284AC_PA_CL_VPORT_ZSCALE_4 0x0284AC +#define R_0284B0_PA_CL_VPORT_ZOFFSET_4 0x0284B0 +#define R_0284B4_PA_CL_VPORT_XSCALE_5 0x0284B4 +#define R_0284B8_PA_CL_VPORT_XOFFSET_5 0x0284B8 +#define R_0284BC_PA_CL_VPORT_YSCALE_5 0x0284BC +#define R_0284C0_PA_CL_VPORT_YOFFSET_5 0x0284C0 +#define R_0284C4_PA_CL_VPORT_ZSCALE_5 0x0284C4 +#define R_0284C8_PA_CL_VPORT_ZOFFSET_5 0x0284C8 +#define R_0284CC_PA_CL_VPORT_XSCALE_6 0x0284CC +#define R_0284D0_PA_CL_VPORT_XOFFSET_6 0x0284D0 +#define R_0284D4_PA_CL_VPORT_YSCALE_6 0x0284D4 +#define R_0284D8_PA_CL_VPORT_YOFFSET_6 0x0284D8 +#define R_0284DC_PA_CL_VPORT_ZSCALE_6 0x0284DC +#define R_0284E0_PA_CL_VPORT_ZOFFSET_6 0x0284E0 +#define R_0284E4_PA_CL_VPORT_XSCALE_7 0x0284E4 +#define R_0284E8_PA_CL_VPORT_XOFFSET_7 0x0284E8 +#define R_0284EC_PA_CL_VPORT_YSCALE_7 0x0284EC +#define R_0284F0_PA_CL_VPORT_YOFFSET_7 0x0284F0 +#define R_0284F4_PA_CL_VPORT_ZSCALE_7 0x0284F4 +#define R_0284F8_PA_CL_VPORT_ZOFFSET_7 0x0284F8 +#define R_0284FC_PA_CL_VPORT_XSCALE_8 0x0284FC +#define R_028500_PA_CL_VPORT_XOFFSET_8 0x028500 +#define R_028504_PA_CL_VPORT_YSCALE_8 0x028504 +#define R_028508_PA_CL_VPORT_YOFFSET_8 0x028508 +#define R_02850C_PA_CL_VPORT_ZSCALE_8 0x02850C +#define R_028510_PA_CL_VPORT_ZOFFSET_8 0x028510 +#define R_028514_PA_CL_VPORT_XSCALE_9 0x028514 +#define R_028518_PA_CL_VPORT_XOFFSET_9 0x028518 +#define R_02851C_PA_CL_VPORT_YSCALE_9 0x02851C +#define R_028520_PA_CL_VPORT_YOFFSET_9 0x028520 +#define R_028524_PA_CL_VPORT_ZSCALE_9 0x028524 +#define R_028528_PA_CL_VPORT_ZOFFSET_9 0x028528 +#define R_02852C_PA_CL_VPORT_XSCALE_10 0x02852C +#define R_028530_PA_CL_VPORT_XOFFSET_10 0x028530 +#define R_028534_PA_CL_VPORT_YSCALE_10 0x028534 +#define R_028538_PA_CL_VPORT_YOFFSET_10 0x028538 +#define R_02853C_PA_CL_VPORT_ZSCALE_10 0x02853C +#define R_028540_PA_CL_VPORT_ZOFFSET_10 0x028540 +#define R_028544_PA_CL_VPORT_XSCALE_11 0x028544 +#define R_028548_PA_CL_VPORT_XOFFSET_11 0x028548 +#define R_02854C_PA_CL_VPORT_YSCALE_11 0x02854C +#define R_028550_PA_CL_VPORT_YOFFSET_11 0x028550 +#define R_028554_PA_CL_VPORT_ZSCALE_11 0x028554 +#define R_028558_PA_CL_VPORT_ZOFFSET_11 0x028558 +#define R_02855C_PA_CL_VPORT_XSCALE_12 0x02855C +#define R_028560_PA_CL_VPORT_XOFFSET_12 0x028560 +#define R_028564_PA_CL_VPORT_YSCALE_12 0x028564 +#define R_028568_PA_CL_VPORT_YOFFSET_12 0x028568 +#define R_02856C_PA_CL_VPORT_ZSCALE_12 0x02856C +#define R_028570_PA_CL_VPORT_ZOFFSET_12 0x028570 +#define R_028574_PA_CL_VPORT_XSCALE_13 0x028574 +#define R_028578_PA_CL_VPORT_XOFFSET_13 0x028578 +#define R_02857C_PA_CL_VPORT_YSCALE_13 0x02857C +#define R_028580_PA_CL_VPORT_YOFFSET_13 0x028580 +#define R_028584_PA_CL_VPORT_ZSCALE_13 0x028584 +#define R_028588_PA_CL_VPORT_ZOFFSET_13 0x028588 +#define R_02858C_PA_CL_VPORT_XSCALE_14 
0x02858C +#define R_028590_PA_CL_VPORT_XOFFSET_14 0x028590 +#define R_028594_PA_CL_VPORT_YSCALE_14 0x028594 +#define R_028598_PA_CL_VPORT_YOFFSET_14 0x028598 +#define R_02859C_PA_CL_VPORT_ZSCALE_14 0x02859C +#define R_0285A0_PA_CL_VPORT_ZOFFSET_14 0x0285A0 +#define R_0285A4_PA_CL_VPORT_XSCALE_15 0x0285A4 +#define R_0285A8_PA_CL_VPORT_XOFFSET_15 0x0285A8 +#define R_0285AC_PA_CL_VPORT_YSCALE_15 0x0285AC +#define R_0285B0_PA_CL_VPORT_YOFFSET_15 0x0285B0 +#define R_0285B4_PA_CL_VPORT_ZSCALE_15 0x0285B4 +#define R_0285B8_PA_CL_VPORT_ZOFFSET_15 0x0285B8 #define R_0285BC_PA_CL_UCP_0_X 0x0285BC #define R_0285C0_PA_CL_UCP_0_Y 0x0285C0 #define R_0285C4_PA_CL_UCP_0_Z 0x0285C4 @@ -6036,6 +7913,26 @@ #define G_028644_DUP(x) (((x) >> 18) & 0x1) #define C_028644_DUP 0xFFFBFFFF /* */ +/* VI */ +#define S_028644_FP16_INTERP_MODE(x) (((x) & 0x1) << 19) +#define G_028644_FP16_INTERP_MODE(x) (((x) >> 19) & 0x1) +#define C_028644_FP16_INTERP_MODE 0xFFF7FFFF +#define S_028644_USE_DEFAULT_ATTR1(x) (((x) & 0x1) << 20) +#define G_028644_USE_DEFAULT_ATTR1(x) (((x) >> 20) & 0x1) +#define C_028644_USE_DEFAULT_ATTR1 0xFFEFFFFF +#define S_028644_DEFAULT_VAL_ATTR1(x) (((x) & 0x03) << 21) +#define G_028644_DEFAULT_VAL_ATTR1(x) (((x) >> 21) & 0x03) +#define C_028644_DEFAULT_VAL_ATTR1 0xFF9FFFFF +#define S_028644_PT_SPRITE_TEX_ATTR1(x) (((x) & 0x1) << 23) +#define G_028644_PT_SPRITE_TEX_ATTR1(x) (((x) >> 23) & 0x1) +#define C_028644_PT_SPRITE_TEX_ATTR1 0xFF7FFFFF +#define S_028644_ATTR0_VALID(x) (((x) & 0x1) << 24) +#define G_028644_ATTR0_VALID(x) (((x) >> 24) & 0x1) +#define C_028644_ATTR0_VALID 0xFEFFFFFF +#define S_028644_ATTR1_VALID(x) (((x) & 0x1) << 25) +#define G_028644_ATTR1_VALID(x) (((x) >> 25) & 0x1) +#define C_028644_ATTR1_VALID 0xFDFFFFFF +/* */ #define R_028648_SPI_PS_INPUT_CNTL_1 0x028648 #define R_02864C_SPI_PS_INPUT_CNTL_2 0x02864C #define R_028650_SPI_PS_INPUT_CNTL_3 0x028650 @@ -6559,6 +8456,10 @@ #define R_028794_CB_BLEND5_CONTROL 0x028794 #define R_028798_CB_BLEND6_CONTROL 0x028798 #define R_02879C_CB_BLEND7_CONTROL 0x02879C +#define R_0287CC_CS_COPY_STATE 0x0287CC +#define S_0287CC_SRC_STATE_ID(x) (((x) & 0x07) << 0) +#define G_0287CC_SRC_STATE_ID(x) (((x) >> 0) & 0x07) +#define C_0287CC_SRC_STATE_ID 0xFFFFFFF8 #define R_0287D4_PA_CL_POINT_X_RAD 0x0287D4 #define R_0287D8_PA_CL_POINT_Y_RAD 0x0287D8 #define R_0287DC_PA_CL_POINT_SIZE 0x0287DC @@ -6588,6 +8489,10 @@ #define G_0287F0_USE_OPAQUE(x) (((x) >> 6) & 0x1) #define C_0287F0_USE_OPAQUE 0xFFFFFFBF #define R_0287F4_VGT_IMMED_DATA 0x0287F4 /* not on CIK */ +#define R_0287F8_VGT_EVENT_ADDRESS_REG 0x0287F8 +#define S_0287F8_ADDRESS_LOW(x) (((x) & 0xFFFFFFF) << 0) +#define G_0287F8_ADDRESS_LOW(x) (((x) >> 0) & 0xFFFFFFF) +#define C_0287F8_ADDRESS_LOW 0xF0000000 #define R_028800_DB_DEPTH_CONTROL 0x028800 #define S_028800_STENCIL_ENABLE(x) (((x) & 0x1) << 0) #define G_028800_STENCIL_ENABLE(x) (((x) >> 0) & 0x1) @@ -6644,36 +8549,42 @@ #define G_028800_DISABLE_COLOR_WRITES_ON_DEPTH_PASS(x) (((x) >> 31) & 0x1) #define C_028800_DISABLE_COLOR_WRITES_ON_DEPTH_PASS 0x7FFFFFFF #define R_028804_DB_EQAA 0x028804 -#define S_028804_MAX_ANCHOR_SAMPLES(x) (((x) & 0x7) << 0) -#define G_028804_MAX_ANCHOR_SAMPLES(x) (((x) >> 0) & 0x7) -#define C_028804_MAX_ANCHOR_SAMPLES (~(((~0) & 0x7) << 0)) -#define S_028804_PS_ITER_SAMPLES(x) (((x) & 0x7) << 4) -#define G_028804_PS_ITER_SAMPLES(x) (((x) >> 4) & 0x7) -#define C_028804_PS_ITER_SAMPLES (~(((~0) & 0x7) << 4)) -#define S_028804_MASK_EXPORT_NUM_SAMPLES(x) (((x) & 0x7) << 8) -#define G_028804_MASK_EXPORT_NUM_SAMPLES(x) (((x) 
>> 8) & 0x7) -#define C_028804_MASK_EXPORT_NUM_SAMPLES (~(((~0) & 0x7) << 8)) -#define S_028804_ALPHA_TO_MASK_NUM_SAMPLES(x) (((x) & 0x7) << 12) -#define G_028804_ALPHA_TO_MASK_NUM_SAMPLES(x) (((x) >> 12) & 0x7) -#define C_028804_ALPHA_TO_MASK_NUM_SAMPLES (~(((~0) & 0x7) << 12)) -#define S_028804_HIGH_QUALITY_INTERSECTIONS(x) (((x) & 0x1) << 16) -#define G_028804_HIGH_QUALITY_INTERSECTIONS(x) (((x) >> 16) & 0x1) -#define C_028804_HIGH_QUALITY_INTERSECTIONS (~(((~0) & 0x1) << 16)) -#define S_028804_INCOHERENT_EQAA_READS(x) (((x) & 0x1) << 17) -#define G_028804_INCOHERENT_EQAA_READS(x) (((x) >> 17) & 0x1) -#define C_028804_INCOHERENT_EQAA_READS (~(((~0) & 0x1) << 17)) -#define S_028804_INTERPOLATE_COMP_Z(x) (((x) & 0x1) << 18) -#define G_028804_INTERPOLATE_COMP_Z(x) (((x) >> 18) & 0x1) -#define C_028804_INTERPOLATE_COMP_Z (~(((~0) >> 18) & 0x1)) -#define S_028804_INTERPOLATE_SRC_Z(x) (((x) & 0x1) << 19) -#define G_028804_INTERPOLATE_SRC_Z(x) (((x) >> 19) & 0x1) -#define C_028804_INTERPOLATE_SRC_Z (~(((~0) & 0x1) << 19)) -#define S_028804_STATIC_ANCHOR_ASSOCIATIONS(x) (((x) & 0x1) << 20) -#define G_028804_STATIC_ANCHOR_ASSOCIATIONS(x) (((x) >> 20) & 0x1) -#define C_028804_STATIC_ANCHOR_ASSOCIATIONS (~(((~0) & 0x1) << 20)) -#define S_028804_ALPHA_TO_MASK_EQAA_DISABLE(x) (((x) & 0x1) << 21) -#define G_028804_ALPHA_TO_MASK_EQAA_DISABLE(x) (((x) >> 21) & 0x1) -#define C_028804_ALPHA_TO_MASK_EQAA_DISABLE (~(((~0) & 0x1) << 21)) +#define S_028804_MAX_ANCHOR_SAMPLES(x) (((x) & 0x7) << 0) +#define G_028804_MAX_ANCHOR_SAMPLES(x) (((x) >> 0) & 0x07) +#define C_028804_MAX_ANCHOR_SAMPLES 0xFFFFFFF8 +#define S_028804_PS_ITER_SAMPLES(x) (((x) & 0x7) << 4) +#define G_028804_PS_ITER_SAMPLES(x) (((x) >> 4) & 0x07) +#define C_028804_PS_ITER_SAMPLES 0xFFFFFF8F +#define S_028804_MASK_EXPORT_NUM_SAMPLES(x) (((x) & 0x7) << 8) +#define G_028804_MASK_EXPORT_NUM_SAMPLES(x) (((x) >> 8) & 0x07) +#define C_028804_MASK_EXPORT_NUM_SAMPLES 0xFFFFF8FF +#define S_028804_ALPHA_TO_MASK_NUM_SAMPLES(x) (((x) & 0x7) << 12) +#define G_028804_ALPHA_TO_MASK_NUM_SAMPLES(x) (((x) >> 12) & 0x07) +#define C_028804_ALPHA_TO_MASK_NUM_SAMPLES 0xFFFF8FFF +#define S_028804_HIGH_QUALITY_INTERSECTIONS(x) (((x) & 0x1) << 16) +#define G_028804_HIGH_QUALITY_INTERSECTIONS(x) (((x) >> 16) & 0x1) +#define C_028804_HIGH_QUALITY_INTERSECTIONS 0xFFFEFFFF +#define S_028804_INCOHERENT_EQAA_READS(x) (((x) & 0x1) << 17) +#define G_028804_INCOHERENT_EQAA_READS(x) (((x) >> 17) & 0x1) +#define C_028804_INCOHERENT_EQAA_READS 0xFFFDFFFF +#define S_028804_INTERPOLATE_COMP_Z(x) (((x) & 0x1) << 18) +#define G_028804_INTERPOLATE_COMP_Z(x) (((x) >> 18) & 0x1) +#define C_028804_INTERPOLATE_COMP_Z 0xFFFBFFFF +#define S_028804_INTERPOLATE_SRC_Z(x) (((x) & 0x1) << 19) +#define G_028804_INTERPOLATE_SRC_Z(x) (((x) >> 19) & 0x1) +#define C_028804_INTERPOLATE_SRC_Z 0xFFF7FFFF +#define S_028804_STATIC_ANCHOR_ASSOCIATIONS(x) (((x) & 0x1) << 20) +#define G_028804_STATIC_ANCHOR_ASSOCIATIONS(x) (((x) >> 20) & 0x1) +#define C_028804_STATIC_ANCHOR_ASSOCIATIONS 0xFFEFFFFF +#define S_028804_ALPHA_TO_MASK_EQAA_DISABLE(x) (((x) & 0x1) << 21) +#define G_028804_ALPHA_TO_MASK_EQAA_DISABLE(x) (((x) >> 21) & 0x1) +#define C_028804_ALPHA_TO_MASK_EQAA_DISABLE 0xFFDFFFFF +#define S_028804_OVERRASTERIZATION_AMOUNT(x) (((x) & 0x07) << 24) +#define G_028804_OVERRASTERIZATION_AMOUNT(x) (((x) >> 24) & 0x07) +#define C_028804_OVERRASTERIZATION_AMOUNT 0xF8FFFFFF +#define S_028804_ENABLE_POSTZ_OVERRASTERIZATION(x) (((x) & 0x1) << 27) +#define G_028804_ENABLE_POSTZ_OVERRASTERIZATION(x) (((x) >> 27) & 
0x1) +#define C_028804_ENABLE_POSTZ_OVERRASTERIZATION 0xF7FFFFFF #define R_028808_CB_COLOR_CONTROL 0x028808 #define S_028808_DEGAMMA_ENABLE(x) (((x) & 0x1) << 3) #define G_028808_DEGAMMA_ENABLE(x) (((x) >> 3) & 0x1) @@ -6977,6 +8888,11 @@ #define S_02881C_USE_VTX_GS_CUT_FLAG(x) (((x) & 0x1) << 25) #define G_02881C_USE_VTX_GS_CUT_FLAG(x) (((x) >> 25) & 0x1) #define C_02881C_USE_VTX_GS_CUT_FLAG 0xFDFFFFFF +/* VI */ +#define S_02881C_USE_VTX_LINE_WIDTH(x) (((x) & 0x1) << 26) +#define G_02881C_USE_VTX_LINE_WIDTH(x) (((x) >> 26) & 0x1) +#define C_02881C_USE_VTX_LINE_WIDTH 0xFBFFFFFF +/* */ #define R_028820_PA_CL_NANINF_CNTL 0x028820 #define S_028820_VTE_XY_INF_DISCARD(x) (((x) & 0x1) << 0) #define G_028820_VTE_XY_INF_DISCARD(x) (((x) >> 0) & 0x1) @@ -7447,9 +9363,21 @@ #define S_028A4C_PS_ITER_SAMPLE(x) (((x) & 0x1) << 16) #define G_028A4C_PS_ITER_SAMPLE(x) (((x) >> 16) & 0x1) #define C_028A4C_PS_ITER_SAMPLE 0xFFFEFFFF -#define S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISC(x) (((x) & 0x1) << 17) -#define G_028A4C_MULTI_SHADER_ENGINE_PRIM_DISC(x) (((x) >> 17) & 0x1) -#define C_028A4C_MULTI_SHADER_ENGINE_PRIM_DISC 0xFFFDFFFF +#define S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(x) (((x) & 0x1) << 17) +#define G_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(x) (((x) >> 17) & 0x1) +#define C_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE 0xFFFDFFFF +#define S_028A4C_MULTI_GPU_SUPERTILE_ENABLE(x) (((x) & 0x1) << 18) +#define G_028A4C_MULTI_GPU_SUPERTILE_ENABLE(x) (((x) >> 18) & 0x1) +#define C_028A4C_MULTI_GPU_SUPERTILE_ENABLE 0xFFFBFFFF +#define S_028A4C_GPU_ID_OVERRIDE_ENABLE(x) (((x) & 0x1) << 19) +#define G_028A4C_GPU_ID_OVERRIDE_ENABLE(x) (((x) >> 19) & 0x1) +#define C_028A4C_GPU_ID_OVERRIDE_ENABLE 0xFFF7FFFF +#define S_028A4C_GPU_ID_OVERRIDE(x) (((x) & 0x0F) << 20) +#define G_028A4C_GPU_ID_OVERRIDE(x) (((x) >> 20) & 0x0F) +#define C_028A4C_GPU_ID_OVERRIDE 0xFF0FFFFF +#define S_028A4C_MULTI_GPU_PRIM_DISCARD_ENABLE(x) (((x) & 0x1) << 24) +#define G_028A4C_MULTI_GPU_PRIM_DISCARD_ENABLE(x) (((x) >> 24) & 0x1) +#define C_028A4C_MULTI_GPU_PRIM_DISCARD_ENABLE 0xFEFFFFFF #define S_028A4C_FORCE_EOV_CNTDWN_ENABLE(x) (((x) & 0x1) << 25) #define G_028A4C_FORCE_EOV_CNTDWN_ENABLE(x) (((x) >> 25) & 0x1) #define C_028A4C_FORCE_EOV_CNTDWN_ENABLE 0xFDFFFFFF @@ -7515,6 +9443,7 @@ #define C_028A7C_INDEX_TYPE 0xFFFFFFFC #define V_028A7C_VGT_INDEX_16 0x00 #define V_028A7C_VGT_INDEX_32 0x01 +#define V_028A7C_VGT_INDEX_8 0x02 /* VI */ #define S_028A7C_SWAP_MODE(x) (((x) & 0x03) << 2) #define G_028A7C_SWAP_MODE(x) (((x) >> 2) & 0x03) #define C_028A7C_SWAP_MODE 0xFFFFFFF3 @@ -7544,6 +9473,12 @@ #define G_028A7C_REQ_PATH(x) (((x) >> 10) & 0x1) #define C_028A7C_REQ_PATH 0xFFFFFBFF /* */ +/* VI */ +#define S_028A7C_MTYPE(x) (((x) & 0x03) << 11) +#define G_028A7C_MTYPE(x) (((x) >> 11) & 0x03) +#define C_028A7C_MTYPE 0xFFFFE7FF +/* */ +#define R_028A80_WD_ENHANCE 0x028A80 #define R_028A84_VGT_PRIMITIVEID_EN 0x028A84 #define S_028A84_PRIMITIVEID_EN(x) (((x) & 0x1) << 0) #define G_028A84_PRIMITIVEID_EN(x) (((x) >> 0) & 0x1) @@ -7642,6 +9577,10 @@ #define S_028AA8_WD_SWITCH_ON_EOP(x) (((x) & 0x1) << 20) #define G_028AA8_WD_SWITCH_ON_EOP(x) (((x) >> 20) & 0x1) #define C_028AA8_WD_SWITCH_ON_EOP 0xFFEFFFFF +/* VI */ +#define S_028AA8_MAX_PRIMGRP_IN_WAVE(x) (((x) & 0x0F) << 28) +#define G_028AA8_MAX_PRIMGRP_IN_WAVE(x) (((x) >> 28) & 0x0F) +#define C_028AA8_MAX_PRIMGRP_IN_WAVE 0x0FFFFFFF /* */ #define R_028AAC_VGT_ESGS_RING_ITEMSIZE 0x028AAC #define S_028AAC_ITEMSIZE(x) (((x) & 0x7FFF) << 0) @@ -7681,6 +9620,11 @@ #define 
S_028ABC_DST_OUTSIDE_ZERO_TO_ONE(x) (((x) & 0x1) << 16) #define G_028ABC_DST_OUTSIDE_ZERO_TO_ONE(x) (((x) >> 16) & 0x1) #define C_028ABC_DST_OUTSIDE_ZERO_TO_ONE 0xFFFEFFFF +/* VI */ +#define S_028ABC_TC_COMPATIBLE(x) (((x) & 0x1) << 17) +#define G_028ABC_TC_COMPATIBLE(x) (((x) >> 17) & 0x1) +#define C_028ABC_TC_COMPATIBLE 0xFFFDFFFF +/* */ #define R_028AC0_DB_SRESULTS_COMPARE_STATE0 0x028AC0 #define S_028AC0_COMPAREFUNC0(x) (((x) & 0x07) << 0) #define G_028AC0_COMPAREFUNC0(x) (((x) >> 0) & 0x07) @@ -7770,6 +9714,21 @@ #define S_028B38_MAX_VERT_OUT(x) (((x) & 0x7FF) << 0) #define G_028B38_MAX_VERT_OUT(x) (((x) >> 0) & 0x7FF) #define C_028B38_MAX_VERT_OUT 0xFFFFF800 +/* VI */ +#define R_028B50_VGT_TESS_DISTRIBUTION 0x028B50 +#define S_028B50_ACCUM_ISOLINE(x) (((x) & 0xFF) << 0) +#define G_028B50_ACCUM_ISOLINE(x) (((x) >> 0) & 0xFF) +#define C_028B50_ACCUM_ISOLINE 0xFFFFFF00 +#define S_028B50_ACCUM_TRI(x) (((x) & 0xFF) << 8) +#define G_028B50_ACCUM_TRI(x) (((x) >> 8) & 0xFF) +#define C_028B50_ACCUM_TRI 0xFFFF00FF +#define S_028B50_ACCUM_QUAD(x) (((x) & 0xFF) << 16) +#define G_028B50_ACCUM_QUAD(x) (((x) >> 16) & 0xFF) +#define C_028B50_ACCUM_QUAD 0xFF00FFFF +#define S_028B50_DONUT_SPLIT(x) (((x) & 0xFF) << 24) +#define G_028B50_DONUT_SPLIT(x) (((x) >> 24) & 0xFF) +#define C_028B50_DONUT_SPLIT 0x00FFFFFF +/* */ #define R_028B54_VGT_SHADER_STAGES_EN 0x028B54 #define S_028B54_LS_EN(x) (((x) & 0x03) << 0) #define G_028B54_LS_EN(x) (((x) >> 0) & 0x03) @@ -7798,6 +9757,20 @@ #define S_028B54_DYNAMIC_HS(x) (((x) & 0x1) << 8) #define G_028B54_DYNAMIC_HS(x) (((x) >> 8) & 0x1) #define C_028B54_DYNAMIC_HS 0xFFFFFEFF +/* VI */ +#define S_028B54_DISPATCH_DRAW_EN(x) (((x) & 0x1) << 9) +#define G_028B54_DISPATCH_DRAW_EN(x) (((x) >> 9) & 0x1) +#define C_028B54_DISPATCH_DRAW_EN 0xFFFFFDFF +#define S_028B54_DIS_DEALLOC_ACCUM_0(x) (((x) & 0x1) << 10) +#define G_028B54_DIS_DEALLOC_ACCUM_0(x) (((x) >> 10) & 0x1) +#define C_028B54_DIS_DEALLOC_ACCUM_0 0xFFFFFBFF +#define S_028B54_DIS_DEALLOC_ACCUM_1(x) (((x) & 0x1) << 11) +#define G_028B54_DIS_DEALLOC_ACCUM_1(x) (((x) >> 11) & 0x1) +#define C_028B54_DIS_DEALLOC_ACCUM_1 0xFFFFF7FF +#define S_028B54_VS_WAVE_ID_EN(x) (((x) & 0x1) << 12) +#define G_028B54_VS_WAVE_ID_EN(x) (((x) >> 12) & 0x1) +#define C_028B54_VS_WAVE_ID_EN 0xFFFFEFFF +/* */ #define R_028B58_VGT_LS_HS_CONFIG 0x028B58 #define S_028B58_NUM_PATCHES(x) (((x) & 0xFF) << 0) #define G_028B58_NUM_PATCHES(x) (((x) >> 0) & 0xFF) @@ -7848,6 +9821,9 @@ #define S_028B6C_RESERVED_REDUC_AXIS(x) (((x) & 0x1) << 8) /* not on CIK */ #define G_028B6C_RESERVED_REDUC_AXIS(x) (((x) >> 8) & 0x1) /* not on CIK */ #define C_028B6C_RESERVED_REDUC_AXIS 0xFFFFFEFF /* not on CIK */ +#define S_028B6C_DEPRECATED(x) (((x) & 0x1) << 9) +#define G_028B6C_DEPRECATED(x) (((x) >> 9) & 0x1) +#define C_028B6C_DEPRECATED 0xFFFFFDFF #define S_028B6C_NUM_DS_WAVES_PER_SIMD(x) (((x) & 0x0F) << 10) #define G_028B6C_NUM_DS_WAVES_PER_SIMD(x) (((x) >> 10) & 0x0F) #define C_028B6C_NUM_DS_WAVES_PER_SIMD 0xFFFFC3FF @@ -7862,6 +9838,14 @@ #define V_028B6C_VGT_POLICY_STREAM 0x01 #define V_028B6C_VGT_POLICY_BYPASS 0x02 /* */ +/* VI */ +#define S_028B6C_DISTRIBUTION_MODE(x) (((x) & 0x03) << 17) +#define G_028B6C_DISTRIBUTION_MODE(x) (((x) >> 17) & 0x03) +#define C_028B6C_DISTRIBUTION_MODE 0xFFF9FFFF +#define S_028B6C_MTYPE(x) (((x) & 0x03) << 19) +#define G_028B6C_MTYPE(x) (((x) >> 19) & 0x03) +#define C_028B6C_MTYPE 0xFFE7FFFF +/* */ #define R_028B70_DB_ALPHA_TO_MASK 0x028B70 #define S_028B70_ALPHA_TO_MASK_ENABLE(x) (((x) & 0x1) << 0) #define 
G_028B70_ALPHA_TO_MASK_ENABLE(x) (((x) >> 0) & 0x1) @@ -8001,6 +9985,22 @@ #define S_028BDC_DX10_DIAMOND_TEST_ENA(x) (((x) & 0x1) << 12) #define G_028BDC_DX10_DIAMOND_TEST_ENA(x) (((x) >> 12) & 0x1) #define C_028BDC_DX10_DIAMOND_TEST_ENA 0xFFFFEFFF +#define R_028BE0_PA_SC_AA_CONFIG 0x028BE0 +#define S_028BE0_MSAA_NUM_SAMPLES(x) (((x) & 0x7) << 0) +#define G_028BE0_MSAA_NUM_SAMPLES(x) (((x) >> 0) & 0x07) +#define C_028BE0_MSAA_NUM_SAMPLES 0xFFFFFFF8 +#define S_028BE0_AA_MASK_CENTROID_DTMN(x) (((x) & 0x1) << 4) +#define G_028BE0_AA_MASK_CENTROID_DTMN(x) (((x) >> 4) & 0x1) +#define C_028BE0_AA_MASK_CENTROID_DTMN 0xFFFFFFEF +#define S_028BE0_MAX_SAMPLE_DIST(x) (((x) & 0xf) << 13) +#define G_028BE0_MAX_SAMPLE_DIST(x) (((x) >> 13) & 0x0F) +#define C_028BE0_MAX_SAMPLE_DIST 0xFFFE1FFF +#define S_028BE0_MSAA_EXPOSED_SAMPLES(x) (((x) & 0x7) << 20) +#define G_028BE0_MSAA_EXPOSED_SAMPLES(x) (((x) >> 20) & 0x07) +#define C_028BE0_MSAA_EXPOSED_SAMPLES 0xFF8FFFFF +#define S_028BE0_DETAIL_TO_EXPOSED_MODE(x) (((x) & 0x3) << 24) +#define G_028BE0_DETAIL_TO_EXPOSED_MODE(x) (((x) >> 24) & 0x03) +#define C_028BE0_DETAIL_TO_EXPOSED_MODE 0xFCFFFFFF #define R_028BE4_PA_SU_VTX_CNTL 0x028BE4 #define S_028BE4_PIX_CENTER(x) (((x) & 0x1) << 0) #define G_028BE4_PIX_CENTER(x) (((x) >> 0) & 0x1) @@ -8569,6 +10569,17 @@ #define G_028C70_FMASK_COMPRESSION_DISABLE(x) (((x) >> 26) & 0x1) #define C_028C70_FMASK_COMPRESSION_DISABLE 0xFBFFFFFF /* */ +/* VI */ +#define S_028C70_FMASK_COMPRESS_1FRAG_ONLY(x) (((x) & 0x1) << 27) +#define G_028C70_FMASK_COMPRESS_1FRAG_ONLY(x) (((x) >> 27) & 0x1) +#define C_028C70_FMASK_COMPRESS_1FRAG_ONLY 0xF7FFFFFF +#define S_028C70_DCC_ENABLE(x) (((x) & 0x1) << 28) +#define G_028C70_DCC_ENABLE(x) (((x) >> 28) & 0x1) +#define C_028C70_DCC_ENABLE 0xEFFFFFFF +#define S_028C70_CMASK_ADDR_TYPE(x) (((x) & 0x03) << 29) +#define G_028C70_CMASK_ADDR_TYPE(x) (((x) >> 29) & 0x03) +#define C_028C70_CMASK_ADDR_TYPE 0x9FFFFFFF +/* */ #define R_028C74_CB_COLOR0_ATTRIB 0x028C74 #define S_028C74_TILE_MODE_INDEX(x) (((x) & 0x1F) << 0) #define G_028C74_TILE_MODE_INDEX(x) (((x) >> 0) & 0x1F) @@ -8576,7 +10587,9 @@ #define S_028C74_FMASK_TILE_MODE_INDEX(x) (((x) & 0x1F) << 5) #define G_028C74_FMASK_TILE_MODE_INDEX(x) (((x) >> 5) & 0x1F) #define C_028C74_FMASK_TILE_MODE_INDEX 0xFFFFFC1F -#define S_028C74_FMASK_BANK_HEIGHT(x) (((x) & 0x3) << 10) /* SI errata */ +#define S_028C74_FMASK_BANK_HEIGHT(x) (((x) & 0x03) << 10) +#define G_028C74_FMASK_BANK_HEIGHT(x) (((x) >> 10) & 0x03) +#define C_028C74_FMASK_BANK_HEIGHT 0xFFFFF3FF #define S_028C74_NUM_SAMPLES(x) (((x) & 0x07) << 12) #define G_028C74_NUM_SAMPLES(x) (((x) >> 12) & 0x07) #define C_028C74_NUM_SAMPLES 0xFFFF8FFF @@ -8586,6 +10599,36 @@ #define S_028C74_FORCE_DST_ALPHA_1(x) (((x) & 0x1) << 17) #define G_028C74_FORCE_DST_ALPHA_1(x) (((x) >> 17) & 0x1) #define C_028C74_FORCE_DST_ALPHA_1 0xFFFDFFFF +/* VI */ +#define R_028C78_CB_COLOR0_DCC_CONTROL 0x028C78 +#define S_028C78_OVERWRITE_COMBINER_DISABLE(x) (((x) & 0x1) << 0) +#define G_028C78_OVERWRITE_COMBINER_DISABLE(x) (((x) >> 0) & 0x1) +#define C_028C78_OVERWRITE_COMBINER_DISABLE 0xFFFFFFFE +#define S_028C78_KEY_CLEAR_ENABLE(x) (((x) & 0x1) << 1) +#define G_028C78_KEY_CLEAR_ENABLE(x) (((x) >> 1) & 0x1) +#define C_028C78_KEY_CLEAR_ENABLE 0xFFFFFFFD +#define S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(x) (((x) & 0x03) << 2) +#define G_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(x) (((x) >> 2) & 0x03) +#define C_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE 0xFFFFFFF3 +#define S_028C78_MIN_COMPRESSED_BLOCK_SIZE(x) (((x) & 0x1) << 4) +#define 
G_028C78_MIN_COMPRESSED_BLOCK_SIZE(x) (((x) >> 4) & 0x1) +#define C_028C78_MIN_COMPRESSED_BLOCK_SIZE 0xFFFFFFEF +#define S_028C78_MAX_COMPRESSED_BLOCK_SIZE(x) (((x) & 0x03) << 5) +#define G_028C78_MAX_COMPRESSED_BLOCK_SIZE(x) (((x) >> 5) & 0x03) +#define C_028C78_MAX_COMPRESSED_BLOCK_SIZE 0xFFFFFF9F +#define S_028C78_COLOR_TRANSFORM(x) (((x) & 0x03) << 7) +#define G_028C78_COLOR_TRANSFORM(x) (((x) >> 7) & 0x03) +#define C_028C78_COLOR_TRANSFORM 0xFFFFFE7F +#define S_028C78_INDEPENDENT_64B_BLOCKS(x) (((x) & 0x1) << 9) +#define G_028C78_INDEPENDENT_64B_BLOCKS(x) (((x) >> 9) & 0x1) +#define C_028C78_INDEPENDENT_64B_BLOCKS 0xFFFFFDFF +#define S_028C78_LOSSY_RGB_PRECISION(x) (((x) & 0x0F) << 10) +#define G_028C78_LOSSY_RGB_PRECISION(x) (((x) >> 10) & 0x0F) +#define C_028C78_LOSSY_RGB_PRECISION 0xFFFFC3FF +#define S_028C78_LOSSY_ALPHA_PRECISION(x) (((x) & 0x0F) << 14) +#define G_028C78_LOSSY_ALPHA_PRECISION(x) (((x) >> 14) & 0x0F) +#define C_028C78_LOSSY_ALPHA_PRECISION 0xFFFC3FFF +/* */ #define R_028C7C_CB_COLOR0_CMASK 0x028C7C #define R_028C80_CB_COLOR0_CMASK_SLICE 0x028C80 #define S_028C80_TILE_MAX(x) (((x) & 0x3FFF) << 0) @@ -8598,90 +10641,105 @@ #define C_028C88_TILE_MAX 0xFFC00000 #define R_028C8C_CB_COLOR0_CLEAR_WORD0 0x028C8C #define R_028C90_CB_COLOR0_CLEAR_WORD1 0x028C90 +#define R_028C94_CB_COLOR0_DCC_BASE 0x028C94 /* VI */ #define R_028C9C_CB_COLOR1_BASE 0x028C9C #define R_028CA0_CB_COLOR1_PITCH 0x028CA0 #define R_028CA4_CB_COLOR1_SLICE 0x028CA4 #define R_028CA8_CB_COLOR1_VIEW 0x028CA8 #define R_028CAC_CB_COLOR1_INFO 0x028CAC #define R_028CB0_CB_COLOR1_ATTRIB 0x028CB0 -#define R_028CD4_CB_COLOR1_CMASK 0x028CB8 +#define R_028CB4_CB_COLOR1_DCC_CONTROL 0x028CB4 /* VI */ +#define R_028CB8_CB_COLOR1_CMASK 0x028CB8 #define R_028CBC_CB_COLOR1_CMASK_SLICE 0x028CBC #define R_028CC0_CB_COLOR1_FMASK 0x028CC0 #define R_028CC4_CB_COLOR1_FMASK_SLICE 0x028CC4 #define R_028CC8_CB_COLOR1_CLEAR_WORD0 0x028CC8 #define R_028CCC_CB_COLOR1_CLEAR_WORD1 0x028CCC +#define R_028CD0_CB_COLOR1_DCC_BASE 0x028CD0 /* VI */ #define R_028CD8_CB_COLOR2_BASE 0x028CD8 #define R_028CDC_CB_COLOR2_PITCH 0x028CDC #define R_028CE0_CB_COLOR2_SLICE 0x028CE0 #define R_028CE4_CB_COLOR2_VIEW 0x028CE4 #define R_028CE8_CB_COLOR2_INFO 0x028CE8 #define R_028CEC_CB_COLOR2_ATTRIB 0x028CEC +#define R_028CF0_CB_COLOR2_DCC_CONTROL 0x028CF0 /* VI */ #define R_028CF4_CB_COLOR2_CMASK 0x028CF4 #define R_028CF8_CB_COLOR2_CMASK_SLICE 0x028CF8 #define R_028CFC_CB_COLOR2_FMASK 0x028CFC #define R_028D00_CB_COLOR2_FMASK_SLICE 0x028D00 #define R_028D04_CB_COLOR2_CLEAR_WORD0 0x028D04 #define R_028D08_CB_COLOR2_CLEAR_WORD1 0x028D08 +#define R_028D0C_CB_COLOR2_DCC_BASE 0x028D0C /* VI */ #define R_028D14_CB_COLOR3_BASE 0x028D14 #define R_028D18_CB_COLOR3_PITCH 0x028D18 #define R_028D1C_CB_COLOR3_SLICE 0x028D1C #define R_028D20_CB_COLOR3_VIEW 0x028D20 #define R_028D24_CB_COLOR3_INFO 0x028D24 #define R_028D28_CB_COLOR3_ATTRIB 0x028D28 +#define R_028D2C_CB_COLOR3_DCC_CONTROL 0x028D2C /* VI */ #define R_028D30_CB_COLOR3_CMASK 0x028D30 #define R_028D34_CB_COLOR3_CMASK_SLICE 0x028D34 #define R_028D38_CB_COLOR3_FMASK 0x028D38 #define R_028D3C_CB_COLOR3_FMASK_SLICE 0x028D3C #define R_028D40_CB_COLOR3_CLEAR_WORD0 0x028D40 #define R_028D44_CB_COLOR3_CLEAR_WORD1 0x028D44 +#define R_028D48_CB_COLOR3_DCC_BASE 0x028D48 /* VI */ #define R_028D50_CB_COLOR4_BASE 0x028D50 #define R_028D54_CB_COLOR4_PITCH 0x028D54 #define R_028D58_CB_COLOR4_SLICE 0x028D58 #define R_028D5C_CB_COLOR4_VIEW 0x028D5C #define R_028D60_CB_COLOR4_INFO 0x028D60 #define R_028D64_CB_COLOR4_ATTRIB 
0x028D64 +#define R_028D68_CB_COLOR4_DCC_CONTROL 0x028D68 /* VI */ #define R_028D6C_CB_COLOR4_CMASK 0x028D6C #define R_028D70_CB_COLOR4_CMASK_SLICE 0x028D70 #define R_028D74_CB_COLOR4_FMASK 0x028D74 #define R_028D78_CB_COLOR4_FMASK_SLICE 0x028D78 #define R_028D7C_CB_COLOR4_CLEAR_WORD0 0x028D7C #define R_028D80_CB_COLOR4_CLEAR_WORD1 0x028D80 +#define R_028D84_CB_COLOR4_DCC_BASE 0x028D84 /* VI */ #define R_028D8C_CB_COLOR5_BASE 0x028D8C #define R_028D90_CB_COLOR5_PITCH 0x028D90 #define R_028D94_CB_COLOR5_SLICE 0x028D94 #define R_028D98_CB_COLOR5_VIEW 0x028D98 #define R_028D9C_CB_COLOR5_INFO 0x028D9C #define R_028DA0_CB_COLOR5_ATTRIB 0x028DA0 +#define R_028DA4_CB_COLOR5_DCC_CONTROL 0x028DA4 /* VI */ #define R_028DA8_CB_COLOR5_CMASK 0x028DA8 #define R_028DAC_CB_COLOR5_CMASK_SLICE 0x028DAC #define R_028DB0_CB_COLOR5_FMASK 0x028DB0 #define R_028DB4_CB_COLOR5_FMASK_SLICE 0x028DB4 #define R_028DB8_CB_COLOR5_CLEAR_WORD0 0x028DB8 #define R_028DBC_CB_COLOR5_CLEAR_WORD1 0x028DBC +#define R_028DC0_CB_COLOR5_DCC_BASE 0x028DC0 /* VI */ #define R_028DC8_CB_COLOR6_BASE 0x028DC8 #define R_028DCC_CB_COLOR6_PITCH 0x028DCC #define R_028DD0_CB_COLOR6_SLICE 0x028DD0 #define R_028DD4_CB_COLOR6_VIEW 0x028DD4 #define R_028DD8_CB_COLOR6_INFO 0x028DD8 #define R_028DDC_CB_COLOR6_ATTRIB 0x028DDC +#define R_028DE0_CB_COLOR6_DCC_CONTROL 0x028DE0 /* VI */ #define R_028DE4_CB_COLOR6_CMASK 0x028DE4 #define R_028DE8_CB_COLOR6_CMASK_SLICE 0x028DE8 #define R_028DEC_CB_COLOR6_FMASK 0x028DEC #define R_028DF0_CB_COLOR6_FMASK_SLICE 0x028DF0 #define R_028DF4_CB_COLOR6_CLEAR_WORD0 0x028DF4 #define R_028DF8_CB_COLOR6_CLEAR_WORD1 0x028DF8 +#define R_028DFC_CB_COLOR6_DCC_BASE 0x028DFC /* VI */ #define R_028E04_CB_COLOR7_BASE 0x028E04 #define R_028E08_CB_COLOR7_PITCH 0x028E08 #define R_028E0C_CB_COLOR7_SLICE 0x028E0C #define R_028E10_CB_COLOR7_VIEW 0x028E10 #define R_028E14_CB_COLOR7_INFO 0x028E14 #define R_028E18_CB_COLOR7_ATTRIB 0x028E18 +#define R_028E1C_CB_COLOR7_DCC_CONTROL 0x028E1C /* VI */ #define R_028E20_CB_COLOR7_CMASK 0x028E20 #define R_028E24_CB_COLOR7_CMASK_SLICE 0x028E24 #define R_028E28_CB_COLOR7_FMASK 0x028E28 #define R_028E2C_CB_COLOR7_FMASK_SLICE 0x028E2C #define R_028E30_CB_COLOR7_CLEAR_WORD0 0x028E30 #define R_028E34_CB_COLOR7_CLEAR_WORD1 0x028E34 +#define R_028E38_CB_COLOR7_DCC_BASE 0x028E38 /* VI */ /* SI async DMA packets */ #define SI_DMA_PACKET(cmd, sub_cmd, n) ((((cmd) & 0xF) << 28) | \ diff --git a/src/gallium/drivers/rbug/rbug_context.h b/src/gallium/drivers/rbug/rbug_context.h index 5e7b9d4dee4..e99f6edc523 100644 --- a/src/gallium/drivers/rbug/rbug_context.h +++ b/src/gallium/drivers/rbug/rbug_context.h @@ -79,7 +79,7 @@ struct rbug_context { struct rbug_list shaders; }; -static INLINE struct rbug_context * +static inline struct rbug_context * rbug_context(struct pipe_context *pipe) { return (struct rbug_context *)pipe; diff --git a/src/gallium/drivers/rbug/rbug_objects.h b/src/gallium/drivers/rbug/rbug_objects.h index 3fba3334228..02973e07996 100644 --- a/src/gallium/drivers/rbug/rbug_objects.h +++ b/src/gallium/drivers/rbug/rbug_objects.h @@ -93,7 +93,7 @@ struct rbug_transfer }; -static INLINE struct rbug_resource * +static inline struct rbug_resource * rbug_resource(struct pipe_resource *_resource) { if (!_resource) @@ -102,7 +102,7 @@ rbug_resource(struct pipe_resource *_resource) return (struct rbug_resource *)_resource; } -static INLINE struct rbug_sampler_view * +static inline struct rbug_sampler_view * rbug_sampler_view(struct pipe_sampler_view *_sampler_view) { if (!_sampler_view) @@ -111,7 +111,7 
@@ rbug_sampler_view(struct pipe_sampler_view *_sampler_view) return (struct rbug_sampler_view *)_sampler_view; } -static INLINE struct rbug_surface * +static inline struct rbug_surface * rbug_surface(struct pipe_surface *_surface) { if (!_surface) @@ -120,7 +120,7 @@ rbug_surface(struct pipe_surface *_surface) return (struct rbug_surface *)_surface; } -static INLINE struct rbug_transfer * +static inline struct rbug_transfer * rbug_transfer(struct pipe_transfer *_transfer) { if (!_transfer) @@ -129,7 +129,7 @@ rbug_transfer(struct pipe_transfer *_transfer) return (struct rbug_transfer *)_transfer; } -static INLINE struct rbug_shader * +static inline struct rbug_shader * rbug_shader(void *_state) { if (!_state) @@ -137,7 +137,7 @@ rbug_shader(void *_state) return (struct rbug_shader *)_state; } -static INLINE struct pipe_resource * +static inline struct pipe_resource * rbug_resource_unwrap(struct pipe_resource *_resource) { if (!_resource) @@ -145,7 +145,7 @@ rbug_resource_unwrap(struct pipe_resource *_resource) return rbug_resource(_resource)->resource; } -static INLINE struct pipe_sampler_view * +static inline struct pipe_sampler_view * rbug_sampler_view_unwrap(struct pipe_sampler_view *_sampler_view) { if (!_sampler_view) @@ -153,7 +153,7 @@ rbug_sampler_view_unwrap(struct pipe_sampler_view *_sampler_view) return rbug_sampler_view(_sampler_view)->sampler_view; } -static INLINE struct pipe_surface * +static inline struct pipe_surface * rbug_surface_unwrap(struct pipe_surface *_surface) { if (!_surface) @@ -161,7 +161,7 @@ rbug_surface_unwrap(struct pipe_surface *_surface) return rbug_surface(_surface)->surface; } -static INLINE struct pipe_transfer * +static inline struct pipe_transfer * rbug_transfer_unwrap(struct pipe_transfer *_transfer) { if (!_transfer) @@ -169,7 +169,7 @@ rbug_transfer_unwrap(struct pipe_transfer *_transfer) return rbug_transfer(_transfer)->transfer; } -static INLINE void * +static inline void * rbug_shader_unwrap(void *_state) { struct rbug_shader *shader; diff --git a/src/gallium/drivers/rbug/rbug_screen.c b/src/gallium/drivers/rbug/rbug_screen.c index d5a3164e217..7da4e81560a 100644 --- a/src/gallium/drivers/rbug/rbug_screen.c +++ b/src/gallium/drivers/rbug/rbug_screen.c @@ -226,17 +226,6 @@ rbug_screen_fence_reference(struct pipe_screen *_screen, } static boolean -rbug_screen_fence_signalled(struct pipe_screen *_screen, - struct pipe_fence_handle *fence) -{ - struct rbug_screen *rb_screen = rbug_screen(_screen); - struct pipe_screen *screen = rb_screen->screen; - - return screen->fence_signalled(screen, - fence); -} - -static boolean rbug_screen_fence_finish(struct pipe_screen *_screen, struct pipe_fence_handle *fence, uint64_t timeout) @@ -288,7 +277,6 @@ rbug_screen_create(struct pipe_screen *screen) rb_screen->base.resource_destroy = rbug_screen_resource_destroy; rb_screen->base.flush_frontbuffer = rbug_screen_flush_frontbuffer; rb_screen->base.fence_reference = rbug_screen_fence_reference; - rb_screen->base.fence_signalled = rbug_screen_fence_signalled; rb_screen->base.fence_finish = rbug_screen_fence_finish; rb_screen->screen = screen; diff --git a/src/gallium/drivers/rbug/rbug_screen.h b/src/gallium/drivers/rbug/rbug_screen.h index a53afac05e9..fd92374beda 100644 --- a/src/gallium/drivers/rbug/rbug_screen.h +++ b/src/gallium/drivers/rbug/rbug_screen.h @@ -60,7 +60,7 @@ struct rbug_screen struct rbug_list transfers; }; -static INLINE struct rbug_screen * +static inline struct rbug_screen * rbug_screen(struct pipe_screen *screen) { return (struct rbug_screen 
*)screen; diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h index 50a73369c1d..577df814b29 100644 --- a/src/gallium/drivers/softpipe/sp_context.h +++ b/src/gallium/drivers/softpipe/sp_context.h @@ -203,7 +203,7 @@ struct softpipe_context { }; -static INLINE struct softpipe_context * +static inline struct softpipe_context * softpipe_context( struct pipe_context *pipe ) { return (struct softpipe_context *)pipe; diff --git a/src/gallium/drivers/softpipe/sp_fence.c b/src/gallium/drivers/softpipe/sp_fence.c index c2897ed1ef8..6168236ec96 100644 --- a/src/gallium/drivers/softpipe/sp_fence.c +++ b/src/gallium/drivers/softpipe/sp_fence.c @@ -41,15 +41,6 @@ softpipe_fence_reference(struct pipe_screen *screen, static boolean -softpipe_fence_signalled(struct pipe_screen *screen, - struct pipe_fence_handle *fence) -{ - assert(fence); - return TRUE; -} - - -static boolean softpipe_fence_finish(struct pipe_screen *screen, struct pipe_fence_handle *fence, uint64_t timeout) @@ -64,5 +55,4 @@ softpipe_init_screen_fence_funcs(struct pipe_screen *screen) { screen->fence_reference = softpipe_fence_reference; screen->fence_finish = softpipe_fence_finish; - screen->fence_signalled = softpipe_fence_signalled; } diff --git a/src/gallium/drivers/softpipe/sp_fs_exec.c b/src/gallium/drivers/softpipe/sp_fs_exec.c index 369ab6ed8d4..89411777ec9 100644 --- a/src/gallium/drivers/softpipe/sp_fs_exec.c +++ b/src/gallium/drivers/softpipe/sp_fs_exec.c @@ -52,7 +52,7 @@ struct sp_exec_fragment_shader /** cast wrapper */ -static INLINE struct sp_exec_fragment_shader * +static inline struct sp_exec_fragment_shader * sp_exec_fragment_shader(const struct sp_fragment_shader_variant *var) { return (struct sp_exec_fragment_shader *) var; diff --git a/src/gallium/drivers/softpipe/sp_prim_vbuf.c b/src/gallium/drivers/softpipe/sp_prim_vbuf.c index 18eca611669..f8a3eacdb37 100644 --- a/src/gallium/drivers/softpipe/sp_prim_vbuf.c +++ b/src/gallium/drivers/softpipe/sp_prim_vbuf.c @@ -145,7 +145,7 @@ sp_vbuf_set_primitive(struct vbuf_render *vbr, unsigned prim) } -static INLINE cptrf4 get_vert( const void *vertex_buffer, +static inline cptrf4 get_vert( const void *vertex_buffer, int index, int stride ) { diff --git a/src/gallium/drivers/softpipe/sp_quad_blend.c b/src/gallium/drivers/softpipe/sp_quad_blend.c index a32bd7fd241..5b458450cd8 100644 --- a/src/gallium/drivers/softpipe/sp_quad_blend.c +++ b/src/gallium/drivers/softpipe/sp_quad_blend.c @@ -63,7 +63,7 @@ struct blend_quad_stage /** cast wrapper */ -static INLINE struct blend_quad_stage * +static inline struct blend_quad_stage * blend_quad_stage(struct quad_stage *stage) { return (struct blend_quad_stage *) stage; diff --git a/src/gallium/drivers/softpipe/sp_quad_fs.c b/src/gallium/drivers/softpipe/sp_quad_fs.c index 82c58d04527..395bc70f2cf 100644 --- a/src/gallium/drivers/softpipe/sp_quad_fs.c +++ b/src/gallium/drivers/softpipe/sp_quad_fs.c @@ -56,7 +56,7 @@ struct quad_shade_stage /** cast wrapper */ -static INLINE struct quad_shade_stage * +static inline struct quad_shade_stage * quad_shade_stage(struct quad_stage *qs) { return (struct quad_shade_stage *) qs; @@ -67,7 +67,7 @@ quad_shade_stage(struct quad_stage *qs) * Execute fragment shader for the four fragments in the quad. 
* \return TRUE if quad is alive, FALSE if all four pixels are killed */ -static INLINE boolean +static inline boolean shade_quad(struct quad_stage *qs, struct quad_header *quad) { struct softpipe_context *softpipe = qs->softpipe; diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c index a688d319bb8..0bfd9c3578c 100644 --- a/src/gallium/drivers/softpipe/sp_screen.c +++ b/src/gallium/drivers/softpipe/sp_screen.c @@ -234,6 +234,8 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_CONDITIONAL_RENDER_INVERTED: return 1; case PIPE_CAP_CLIP_HALFZ: + case PIPE_CAP_TEXTURE_FLOAT_LINEAR: + case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: return 1; case PIPE_CAP_VERTEXID_NOBASE: return 0; @@ -242,6 +244,8 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: + case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: + case PIPE_CAP_DEPTH_BOUNDS_TEST: return 0; } /* should only get here on unhandled cases */ diff --git a/src/gallium/drivers/softpipe/sp_screen.h b/src/gallium/drivers/softpipe/sp_screen.h index d39e9f48e80..f0e929111c2 100644 --- a/src/gallium/drivers/softpipe/sp_screen.h +++ b/src/gallium/drivers/softpipe/sp_screen.h @@ -49,7 +49,7 @@ struct softpipe_screen { boolean use_llvm; }; -static INLINE struct softpipe_screen * +static inline struct softpipe_screen * softpipe_screen( struct pipe_screen *pipe ) { return (struct softpipe_screen *)pipe; diff --git a/src/gallium/drivers/softpipe/sp_setup.c b/src/gallium/drivers/softpipe/sp_setup.c index 6704015112b..ff3cb9fe5e1 100644 --- a/src/gallium/drivers/softpipe/sp_setup.c +++ b/src/gallium/drivers/softpipe/sp_setup.c @@ -125,7 +125,7 @@ struct setup_context { /** * Clip setup->quad against the scissor/surface bounds. */ -static INLINE void +static inline void quad_clip(struct setup_context *setup, struct quad_header *quad) { const struct pipe_scissor_state *cliprect = &setup->softpipe->cliprect; @@ -156,7 +156,7 @@ quad_clip(struct setup_context *setup, struct quad_header *quad) /** * Emit a quad (pass to next stage) with clipping. */ -static INLINE void +static inline void clip_emit_quad(struct setup_context *setup, struct quad_header *quad) { quad_clip( setup, quad ); @@ -178,14 +178,14 @@ clip_emit_quad(struct setup_context *setup, struct quad_header *quad) * Given an X or Y coordinate, return the block/quad coordinate that it * belongs to. */ -static INLINE int +static inline int block(int x) { return x & ~(2-1); } -static INLINE int +static inline int block_x(int x) { return x & ~(16-1); @@ -1039,7 +1039,7 @@ setup_line_coefficients(struct setup_context *setup, /** * Plot a pixel in a line segment. */ -static INLINE void +static inline void plot(struct setup_context *setup, int x, int y) { const int iy = y & 1; diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c b/src/gallium/drivers/softpipe/sp_tex_sample.c index 1010b63de2c..565fca632c6 100644 --- a/src/gallium/drivers/softpipe/sp_tex_sample.c +++ b/src/gallium/drivers/softpipe/sp_tex_sample.c @@ -58,7 +58,7 @@ * of improperly weighted linear-filtered textures. * The tests/texwrap.c demo is a good test. 
*/ -static INLINE float +static inline float frac(float f) { return f - floorf(f); @@ -69,7 +69,7 @@ frac(float f) /** * Linear interpolation macro */ -static INLINE float +static inline float lerp(float a, float v0, float v1) { return v0 + a * (v1 - v0); @@ -84,7 +84,7 @@ lerp(float a, float v0, float v1) * optimization! If we find that's not true on some systems, convert * to a macro. */ -static INLINE float +static inline float lerp_2d(float a, float b, float v00, float v10, float v01, float v11) { @@ -97,7 +97,7 @@ lerp_2d(float a, float b, /** * As above, but 3D interpolation of 8 values. */ -static INLINE float +static inline float lerp_3d(float a, float b, float c, float v000, float v100, float v010, float v110, float v001, float v101, float v011, float v111) @@ -115,7 +115,7 @@ lerp_3d(float a, float b, float c, * value. To avoid that problem we add a large multiple of the size * (rather than using a conditional). */ -static INLINE int +static inline int repeat(int coord, unsigned size) { return (coord + size * 1024) % size; @@ -486,7 +486,7 @@ wrap_linear_unorm_clamp_to_edge(float s, unsigned size, int offset, /** * Do coordinate to array index conversion. For array textures. */ -static INLINE int +static inline int coord_to_layer(float coord, unsigned first_layer, unsigned last_layer) { int c = util_ifloor(coord + 0.5F); @@ -587,7 +587,7 @@ compute_lambda_vert(const struct sp_sampler_view *sview, -static INLINE const float * +static inline const float * get_texel_2d_no_border(const struct sp_sampler_view *sp_sview, union tex_tile_address addr, int x, int y) { @@ -603,7 +603,7 @@ get_texel_2d_no_border(const struct sp_sampler_view *sp_sview, } -static INLINE const float * +static inline const float * get_texel_2d(const struct sp_sampler_view *sp_sview, const struct sp_sampler *sp_samp, union tex_tile_address addr, int x, int y) @@ -695,7 +695,7 @@ static const unsigned face_array[PIPE_TEX_FACE_MAX][4] = { PIPE_TEX_FACE_POS_Y, PIPE_TEX_FACE_NEG_Y } }; -static INLINE unsigned +static inline unsigned get_next_face(unsigned face, int idx) { return face_array[face][idx]; @@ -705,7 +705,7 @@ get_next_face(unsigned face, int idx) * return a new xcoord based on old face, old coords, cube size * and fall_off_index (0 for x-, 1 for x+, 2 for y-, 3 for y+) */ -static INLINE int +static inline int get_next_xcoord(unsigned face, unsigned fall_off_index, int max, int xc, int yc) { if ((face == 0 && fall_off_index != 1) || @@ -743,7 +743,7 @@ get_next_xcoord(unsigned face, unsigned fall_off_index, int max, int xc, int yc) * return a new ycoord based on old face, old coords, cube size * and fall_off_index (0 for x-, 1 for x+, 2 for y-, 3 for y+) */ -static INLINE int +static inline int get_next_ycoord(unsigned face, unsigned fall_off_index, int max, int xc, int yc) { if ((fall_off_index <= 1) && (face <= 1 || face >= 4)) { @@ -771,7 +771,7 @@ get_next_ycoord(unsigned face, unsigned fall_off_index, int max, int xc, int yc) /* Gather a quad of adjacent texels within a tile: */ -static INLINE void +static inline void get_texel_quad_2d_no_border_single_tile(const struct sp_sampler_view *sp_sview, union tex_tile_address addr, unsigned x, unsigned y, @@ -795,7 +795,7 @@ get_texel_quad_2d_no_border_single_tile(const struct sp_sampler_view *sp_sview, /* Gather a quad of potentially non-adjacent texels: */ -static INLINE void +static inline void get_texel_quad_2d_no_border(const struct sp_sampler_view *sp_sview, union tex_tile_address addr, int x0, int y0, @@ -810,7 +810,7 @@ 
get_texel_quad_2d_no_border(const struct sp_sampler_view *sp_sview, /* Can involve a lot of unnecessary checks for border color: */ -static INLINE void +static inline void get_texel_quad_2d(const struct sp_sampler_view *sp_sview, const struct sp_sampler *sp_samp, union tex_tile_address addr, @@ -828,7 +828,7 @@ get_texel_quad_2d(const struct sp_sampler_view *sp_sview, /* 3d variants: */ -static INLINE const float * +static inline const float * get_texel_3d_no_border(const struct sp_sampler_view *sp_sview, union tex_tile_address addr, int x, int y, int z) { @@ -846,7 +846,7 @@ get_texel_3d_no_border(const struct sp_sampler_view *sp_sview, } -static INLINE const float * +static inline const float * get_texel_3d(const struct sp_sampler_view *sp_sview, const struct sp_sampler *sp_samp, union tex_tile_address addr, int x, int y, int z) @@ -866,7 +866,7 @@ get_texel_3d(const struct sp_sampler_view *sp_sview, /* Get texel pointer for 1D array texture */ -static INLINE const float * +static inline const float * get_texel_1d_array(const struct sp_sampler_view *sp_sview, const struct sp_sampler *sp_samp, union tex_tile_address addr, int x, int y) @@ -884,7 +884,7 @@ get_texel_1d_array(const struct sp_sampler_view *sp_sview, /* Get texel pointer for 2D array texture */ -static INLINE const float * +static inline const float * get_texel_2d_array(const struct sp_sampler_view *sp_sview, const struct sp_sampler *sp_samp, union tex_tile_address addr, int x, int y, int layer) @@ -905,7 +905,7 @@ get_texel_2d_array(const struct sp_sampler_view *sp_sview, } -static INLINE const float * +static inline const float * get_texel_cube_seamless(const struct sp_sampler_view *sp_sview, union tex_tile_address addr, int x, int y, float *corner, int layer, unsigned face) @@ -960,7 +960,7 @@ get_texel_cube_seamless(const struct sp_sampler_view *sp_sview, /* Get texel pointer for cube array texture */ -static INLINE const float * +static inline const float * get_texel_cube_array(const struct sp_sampler_view *sp_sview, const struct sp_sampler *sp_samp, union tex_tile_address addr, int x, int y, int layer) @@ -986,7 +986,7 @@ get_texel_cube_array(const struct sp_sampler_view *sp_sview, * If level = 2, then we'll return 64 (the width at level=2). * Return 1 if level > base_pot. */ -static INLINE unsigned +static inline unsigned pot_level_size(unsigned base_pot, unsigned level) { return (base_pot >= level) ? (1 << (base_pot - level)) : 1; @@ -1016,7 +1016,7 @@ print_sample_4(const char *function, float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZ /* Some image-filter fastpaths: */ -static INLINE void +static inline void img_filter_2d_linear_repeat_POT(struct sp_sampler_view *sp_sview, struct sp_sampler *sp_samp, const struct img_filter_args *args, @@ -1070,7 +1070,7 @@ img_filter_2d_linear_repeat_POT(struct sp_sampler_view *sp_sview, } -static INLINE void +static inline void img_filter_2d_nearest_repeat_POT(struct sp_sampler_view *sp_sview, struct sp_sampler *sp_samp, const struct img_filter_args *args, @@ -1104,7 +1104,7 @@ img_filter_2d_nearest_repeat_POT(struct sp_sampler_view *sp_sview, } -static INLINE void +static inline void img_filter_2d_nearest_clamp_POT(struct sp_sampler_view *sp_sview, struct sp_sampler *sp_samp, const struct img_filter_args *args, @@ -1819,7 +1819,7 @@ img_filter_3d_linear(struct sp_sampler_view *sp_sview, * \param lod_in per-fragment lod_bias or explicit_lod. * \param lod returns the per-fragment lod. 
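[Editor's note] Two of the sp_tex_sample.c helpers renamed above carry the core of softpipe's bilinear path: repeat() implements GL_REPEAT wrapping without a conditional by adding a large multiple of the texture size before the modulo (plain % misbehaves for negative coordinates), and lerp_2d() blends four texels with two weights. A self-contained sketch of both, assuming coordinates stay within the range the offset covers:

#include <assert.h>
#include <stdio.h>

/* GL_REPEAT wrap: adding size * 1024 keeps the value being reduced
 * positive for any plausible texel coordinate, so the result is
 * always in [0, size) without a branch. */
static int repeat_wrap(int coord, unsigned size)
{
   return (coord + size * 1024) % size;
}

/* Bilinear blend of four texel values with weights a (x) and b (y). */
static float lerp_2d(float a, float b,
                     float v00, float v10, float v01, float v11)
{
   const float temp0 = v00 + a * (v10 - v00);
   const float temp1 = v01 + a * (v11 - v01);
   return temp0 + b * (temp1 - temp0);
}

int main(void)
{
   assert(repeat_wrap(-1, 8) == 7);   /* negative coord wraps to far edge */
   assert(repeat_wrap(9, 8) == 1);
   printf("%f\n", lerp_2d(0.5f, 0.5f, 0.0f, 1.0f, 1.0f, 2.0f)); /* 1.0 */
   return 0;
}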
*/ -static INLINE void +static inline void compute_lod(const struct pipe_sampler_state *sampler, enum tgsi_sampler_control control, const float biased_lambda, @@ -1859,7 +1859,7 @@ compute_lod(const struct pipe_sampler_state *sampler, * \param lod_in per-fragment lod_bias or explicit_lod. * \param lod results per-fragment lod. */ -static INLINE void +static inline void compute_lambda_lod(struct sp_sampler_view *sp_sview, struct sp_sampler *sp_samp, const float s[TGSI_QUAD_SIZE], @@ -1906,7 +1906,7 @@ compute_lambda_lod(struct sp_sampler_view *sp_sview, } } -static INLINE unsigned +static inline unsigned get_gather_component(const float lod_in[TGSI_QUAD_SIZE]) { /* gather component is stored in lod_in slot as unsigned */ @@ -2789,7 +2789,7 @@ get_linear_wrap(unsigned mode) /** * Is swizzling needed for the given state key? */ -static INLINE bool +static inline bool any_swizzle(const struct pipe_sampler_view *view) { return (view->swizzle_r != PIPE_SWIZZLE_RED || diff --git a/src/gallium/drivers/softpipe/sp_tex_tile_cache.c b/src/gallium/drivers/softpipe/sp_tex_tile_cache.c index 4a421a8f882..21f38b2f859 100644 --- a/src/gallium/drivers/softpipe/sp_tex_tile_cache.c +++ b/src/gallium/drivers/softpipe/sp_tex_tile_cache.c @@ -185,7 +185,7 @@ sp_flush_tex_tile_cache(struct softpipe_tex_tile_cache *tc) * This is basically a direct-map cache. * XXX There's probably lots of ways in which we can improve this. */ -static INLINE uint +static inline uint tex_cache_pos( union tex_tile_address addr ) { uint entry = (addr.bits.x + diff --git a/src/gallium/drivers/softpipe/sp_tex_tile_cache.h b/src/gallium/drivers/softpipe/sp_tex_tile_cache.h index 2233effc439..b7ad222d715 100644 --- a/src/gallium/drivers/softpipe/sp_tex_tile_cache.h +++ b/src/gallium/drivers/softpipe/sp_tex_tile_cache.h @@ -127,7 +127,7 @@ extern const struct softpipe_tex_cached_tile * sp_find_cached_tile_tex(struct softpipe_tex_tile_cache *tc, union tex_tile_address addr ); -static INLINE union tex_tile_address +static inline union tex_tile_address tex_tile_address( unsigned x, unsigned y, unsigned z, @@ -147,7 +147,7 @@ tex_tile_address( unsigned x, /* Quickly retrieve tile if it matches last lookup. */ -static INLINE const struct softpipe_tex_cached_tile * +static inline const struct softpipe_tex_cached_tile * sp_get_cached_tile_tex(struct softpipe_tex_tile_cache *tc, union tex_tile_address addr ) { diff --git a/src/gallium/drivers/softpipe/sp_texture.h b/src/gallium/drivers/softpipe/sp_texture.h index 1701bf574d9..fbf741a9c72 100644 --- a/src/gallium/drivers/softpipe/sp_texture.h +++ b/src/gallium/drivers/softpipe/sp_texture.h @@ -81,13 +81,13 @@ struct softpipe_transfer /** cast wrappers */ -static INLINE struct softpipe_resource * +static inline struct softpipe_resource * softpipe_resource(struct pipe_resource *pt) { return (struct softpipe_resource *) pt; } -static INLINE struct softpipe_transfer * +static inline struct softpipe_transfer * softpipe_transfer(struct pipe_transfer *pt) { return (struct softpipe_transfer *) pt; @@ -99,7 +99,7 @@ softpipe_transfer(struct pipe_transfer *pt) * This is a short-cut instead of using map()/unmap(), which should * probably be fixed. 
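[Editor's note] tex_cache_pos() in the sp_tex_tile_cache.c hunk above maps a tile address to a slot in a direct-mapped cache: mix the address fields into one integer and reduce it modulo the entry count, so each address owns exactly one slot and a collision simply evicts. A sketch of the scheme with invented field weights and entry count (the real softpipe hash uses more fields):

#include <stdio.h>

#define NUM_ENTRIES 32  /* hypothetical cache size */

struct tile_addr {
   unsigned x, y, level;
};

/* Direct-mapped: one candidate slot per address, so a lookup is a
 * single address compare rather than a search. */
static unsigned tile_cache_pos(struct tile_addr addr)
{
   return (addr.x + addr.y * 9 + addr.level * 7) % NUM_ENTRIES;
}

int main(void)
{
   struct tile_addr a = { 3, 2, 1 };
   printf("slot %u\n", tile_cache_pos(a)); /* (3 + 18 + 7) % 32 = 28 */
   return 0;
}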
*/ -static INLINE void * +static inline void * softpipe_resource_data(struct pipe_resource *pt) { if (!pt) diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.c b/src/gallium/drivers/softpipe/sp_tile_cache.c index b763f526e61..9cc8ac12525 100644 --- a/src/gallium/drivers/softpipe/sp_tile_cache.c +++ b/src/gallium/drivers/softpipe/sp_tile_cache.c @@ -52,7 +52,7 @@ sp_alloc_tile(struct softpipe_tile_cache *tc); (((x) + (y) * 5 + (l) * 10) % NUM_ENTRIES) -static INLINE int addr_to_clear_pos(union tile_address addr) +static inline int addr_to_clear_pos(union tile_address addr) { int pos; pos = addr.bits.layer * (MAX_WIDTH / TILE_SIZE) * (MAX_HEIGHT / TILE_SIZE); @@ -63,7 +63,7 @@ static INLINE int addr_to_clear_pos(union tile_address addr) /** * Is the tile at (x,y) in cleared state? */ -static INLINE uint +static inline uint is_clear_flag_set(const uint *bitvec, union tile_address addr, unsigned max) { int pos, bit; @@ -77,7 +77,7 @@ is_clear_flag_set(const uint *bitvec, union tile_address addr, unsigned max) /** * Mark the tile at (x,y) as not cleared. */ -static INLINE void +static inline void clear_clear_flag(uint *bitvec, union tile_address addr, unsigned max) { int pos; diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.h b/src/gallium/drivers/softpipe/sp_tile_cache.h index 167e1ffcada..2c0bafad651 100644 --- a/src/gallium/drivers/softpipe/sp_tile_cache.h +++ b/src/gallium/drivers/softpipe/sp_tile_cache.h @@ -128,7 +128,7 @@ sp_find_cached_tile(struct softpipe_tile_cache *tc, union tile_address addr ); -static INLINE union tile_address +static inline union tile_address tile_address( unsigned x, unsigned y, unsigned layer ) { @@ -143,7 +143,7 @@ tile_address( unsigned x, /* Quickly retrieve tile if it matches last lookup. */ -static INLINE struct softpipe_cached_tile * +static inline struct softpipe_cached_tile * sp_get_cached_tile(struct softpipe_tile_cache *tc, int x, int y, int layer ) { diff --git a/src/gallium/drivers/svga/Makefile.am b/src/gallium/drivers/svga/Makefile.am index e0a8cad7208..d46de95e4b4 100644 --- a/src/gallium/drivers/svga/Makefile.am +++ b/src/gallium/drivers/svga/Makefile.am @@ -20,8 +20,6 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. 
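[Editor's note] The sp_tile_cache.c hunks above keep per-tile "cleared" state in a flat bit vector: a tile address is flattened to a bit position, then tested or cleared with shift-and-mask. A minimal sketch of that bookkeeping, with an invented grid size:

#include <assert.h>
#include <string.h>

#define TILES_X 64
#define TILES_Y 64
#define NUM_BITS (TILES_X * TILES_Y)

static int tile_pos(int x, int y) { return y * TILES_X + x; }

static unsigned is_clear_flag_set(const unsigned *bitvec, int x, int y)
{
   int pos = tile_pos(x, y);
   return bitvec[pos / 32] & (1u << (pos & 31));
}

static void clear_clear_flag(unsigned *bitvec, int x, int y)
{
   int pos = tile_pos(x, y);
   bitvec[pos / 32] &= ~(1u << (pos & 31));
}

int main(void)
{
   unsigned flags[NUM_BITS / 32];
   memset(flags, 0xff, sizeof flags);   /* every tile starts "cleared" */
   assert(is_clear_flag_set(flags, 5, 7));
   clear_clear_flag(flags, 5, 7);       /* tile (5,7) now holds real data */
   assert(!is_clear_flag_set(flags, 5, 7));
   return 0;
}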
-AUTOMAKE_OPTIONS = subdir-objects - include Makefile.sources include $(top_srcdir)/src/gallium/Automake.inc diff --git a/src/gallium/drivers/svga/SConscript b/src/gallium/drivers/svga/SConscript index bb4d034f1eb..0ee624616f9 100644 --- a/src/gallium/drivers/svga/SConscript +++ b/src/gallium/drivers/svga/SConscript @@ -11,7 +11,6 @@ if env['suncc']: if env['gcc'] or env['clang']: env.Append(CPPDEFINES = [ 'HAVE_STDINT_H', - 'HAVE_SYS_TYPES_H', ]) env.Prepend(CPPPATH = [ diff --git a/src/gallium/drivers/svga/include/svga3d_shaderdefs.h b/src/gallium/drivers/svga/include/svga3d_shaderdefs.h index 355edfdb702..5e00906ce36 100644 --- a/src/gallium/drivers/svga/include/svga3d_shaderdefs.h +++ b/src/gallium/drivers/svga/include/svga3d_shaderdefs.h @@ -507,7 +507,7 @@ static const uint32 SVGA3D_OUTPUT_REG_DEPTH_NUM_PS20 = 1; *---------------------------------------------------------------------- */ -static INLINE SVGA3dShaderRegType +static inline SVGA3dShaderRegType SVGA3dShaderGetRegType(uint32 token) { SVGA3dShaderSrcToken src; diff --git a/src/gallium/drivers/svga/include/svga_overlay.h b/src/gallium/drivers/svga/include/svga_overlay.h index 0f242dd402c..ccbf7912e6d 100644 --- a/src/gallium/drivers/svga/include/svga_overlay.h +++ b/src/gallium/drivers/svga/include/svga_overlay.h @@ -133,7 +133,7 @@ struct { *---------------------------------------------------------------------- */ -static INLINE Bool +static inline Bool VMwareVideoGetAttributes(const SVGAOverlayFormat format, // IN uint32 *width, // IN / OUT uint32 *height, // IN / OUT diff --git a/src/gallium/drivers/svga/svga_cmd.c b/src/gallium/drivers/svga/svga_cmd.c index 474b75c3c86..b271832171d 100644 --- a/src/gallium/drivers/svga/svga_cmd.c +++ b/src/gallium/drivers/svga/svga_cmd.c @@ -57,7 +57,7 @@ *---------------------------------------------------------------------- */ -static INLINE void +static inline void surface_to_surfaceid(struct svga_winsys_context *swc, // IN struct pipe_surface *surface, // IN SVGA3dSurfaceImageId *id, // OUT diff --git a/src/gallium/drivers/svga/svga_context.h b/src/gallium/drivers/svga/svga_context.h index 630f5f77d66..71f038df8c1 100644 --- a/src/gallium/drivers/svga/svga_context.h +++ b/src/gallium/drivers/svga/svga_context.h @@ -485,20 +485,20 @@ svga_context_create(struct pipe_screen *screen, * Inline conversion functions. These are better-typed than the * macros used previously: */ -static INLINE struct svga_context * +static inline struct svga_context * svga_context( struct pipe_context *pipe ) { return (struct svga_context *)pipe; } -static INLINE boolean +static inline boolean svga_have_gb_objects(const struct svga_context *svga) { return svga_screen(svga->pipe.screen)->sws->have_gb_objects; } -static INLINE boolean +static inline boolean svga_have_gb_dma(const struct svga_context *svga) { return svga_screen(svga->pipe.screen)->sws->have_gb_dma; diff --git a/src/gallium/drivers/svga/svga_debug.h b/src/gallium/drivers/svga/svga_debug.h index 3a3fcd8fae2..82c9b602d5d 100644 --- a/src/gallium/drivers/svga/svga_debug.h +++ b/src/gallium/drivers/svga/svga_debug.h @@ -53,7 +53,7 @@ extern int SVGA_DEBUG; #define DBSTR(x) "" #endif -static INLINE void +static inline void SVGA_DBG( unsigned flag, const char *fmt, ... 
) { #ifdef DEBUG diff --git a/src/gallium/drivers/svga/svga_draw_private.h b/src/gallium/drivers/svga/svga_draw_private.h index 1b054038e9f..9ab87e8259a 100644 --- a/src/gallium/drivers/svga/svga_draw_private.h +++ b/src/gallium/drivers/svga/svga_draw_private.h @@ -57,7 +57,7 @@ static const unsigned svga_hw_prims = * PIPE_PRIM_QUADS, PIPE_PRIM_QUAD_STRIP or PIPE_PRIM_POLYGON. We convert * those to other types of primitives with index/translation code. */ -static INLINE unsigned +static inline unsigned svga_translate_prim(unsigned mode, unsigned vcount,unsigned *prim_count) { switch (mode) { diff --git a/src/gallium/drivers/svga/svga_pipe_blend.c b/src/gallium/drivers/svga/svga_pipe_blend.c index 594eec7166e..2890516c0cf 100644 --- a/src/gallium/drivers/svga/svga_pipe_blend.c +++ b/src/gallium/drivers/svga/svga_pipe_blend.c @@ -33,7 +33,7 @@ #include "svga_hw_reg.h" -static INLINE unsigned +static inline unsigned svga_translate_blend_factor(unsigned factor) { switch (factor) { @@ -58,7 +58,7 @@ svga_translate_blend_factor(unsigned factor) } } -static INLINE unsigned +static inline unsigned svga_translate_blend_func(unsigned mode) { switch (mode) { diff --git a/src/gallium/drivers/svga/svga_pipe_depthstencil.c b/src/gallium/drivers/svga/svga_pipe_depthstencil.c index cb07dbe09a3..8db21fd7476 100644 --- a/src/gallium/drivers/svga/svga_pipe_depthstencil.c +++ b/src/gallium/drivers/svga/svga_pipe_depthstencil.c @@ -32,7 +32,7 @@ #include "svga_hw_reg.h" -static INLINE unsigned +static inline unsigned svga_translate_compare_func(unsigned func) { switch (func) { @@ -50,7 +50,7 @@ svga_translate_compare_func(unsigned func) } } -static INLINE unsigned +static inline unsigned svga_translate_stencil_op(unsigned op) { switch (op) { diff --git a/src/gallium/drivers/svga/svga_pipe_query.c b/src/gallium/drivers/svga/svga_pipe_query.c index a97a9c46cf8..208a2cd14bf 100644 --- a/src/gallium/drivers/svga/svga_pipe_query.c +++ b/src/gallium/drivers/svga/svga_pipe_query.c @@ -59,7 +59,7 @@ struct svga_query { /** cast wrapper */ -static INLINE struct svga_query * +static inline struct svga_query * svga_query( struct pipe_query *q ) { return (struct svga_query *)q; diff --git a/src/gallium/drivers/svga/svga_pipe_sampler.c b/src/gallium/drivers/svga/svga_pipe_sampler.c index 8a87bb467aa..effd490dd22 100644 --- a/src/gallium/drivers/svga/svga_pipe_sampler.c +++ b/src/gallium/drivers/svga/svga_pipe_sampler.c @@ -35,7 +35,7 @@ #include "svga_debug.h" -static INLINE unsigned +static inline unsigned translate_wrap_mode(unsigned wrap) { switch (wrap) { @@ -68,7 +68,7 @@ translate_wrap_mode(unsigned wrap) } } -static INLINE unsigned translate_img_filter( unsigned filter ) +static inline unsigned translate_img_filter( unsigned filter ) { switch (filter) { case PIPE_TEX_FILTER_NEAREST: return SVGA3D_TEX_FILTER_NEAREST; @@ -79,7 +79,7 @@ static INLINE unsigned translate_img_filter( unsigned filter ) } } -static INLINE unsigned translate_mip_filter( unsigned filter ) +static inline unsigned translate_mip_filter( unsigned filter ) { switch (filter) { case PIPE_TEX_MIPFILTER_NONE: return SVGA3D_TEX_FILTER_NONE; diff --git a/src/gallium/drivers/svga/svga_resource_buffer.c b/src/gallium/drivers/svga/svga_resource_buffer.c index d2c7762e7ff..13f85cddbd5 100644 --- a/src/gallium/drivers/svga/svga_resource_buffer.c +++ b/src/gallium/drivers/svga/svga_resource_buffer.c @@ -45,7 +45,7 @@ * Vertex and index buffers need hardware backing. Constant buffers * do not. No other types of buffers currently supported. 
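[Editor's note] Most of the svga_pipe_*.c functions touched above are thin translation layers: a switch from a gallium enum to the SVGA3D equivalent, with an assert-and-fallback default so new enum values fail loudly in debug builds but stay safe in release builds. A generic sketch of the pattern (both enums invented for illustration):

#include <assert.h>

enum pipe_filter { PIPE_FILTER_NEAREST, PIPE_FILTER_LINEAR };
enum svga_filter { SVGA_FILTER_NEAREST = 1, SVGA_FILTER_LINEAR = 2 };

static unsigned translate_filter(unsigned filter)
{
   switch (filter) {
   case PIPE_FILTER_NEAREST: return SVGA_FILTER_NEAREST;
   case PIPE_FILTER_LINEAR:  return SVGA_FILTER_LINEAR;
   default:
      assert(!"unexpected filter");   /* catches new enum values in debug */
      return SVGA_FILTER_NEAREST;     /* harmless fallback in release */
   }
}

int main(void)
{
   assert(translate_filter(PIPE_FILTER_LINEAR) == SVGA_FILTER_LINEAR);
   return 0;
}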
*/ -static INLINE boolean +static inline boolean svga_buffer_needs_hw_storage(unsigned usage) { return usage & (PIPE_BIND_VERTEX_BUFFER | PIPE_BIND_INDEX_BUFFER); diff --git a/src/gallium/drivers/svga/svga_resource_buffer.h b/src/gallium/drivers/svga/svga_resource_buffer.h index 83b3d342aec..e838beb6661 100644 --- a/src/gallium/drivers/svga/svga_resource_buffer.h +++ b/src/gallium/drivers/svga/svga_resource_buffer.h @@ -190,7 +190,7 @@ struct svga_buffer }; -static INLINE struct svga_buffer * +static inline struct svga_buffer * svga_buffer(struct pipe_resource *buffer) { if (buffer) { @@ -205,7 +205,7 @@ svga_buffer(struct pipe_resource *buffer) * Returns TRUE for user buffers. We may * decide to use an alternate upload path for these buffers. */ -static INLINE boolean +static inline boolean svga_buffer_is_user_buffer( struct pipe_resource *buffer ) { if (buffer) { @@ -219,7 +219,7 @@ svga_buffer_is_user_buffer( struct pipe_resource *buffer ) * Returns a pointer to a struct svga_winsys_screen given a * struct svga_buffer. */ -static INLINE struct svga_winsys_screen * +static inline struct svga_winsys_screen * svga_buffer_winsys_screen(struct svga_buffer *sbuf) { return svga_screen(sbuf->b.b.screen)->sws; @@ -230,7 +230,7 @@ svga_buffer_winsys_screen(struct svga_buffer *sbuf) * Returns whether a buffer has hardware storage that is * visible to the GPU. */ -static INLINE boolean +static inline boolean svga_buffer_has_hw_storage(struct svga_buffer *sbuf) { if (svga_buffer_winsys_screen(sbuf)->have_gb_objects) @@ -242,7 +242,7 @@ svga_buffer_has_hw_storage(struct svga_buffer *sbuf) /** * Map the hardware storage of a buffer. */ -static INLINE void * +static inline void * svga_buffer_hw_storage_map(struct svga_context *svga, struct svga_buffer *sbuf, unsigned flags, boolean *retry) @@ -259,7 +259,7 @@ svga_buffer_hw_storage_map(struct svga_context *svga, /** * Unmap the hardware storage of a buffer. */ -static INLINE void +static inline void svga_buffer_hw_storage_unmap(struct svga_context *svga, struct svga_buffer *sbuf) { diff --git a/src/gallium/drivers/svga/svga_resource_texture.h b/src/gallium/drivers/svga/svga_resource_texture.h index 1ff42fabab9..19dadfb8828 100644 --- a/src/gallium/drivers/svga/svga_resource_texture.h +++ b/src/gallium/drivers/svga/svga_resource_texture.h @@ -106,7 +106,7 @@ struct svga_transfer }; -static INLINE struct svga_texture *svga_texture( struct pipe_resource *resource ) +static inline struct svga_texture *svga_texture( struct pipe_resource *resource ) { struct svga_texture *tex = (struct svga_texture *)resource; assert(tex == NULL || tex->b.vtbl == &svga_texture_vtbl); @@ -114,7 +114,7 @@ static INLINE struct svga_texture *svga_texture( struct pipe_resource *resource } -static INLINE struct svga_transfer * +static inline struct svga_transfer * svga_transfer(struct pipe_transfer *transfer) { assert(transfer); @@ -127,7 +127,7 @@ svga_transfer(struct pipe_transfer *transfer) * This is used to track updates to textures when we draw into * them via a surface. */ -static INLINE void +static inline void svga_age_texture_view(struct svga_texture *tex, unsigned level) { assert(level < Elements(tex->view_age)); @@ -138,7 +138,7 @@ svga_age_texture_view(struct svga_texture *tex, unsigned level) /** * Mark the given texture face/level as being defined. 
*/ -static INLINE void +static inline void svga_define_texture_level(struct svga_texture *tex, unsigned face,unsigned level) { @@ -148,7 +148,7 @@ svga_define_texture_level(struct svga_texture *tex, } -static INLINE bool +static inline bool svga_is_texture_level_defined(const struct svga_texture *tex, unsigned face, unsigned level) { @@ -177,7 +177,7 @@ check_face_level(const struct svga_texture *tex, } -static INLINE void +static inline void svga_set_texture_rendered_to(struct svga_texture *tex, unsigned face, unsigned level) { @@ -186,7 +186,7 @@ svga_set_texture_rendered_to(struct svga_texture *tex, } -static INLINE void +static inline void svga_clear_texture_rendered_to(struct svga_texture *tex, unsigned face, unsigned level) { @@ -195,7 +195,7 @@ svga_clear_texture_rendered_to(struct svga_texture *tex, } -static INLINE boolean +static inline boolean svga_was_texture_rendered_to(const struct svga_texture *tex, unsigned face, unsigned level) { diff --git a/src/gallium/drivers/svga/svga_sampler_view.h b/src/gallium/drivers/svga/svga_sampler_view.h index 2087c1be85e..7f14323f84f 100644 --- a/src/gallium/drivers/svga/svga_sampler_view.h +++ b/src/gallium/drivers/svga/svga_sampler_view.h @@ -86,7 +86,7 @@ svga_destroy_sampler_view_priv(struct svga_sampler_view *v); void svga_debug_describe_sampler_view(char *buf, const struct svga_sampler_view *sv); -static INLINE void +static inline void svga_sampler_view_reference(struct svga_sampler_view **ptr, struct svga_sampler_view *v) { struct svga_sampler_view *old = *ptr; diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c index 56e486786df..66c3deaa9e7 100644 --- a/src/gallium/drivers/svga/svga_screen.c +++ b/src/gallium/drivers/svga/svga_screen.c @@ -309,6 +309,10 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_UMA: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: + case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: + case PIPE_CAP_TEXTURE_FLOAT_LINEAR: + case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: + case PIPE_CAP_DEPTH_BOUNDS_TEST: return 0; } @@ -443,7 +447,9 @@ static int svga_get_shader_param(struct pipe_screen *screen, unsigned shader, en return 0; case PIPE_SHADER_GEOMETRY: case PIPE_SHADER_COMPUTE: - /* no support for geometry or compute shaders at this time */ + case PIPE_SHADER_TESS_CTRL: + case PIPE_SHADER_TESS_EVAL: + /* no support for geometry, tess or compute shaders at this time */ return 0; default: debug_printf("Unexpected shader type (%u) query\n", shader); @@ -543,21 +549,15 @@ svga_fence_reference(struct pipe_screen *screen, static boolean -svga_fence_signalled(struct pipe_screen *screen, - struct pipe_fence_handle *fence) -{ - struct svga_winsys_screen *sws = svga_screen(screen)->sws; - return sws->fence_signalled(sws, fence, 0) == 0; -} - - -static boolean svga_fence_finish(struct pipe_screen *screen, struct pipe_fence_handle *fence, uint64_t timeout) { struct svga_winsys_screen *sws = svga_screen(screen)->sws; + if (!timeout) + return sws->fence_signalled(sws, fence, 0) == 0; + SVGA_DBG(DEBUG_DMA|DEBUG_PERF, "%s fence_ptr %p\n", __FUNCTION__, fence); @@ -645,7 +645,6 @@ svga_screen_create(struct svga_winsys_screen *sws) screen->is_format_supported = svga_is_format_supported; screen->context_create = svga_context_create; screen->fence_reference = svga_fence_reference; - screen->fence_signalled = svga_fence_signalled; screen->fence_finish = svga_fence_finish; screen->get_driver_query_info = svga_get_driver_query_info; svgascreen->sws 
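[Editor's note] The svga_screen.c hunk above folds the old fence_signalled() screen hook into fence_finish(): a zero timeout now means "poll, don't block", so one entry point covers both queries. A sketch of that calling convention with a stubbed winsys (the types and callbacks here are stand-ins, not the real svga_winsys interface):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct fence { bool signalled; };

/* Stand-in for sws->fence_signalled(): returns 0 when signalled. */
static int winsys_fence_signalled(struct fence *f) { return f->signalled ? 0 : 1; }

/* Stand-in for the blocking winsys wait. */
static int winsys_fence_wait(struct fence *f) { f->signalled = true; return 0; }

/* One entry point: timeout == 0 is a non-blocking query, anything
 * else waits — mirroring the removal of the separate hook. */
static bool fence_finish(struct fence *f, uint64_t timeout)
{
   if (!timeout)
      return winsys_fence_signalled(f) == 0;
   return winsys_fence_wait(f) == 0;
}

int main(void)
{
   struct fence f = { false };
   printf("poll: %d\n", fence_finish(&f, 0));          /* 0: not signalled */
   printf("wait: %d\n", fence_finish(&f, UINT64_MAX)); /* 1: done */
   return 0;
}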
= sws; diff --git a/src/gallium/drivers/svga/svga_screen.h b/src/gallium/drivers/svga/svga_screen.h index b85191c4b26..ea1e743dfe5 100644 --- a/src/gallium/drivers/svga/svga_screen.h +++ b/src/gallium/drivers/svga/svga_screen.h @@ -82,7 +82,7 @@ struct svga_screen #ifndef DEBUG /** cast wrapper */ -static INLINE struct svga_screen * +static inline struct svga_screen * svga_screen(struct pipe_screen *pscreen) { return (struct svga_screen *) pscreen; diff --git a/src/gallium/drivers/svga/svga_screen_cache.c b/src/gallium/drivers/svga/svga_screen_cache.c index f63f7836187..3c765394a88 100644 --- a/src/gallium/drivers/svga/svga_screen_cache.c +++ b/src/gallium/drivers/svga/svga_screen_cache.c @@ -76,7 +76,7 @@ surface_size(const struct svga_host_surface_cache_key *key) /** * Compute the bucket for this key. */ -static INLINE unsigned +static inline unsigned svga_screen_cache_bucket(const struct svga_host_surface_cache_key *key) { return util_hash_crc32(key, sizeof *key) % SVGA_HOST_SURFACE_CACHE_BUCKETS; diff --git a/src/gallium/drivers/svga/svga_shader.h b/src/gallium/drivers/svga/svga_shader.h index fd500ae4401..5102159b96a 100644 --- a/src/gallium/drivers/svga/svga_shader.h +++ b/src/gallium/drivers/svga/svga_shader.h @@ -44,7 +44,7 @@ svga_destroy_shader_variant(struct svga_context *svga, /** * Check if a shader's bytecode exceeds the device limits. */ -static INLINE boolean +static inline boolean svga_shader_too_large(const struct svga_context *svga, const struct svga_shader_variant *variant) { diff --git a/src/gallium/drivers/svga/svga_state_fs.c b/src/gallium/drivers/svga/svga_state_fs.c index 566a79407e5..8cdce742b3b 100644 --- a/src/gallium/drivers/svga/svga_state_fs.c +++ b/src/gallium/drivers/svga/svga_state_fs.c @@ -41,7 +41,7 @@ -static INLINE int +static inline int compare_fs_keys(const struct svga_fs_compile_key *a, const struct svga_fs_compile_key *b) { diff --git a/src/gallium/drivers/svga/svga_state_rss.c b/src/gallium/drivers/svga/svga_state_rss.c index fb56b3d36ba..ebb98373e2b 100644 --- a/src/gallium/drivers/svga/svga_state_rss.c +++ b/src/gallium/drivers/svga/svga_state_rss.c @@ -61,7 +61,7 @@ do { \ } while (0) -static INLINE void +static inline void svga_queue_rs( struct rs_queue *q, unsigned rss, unsigned value ) diff --git a/src/gallium/drivers/svga/svga_state_tss.c b/src/gallium/drivers/svga/svga_state_tss.c index 0ab571c0588..41334bd7cb9 100644 --- a/src/gallium/drivers/svga/svga_state_tss.c +++ b/src/gallium/drivers/svga/svga_state_tss.c @@ -274,7 +274,7 @@ do { \ } while (0) -static INLINE void +static inline void svga_queue_tss( struct ts_queue *q, unsigned unit, unsigned tss, diff --git a/src/gallium/drivers/svga/svga_state_vs.c b/src/gallium/drivers/svga/svga_state_vs.c index 545c9d7420f..c2a0f1ee6b1 100644 --- a/src/gallium/drivers/svga/svga_state_vs.c +++ b/src/gallium/drivers/svga/svga_state_vs.c @@ -41,7 +41,7 @@ #include "svga_hw_reg.h" -static INLINE int +static inline int compare_vs_keys(const struct svga_vs_compile_key *a, const struct svga_vs_compile_key *b) { diff --git a/src/gallium/drivers/svga/svga_surface.h b/src/gallium/drivers/svga/svga_surface.h index 7b8f6f018d2..2fa72a1c8f0 100644 --- a/src/gallium/drivers/svga/svga_surface.h +++ b/src/gallium/drivers/svga/svga_surface.h @@ -84,7 +84,7 @@ svga_texture_copy_handle(struct svga_context *svga, unsigned width, unsigned height, unsigned depth); -static INLINE struct svga_surface * +static inline struct svga_surface * svga_surface(struct pipe_surface *surface) { assert(surface); @@ -92,7 +92,7 @@ 
svga_surface(struct pipe_surface *surface) } -static INLINE const struct svga_surface * +static inline const struct svga_surface * svga_surface_const(const struct pipe_surface *surface) { assert(surface); diff --git a/src/gallium/drivers/svga/svga_swtnl_private.h b/src/gallium/drivers/svga/svga_swtnl_private.h index 608950d7af6..e2106e1e8e6 100644 --- a/src/gallium/drivers/svga/svga_swtnl_private.h +++ b/src/gallium/drivers/svga/svga_swtnl_private.h @@ -76,7 +76,7 @@ struct svga_vbuf_render { /** * Basically a cast wrapper. */ -static INLINE struct svga_vbuf_render * +static inline struct svga_vbuf_render * svga_vbuf_render( struct vbuf_render *render ) { assert(render); diff --git a/src/gallium/drivers/svga/svga_tgsi.c b/src/gallium/drivers/svga/svga_tgsi.c index 9aafd851264..2e2ff5e4673 100644 --- a/src/gallium/drivers/svga/svga_tgsi.c +++ b/src/gallium/drivers/svga/svga_tgsi.c @@ -84,7 +84,7 @@ svga_shader_expand(struct svga_shader_emitter *emit) } -static INLINE boolean +static inline boolean reserve(struct svga_shader_emitter *emit, unsigned nr_dwords) { if (emit->ptr - emit->buf + nr_dwords * sizeof(unsigned) >= emit->size) { diff --git a/src/gallium/drivers/svga/svga_tgsi.h b/src/gallium/drivers/svga/svga_tgsi.h index e7a2a134ca5..5c47a4ad39f 100644 --- a/src/gallium/drivers/svga/svga_tgsi.h +++ b/src/gallium/drivers/svga/svga_tgsi.h @@ -124,7 +124,7 @@ struct svga_shader_variant * The real use of this information is matching vertex elements to * fragment shader inputs in the case where vertex shader is disabled. */ -static INLINE void svga_generate_vdecl_semantics( unsigned idx, +static inline void svga_generate_vdecl_semantics( unsigned idx, unsigned *usage, unsigned *usage_index ) { @@ -140,12 +140,12 @@ static INLINE void svga_generate_vdecl_semantics( unsigned idx, -static INLINE unsigned svga_vs_key_size( const struct svga_vs_compile_key *key ) +static inline unsigned svga_vs_key_size( const struct svga_vs_compile_key *key ) { return sizeof *key; } -static INLINE unsigned svga_fs_key_size( const struct svga_fs_compile_key *key ) +static inline unsigned svga_fs_key_size( const struct svga_fs_compile_key *key ) { return (const char *)&key->tex[key->num_textures] - (const char *)key; } diff --git a/src/gallium/drivers/svga/svga_tgsi_emit.h b/src/gallium/drivers/svga/svga_tgsi_emit.h index 1894296e6d7..1a1dac23507 100644 --- a/src/gallium/drivers/svga/svga_tgsi_emit.h +++ b/src/gallium/drivers/svga/svga_tgsi_emit.h @@ -167,7 +167,7 @@ svga_translate_decl_sm30(struct svga_shader_emitter *emit, /** Emit the given SVGA3dShaderInstToken opcode */ -static INLINE boolean +static inline boolean emit_instruction(struct svga_shader_emitter *emit, SVGA3dShaderInstToken opcode) { @@ -176,7 +176,7 @@ emit_instruction(struct svga_shader_emitter *emit, /** Generate a SVGA3dShaderInstToken for the given SVGA3D shader opcode */ -static INLINE SVGA3dShaderInstToken +static inline SVGA3dShaderInstToken inst_token(unsigned opcode) { SVGA3dShaderInstToken inst; @@ -192,7 +192,7 @@ inst_token(unsigned opcode) * Generate a SVGA3dShaderInstToken for the given SVGA3D shader opcode * with the predication flag set. */ -static INLINE SVGA3dShaderInstToken +static inline SVGA3dShaderInstToken inst_token_predicated(unsigned opcode) { SVGA3dShaderInstToken inst; @@ -209,7 +209,7 @@ inst_token_predicated(unsigned opcode) * Generate a SVGA3dShaderInstToken for a SETP instruction (set predicate) * using the given comparison operator (one of SVGA3DOPCOMP_xx). 
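[Editor's note] inst_token() and its siblings in the svga_tgsi_emit.h hunks above build SVGA3D shader tokens by filling a bitfield struct that overlays a raw 32-bit value. A compact sketch of the idiom — the field layout below is invented, not the real SVGA3dShaderInstToken layout, and the checks assume the common LSB-first bitfield allocation of GCC/Clang on little-endian targets:

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical instruction token: opcode in the low bits, a
 * predication flag above it, overlaid on the raw dword. */
union inst_token {
   struct {
      unsigned op         : 16;
      unsigned predicated : 1;
      unsigned pad        : 15;
   } bits;
   uint32_t value;
};

static union inst_token inst_token(unsigned op)
{
   union inst_token t;
   memset(&t, 0, sizeof t);   /* zero the padding before setting fields */
   t.bits.op = op;
   return t;
}

static union inst_token inst_token_predicated(unsigned op)
{
   union inst_token t = inst_token(op);
   t.bits.predicated = 1;
   return t;
}

int main(void)
{
   assert(inst_token(7).value == 7);
   assert(inst_token_predicated(7).value == (7 | (1u << 16)));
   return 0;
}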
*/ -static INLINE SVGA3dShaderInstToken +static inline SVGA3dShaderInstToken inst_token_setp(unsigned operator) { SVGA3dShaderInstToken inst; @@ -227,7 +227,7 @@ inst_token_setp(unsigned operator) * Note that this function is used to create tokens for output registers, * temp registers AND constants (see emit_def_const()). */ -static INLINE SVGA3dShaderDestToken +static inline SVGA3dShaderDestToken dst_register(unsigned file, int number) { SVGA3dShaderDestToken dest; @@ -255,7 +255,7 @@ dst_register(unsigned file, int number) * Apply a writemask to the given SVGA3dShaderDestToken, returning a * new SVGA3dShaderDestToken. */ -static INLINE SVGA3dShaderDestToken +static inline SVGA3dShaderDestToken writemask(SVGA3dShaderDestToken dest, unsigned mask) { assert(dest.mask & mask); @@ -265,7 +265,7 @@ writemask(SVGA3dShaderDestToken dest, unsigned mask) /** Create a SVGA3dShaderSrcToken given a register file and number */ -static INLINE SVGA3dShaderSrcToken +static inline SVGA3dShaderSrcToken src_token(unsigned file, int number) { SVGA3dShaderSrcToken src; @@ -289,7 +289,7 @@ src_token(unsigned file, int number) /** Create a src_register given a register file and register number */ -static INLINE struct src_register +static inline struct src_register src_register(unsigned file, int number) { struct src_register src; @@ -301,7 +301,7 @@ src_register(unsigned file, int number) } /** Translate src_register into SVGA3dShaderDestToken */ -static INLINE SVGA3dShaderDestToken +static inline SVGA3dShaderDestToken dst(struct src_register src) { return dst_register(SVGA3dShaderGetRegType(src.base.value), src.base.num); @@ -309,7 +309,7 @@ dst(struct src_register src) /** Translate SVGA3dShaderDestToken to a src_register */ -static INLINE struct src_register +static inline struct src_register src(SVGA3dShaderDestToken dst) { return src_register(SVGA3dShaderGetRegType(dst.value), dst.num); diff --git a/src/gallium/drivers/svga/svgadump/svga_shader.h b/src/gallium/drivers/svga/svgadump/svga_shader.h index 5db64bf135b..0a2e3d5f345 100644 --- a/src/gallium/drivers/svga/svgadump/svga_shader.h +++ b/src/gallium/drivers/svga/svgadump/svga_shader.h @@ -56,7 +56,7 @@ struct sh_reg unsigned is_reg:1; }; -static INLINE unsigned +static inline unsigned sh_reg_type( struct sh_reg reg ) { return reg.type_lo | (reg.type_hi << 3); @@ -138,7 +138,7 @@ struct sh_dstreg unsigned is_reg:1; }; -static INLINE unsigned +static inline unsigned sh_dstreg_type( struct sh_dstreg reg ) { return reg.type_lo | (reg.type_hi << 3); @@ -169,7 +169,7 @@ struct sh_srcreg unsigned is_reg:1; }; -static INLINE unsigned +static inline unsigned sh_srcreg_type( struct sh_srcreg reg ) { return reg.type_lo | (reg.type_hi << 3); diff --git a/src/gallium/drivers/trace/tr_context.c b/src/gallium/drivers/trace/tr_context.c index 0013c963e7a..7f6d0645112 100644 --- a/src/gallium/drivers/trace/tr_context.c +++ b/src/gallium/drivers/trace/tr_context.c @@ -49,13 +49,13 @@ struct trace_query }; -static INLINE struct trace_query * +static inline struct trace_query * trace_query(struct pipe_query *query) { return (struct trace_query *)query; } -static INLINE struct pipe_query * +static inline struct pipe_query * trace_query_unwrap(struct pipe_query *query) { if (query) { @@ -66,7 +66,7 @@ trace_query_unwrap(struct pipe_query *query) } -static INLINE struct pipe_resource * +static inline struct pipe_resource * trace_resource_unwrap(struct trace_context *tr_ctx, struct pipe_resource *resource) { @@ -82,7 +82,7 @@ trace_resource_unwrap(struct trace_context 
*tr_ctx, } -static INLINE struct pipe_surface * +static inline struct pipe_surface * trace_surface_unwrap(struct trace_context *tr_ctx, struct pipe_surface *surface) { @@ -105,7 +105,7 @@ trace_surface_unwrap(struct trace_context *tr_ctx, } -static INLINE void +static inline void trace_context_draw_vbo(struct pipe_context *_pipe, const struct pipe_draw_info *info) { @@ -125,7 +125,7 @@ trace_context_draw_vbo(struct pipe_context *_pipe, } -static INLINE struct pipe_query * +static inline struct pipe_query * trace_context_create_query(struct pipe_context *_pipe, unsigned query_type, unsigned index) @@ -163,7 +163,7 @@ trace_context_create_query(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_destroy_query(struct pipe_context *_pipe, struct pipe_query *_query) { @@ -185,7 +185,7 @@ trace_context_destroy_query(struct pipe_context *_pipe, } -static INLINE boolean +static inline boolean trace_context_begin_query(struct pipe_context *_pipe, struct pipe_query *query) { @@ -207,7 +207,7 @@ trace_context_begin_query(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_end_query(struct pipe_context *_pipe, struct pipe_query *query) { @@ -227,7 +227,7 @@ trace_context_end_query(struct pipe_context *_pipe, } -static INLINE boolean +static inline boolean trace_context_get_query_result(struct pipe_context *_pipe, struct pipe_query *_query, boolean wait, @@ -262,7 +262,7 @@ trace_context_get_query_result(struct pipe_context *_pipe, } -static INLINE void * +static inline void * trace_context_create_blend_state(struct pipe_context *_pipe, const struct pipe_blend_state *state) { @@ -285,7 +285,7 @@ trace_context_create_blend_state(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_bind_blend_state(struct pipe_context *_pipe, void *state) { @@ -303,7 +303,7 @@ trace_context_bind_blend_state(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_delete_blend_state(struct pipe_context *_pipe, void *state) { @@ -321,7 +321,7 @@ trace_context_delete_blend_state(struct pipe_context *_pipe, } -static INLINE void * +static inline void * trace_context_create_sampler_state(struct pipe_context *_pipe, const struct pipe_sampler_state *state) { @@ -344,7 +344,7 @@ trace_context_create_sampler_state(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_bind_sampler_states(struct pipe_context *_pipe, unsigned shader, unsigned start, @@ -371,7 +371,7 @@ trace_context_bind_sampler_states(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_delete_sampler_state(struct pipe_context *_pipe, void *state) { @@ -389,7 +389,7 @@ trace_context_delete_sampler_state(struct pipe_context *_pipe, } -static INLINE void * +static inline void * trace_context_create_rasterizer_state(struct pipe_context *_pipe, const struct pipe_rasterizer_state *state) { @@ -412,7 +412,7 @@ trace_context_create_rasterizer_state(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_bind_rasterizer_state(struct pipe_context *_pipe, void *state) { @@ -430,7 +430,7 @@ trace_context_bind_rasterizer_state(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_delete_rasterizer_state(struct pipe_context *_pipe, void *state) { @@ -448,7 +448,7 @@ trace_context_delete_rasterizer_state(struct pipe_context *_pipe, } -static INLINE void * +static inline void * trace_context_create_depth_stencil_alpha_state(struct pipe_context 
*_pipe, const struct pipe_depth_stencil_alpha_state *state) { @@ -471,7 +471,7 @@ trace_context_create_depth_stencil_alpha_state(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_bind_depth_stencil_alpha_state(struct pipe_context *_pipe, void *state) { @@ -489,7 +489,7 @@ trace_context_bind_depth_stencil_alpha_state(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_delete_depth_stencil_alpha_state(struct pipe_context *_pipe, void *state) { @@ -508,7 +508,7 @@ trace_context_delete_depth_stencil_alpha_state(struct pipe_context *_pipe, #define TRACE_SHADER_STATE(shader_type) \ - static INLINE void * \ + static inline void * \ trace_context_create_##shader_type##_state(struct pipe_context *_pipe, \ const struct pipe_shader_state *state) \ { \ @@ -524,7 +524,7 @@ trace_context_delete_depth_stencil_alpha_state(struct pipe_context *_pipe, return result; \ } \ \ - static INLINE void \ + static inline void \ trace_context_bind_##shader_type##_state(struct pipe_context *_pipe, \ void *state) \ { \ @@ -537,7 +537,7 @@ trace_context_delete_depth_stencil_alpha_state(struct pipe_context *_pipe, trace_dump_call_end(); \ } \ \ - static INLINE void \ + static inline void \ trace_context_delete_##shader_type##_state(struct pipe_context *_pipe, \ void *state) \ { \ @@ -559,7 +559,7 @@ TRACE_SHADER_STATE(tes) #undef TRACE_SHADER_STATE -static INLINE void * +static inline void * trace_context_create_vertex_elements_state(struct pipe_context *_pipe, unsigned num_elements, const struct pipe_vertex_element *elements) @@ -587,7 +587,7 @@ trace_context_create_vertex_elements_state(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_bind_vertex_elements_state(struct pipe_context *_pipe, void *state) { @@ -605,7 +605,7 @@ trace_context_bind_vertex_elements_state(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_delete_vertex_elements_state(struct pipe_context *_pipe, void *state) { @@ -623,7 +623,7 @@ trace_context_delete_vertex_elements_state(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_set_blend_color(struct pipe_context *_pipe, const struct pipe_blend_color *state) { @@ -641,7 +641,7 @@ trace_context_set_blend_color(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_set_stencil_ref(struct pipe_context *_pipe, const struct pipe_stencil_ref *state) { @@ -659,7 +659,7 @@ trace_context_set_stencil_ref(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_set_clip_state(struct pipe_context *_pipe, const struct pipe_clip_state *state) { @@ -676,7 +676,7 @@ trace_context_set_clip_state(struct pipe_context *_pipe, trace_dump_call_end(); } -static INLINE void +static inline void trace_context_set_sample_mask(struct pipe_context *_pipe, unsigned sample_mask) { @@ -693,7 +693,7 @@ trace_context_set_sample_mask(struct pipe_context *_pipe, trace_dump_call_end(); } -static INLINE void +static inline void trace_context_set_constant_buffer(struct pipe_context *_pipe, uint shader, uint index, struct pipe_constant_buffer *constant_buffer) @@ -721,7 +721,7 @@ trace_context_set_constant_buffer(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_set_framebuffer_state(struct pipe_context *_pipe, const struct pipe_framebuffer_state *state) { @@ -751,7 +751,7 @@ trace_context_set_framebuffer_state(struct pipe_context *_pipe, } -static INLINE void +static inline void 
trace_context_set_polygon_stipple(struct pipe_context *_pipe, const struct pipe_poly_stipple *state) { @@ -769,7 +769,7 @@ trace_context_set_polygon_stipple(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_set_scissor_states(struct pipe_context *_pipe, unsigned start_slot, unsigned num_scissors, @@ -791,7 +791,7 @@ trace_context_set_scissor_states(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_set_viewport_states(struct pipe_context *_pipe, unsigned start_slot, unsigned num_viewports, @@ -938,7 +938,7 @@ trace_context_surface_destroy(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_set_sampler_views(struct pipe_context *_pipe, unsigned shader, unsigned start, @@ -974,7 +974,7 @@ trace_context_set_sampler_views(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_set_vertex_buffers(struct pipe_context *_pipe, unsigned start_slot, unsigned num_buffers, const struct pipe_vertex_buffer *buffers) @@ -1008,7 +1008,7 @@ trace_context_set_vertex_buffers(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_set_index_buffer(struct pipe_context *_pipe, const struct pipe_index_buffer *ib) { @@ -1033,7 +1033,7 @@ trace_context_set_index_buffer(struct pipe_context *_pipe, } -static INLINE struct pipe_stream_output_target * +static inline struct pipe_stream_output_target * trace_context_create_stream_output_target(struct pipe_context *_pipe, struct pipe_resource *res, unsigned buffer_offset, @@ -1063,7 +1063,7 @@ trace_context_create_stream_output_target(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_stream_output_target_destroy( struct pipe_context *_pipe, struct pipe_stream_output_target *target) @@ -1082,7 +1082,7 @@ trace_context_stream_output_target_destroy( } -static INLINE void +static inline void trace_context_set_stream_output_targets(struct pipe_context *_pipe, unsigned num_targets, struct pipe_stream_output_target **tgs, @@ -1104,7 +1104,7 @@ trace_context_set_stream_output_targets(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_resource_copy_region(struct pipe_context *_pipe, struct pipe_resource *dst, unsigned dst_level, @@ -1139,7 +1139,7 @@ trace_context_resource_copy_region(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_blit(struct pipe_context *_pipe, const struct pipe_blit_info *_info) { @@ -1181,7 +1181,7 @@ trace_context_flush_resource(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_clear(struct pipe_context *_pipe, unsigned buffers, const union pipe_color_union *color, @@ -1210,7 +1210,7 @@ trace_context_clear(struct pipe_context *_pipe, } -static INLINE void +static inline void trace_context_clear_render_target(struct pipe_context *_pipe, struct pipe_surface *dst, const union pipe_color_union *color, @@ -1237,7 +1237,7 @@ trace_context_clear_render_target(struct pipe_context *_pipe, trace_dump_call_end(); } -static INLINE void +static inline void trace_context_clear_depth_stencil(struct pipe_context *_pipe, struct pipe_surface *dst, unsigned clear_flags, @@ -1269,7 +1269,7 @@ trace_context_clear_depth_stencil(struct pipe_context *_pipe, trace_dump_call_end(); } -static INLINE void +static inline void trace_context_flush(struct pipe_context *_pipe, struct pipe_fence_handle **fence, unsigned flags) @@ -1291,7 +1291,7 @@ trace_context_flush(struct pipe_context *_pipe, } 
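[Editor's note] Every trace_context_* function above follows one template: cast the tracing proxy back to its wrapper type, dump the call and its arguments, then forward to the real context. A skeletal sketch of the proxy pattern with stand-in types (the trace_dump_* machinery reduced to printf):

#include <stdio.h>

struct pipe_context { void (*set_sample_mask)(struct pipe_context *, unsigned); };

struct trace_context {
   struct pipe_context base;   /* must be first: makes the cast valid */
   struct pipe_context *pipe;  /* the wrapped, real context */
};

static struct trace_context *trace_context(struct pipe_context *p)
{
   return (struct trace_context *)p;
}

static void trace_set_sample_mask(struct pipe_context *_pipe, unsigned mask)
{
   struct trace_context *tr = trace_context(_pipe);

   printf("call set_sample_mask(mask=%u)\n", mask); /* stand-in dump */
   tr->pipe->set_sample_mask(tr->pipe, mask);       /* forward */
}

static void real_set_sample_mask(struct pipe_context *p, unsigned mask)
{
   (void)p;
   printf("real driver got mask %u\n", mask);
}

int main(void)
{
   struct pipe_context real = { real_set_sample_mask };
   struct trace_context tr = { { trace_set_sample_mask }, &real };
   tr.base.set_sample_mask(&tr.base, 0xf);
   return 0;
}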
-static INLINE void +static inline void trace_context_destroy(struct pipe_context *_pipe) { struct trace_context *tr_ctx = trace_context(_pipe); diff --git a/src/gallium/drivers/trace/tr_context.h b/src/gallium/drivers/trace/tr_context.h index 1e5ad88d034..ad57d9d5243 100644 --- a/src/gallium/drivers/trace/tr_context.h +++ b/src/gallium/drivers/trace/tr_context.h @@ -54,7 +54,7 @@ void trace_context_check(const struct pipe_context *pipe); -static INLINE struct trace_context * +static inline struct trace_context * trace_context(struct pipe_context *pipe) { assert(pipe); diff --git a/src/gallium/drivers/trace/tr_dump.c b/src/gallium/drivers/trace/tr_dump.c index 753b92d8b54..601e2cbbec5 100644 --- a/src/gallium/drivers/trace/tr_dump.c +++ b/src/gallium/drivers/trace/tr_dump.c @@ -64,7 +64,7 @@ static long unsigned call_no = 0; static boolean dumping = FALSE; -static INLINE void +static inline void trace_dump_write(const char *buf, size_t size) { if (stream) { @@ -73,14 +73,14 @@ trace_dump_write(const char *buf, size_t size) } -static INLINE void +static inline void trace_dump_writes(const char *s) { trace_dump_write(s, strlen(s)); } -static INLINE void +static inline void trace_dump_writef(const char *format, ...) { static char buf[1024]; @@ -93,7 +93,7 @@ trace_dump_writef(const char *format, ...) } -static INLINE void +static inline void trace_dump_escape(const char *str) { const unsigned char *p = (const unsigned char *)str; @@ -117,7 +117,7 @@ trace_dump_escape(const char *str) } -static INLINE void +static inline void trace_dump_indent(unsigned level) { unsigned i; @@ -126,14 +126,14 @@ trace_dump_indent(unsigned level) } -static INLINE void +static inline void trace_dump_newline(void) { trace_dump_writes("\n"); } -static INLINE void +static inline void trace_dump_tag(const char *name) { trace_dump_writes("<"); @@ -142,7 +142,7 @@ trace_dump_tag(const char *name) } -static INLINE void +static inline void trace_dump_tag_begin(const char *name) { trace_dump_writes("<"); @@ -150,7 +150,7 @@ trace_dump_tag_begin(const char *name) trace_dump_writes(">"); } -static INLINE void +static inline void trace_dump_tag_begin1(const char *name, const char *attr1, const char *value1) { @@ -164,7 +164,7 @@ trace_dump_tag_begin1(const char *name, } -static INLINE void +static inline void trace_dump_tag_begin2(const char *name, const char *attr1, const char *value1, const char *attr2, const char *value2) @@ -183,7 +183,7 @@ trace_dump_tag_begin2(const char *name, } -static INLINE void +static inline void trace_dump_tag_begin3(const char *name, const char *attr1, const char *value1, const char *attr2, const char *value2, @@ -207,7 +207,7 @@ trace_dump_tag_begin3(const char *name, } -static INLINE void +static inline void trace_dump_tag_end(const char *name) { trace_dump_writes("</"); diff --git a/src/gallium/drivers/trace/tr_dump_defines.h b/src/gallium/drivers/trace/tr_dump_defines.h index 0c83c2b68f1..b38d63eac59 100644 --- a/src/gallium/drivers/trace/tr_dump_defines.h +++ b/src/gallium/drivers/trace/tr_dump_defines.h @@ -34,7 +34,7 @@ #include "tr_dump.h" -static INLINE void +static inline void trace_dump_format(enum pipe_format format) { if (!trace_dumping_enabled_locked()) @@ -44,7 +44,7 @@ trace_dump_format(enum pipe_format format) } -static INLINE void +static inline void trace_dump_query_type(unsigned value) { if (!trace_dumping_enabled_locked()) diff --git a/src/gallium/drivers/trace/tr_screen.c b/src/gallium/drivers/trace/tr_screen.c index 266626defa8..1d86a378eea 100644 --- 
a/src/gallium/drivers/trace/tr_screen.c +++ b/src/gallium/drivers/trace/tr_screen.c @@ -370,29 +370,6 @@ trace_screen_fence_reference(struct pipe_screen *_screen, static boolean -trace_screen_fence_signalled(struct pipe_screen *_screen, - struct pipe_fence_handle *fence) -{ - struct trace_screen *tr_scr = trace_screen(_screen); - struct pipe_screen *screen = tr_scr->screen; - int result; - - trace_dump_call_begin("pipe_screen", "fence_signalled"); - - trace_dump_arg(ptr, screen); - trace_dump_arg(ptr, fence); - - result = screen->fence_signalled(screen, fence); - - trace_dump_ret(bool, result); - - trace_dump_call_end(); - - return result; -} - - -static boolean trace_screen_fence_finish(struct pipe_screen *_screen, struct pipe_fence_handle *fence, uint64_t timeout) @@ -503,7 +480,6 @@ trace_screen_create(struct pipe_screen *screen) tr_scr->base.resource_get_handle = trace_screen_resource_get_handle; tr_scr->base.resource_destroy = trace_screen_resource_destroy; tr_scr->base.fence_reference = trace_screen_fence_reference; - tr_scr->base.fence_signalled = trace_screen_fence_signalled; tr_scr->base.fence_finish = trace_screen_fence_finish; tr_scr->base.flush_frontbuffer = trace_screen_flush_frontbuffer; tr_scr->base.get_timestamp = trace_screen_get_timestamp; diff --git a/src/gallium/drivers/trace/tr_texture.h b/src/gallium/drivers/trace/tr_texture.h index 5e45c3c2f8f..e48b7b39e24 100644 --- a/src/gallium/drivers/trace/tr_texture.h +++ b/src/gallium/drivers/trace/tr_texture.h @@ -85,7 +85,7 @@ struct trace_transfer }; -static INLINE struct trace_resource * +static inline struct trace_resource * trace_resource(struct pipe_resource *texture) { if(!texture) @@ -95,7 +95,7 @@ trace_resource(struct pipe_resource *texture) } -static INLINE struct trace_surface * +static inline struct trace_surface * trace_surface(struct pipe_surface *surface) { if(!surface) @@ -105,7 +105,7 @@ trace_surface(struct pipe_surface *surface) } -static INLINE struct trace_sampler_view * +static inline struct trace_sampler_view * trace_sampler_view(struct pipe_sampler_view *sampler_view) { if (!sampler_view) @@ -114,7 +114,7 @@ trace_sampler_view(struct pipe_sampler_view *sampler_view) } -static INLINE struct trace_transfer * +static inline struct trace_transfer * trace_transfer(struct pipe_transfer *transfer) { if(!transfer) diff --git a/src/gallium/drivers/vc4/Makefile.am b/src/gallium/drivers/vc4/Makefile.am index 3f62ce21a9f..f4a57ba3404 100644 --- a/src/gallium/drivers/vc4/Makefile.am +++ b/src/gallium/drivers/vc4/Makefile.am @@ -19,8 +19,6 @@ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS # IN THE SOFTWARE. 
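[Editor's note] The tr_dump.c helpers renamed earlier (trace_dump_tag_begin/_end over the trace_dump_write/writes primitives) emit the trace as nested XML-style tags layered on a single write routine. A tiny sketch of that layering, with stream selection and locking omitted:

#include <stdio.h>
#include <string.h>

static void dump_write(const char *buf, size_t size) { fwrite(buf, 1, size, stdout); }
static void dump_writes(const char *s)               { dump_write(s, strlen(s)); }

static void tag_begin(const char *name)
{
   dump_writes("<");
   dump_writes(name);
   dump_writes(">");
}

static void tag_end(const char *name)
{
   dump_writes("</");
   dump_writes(name);
   dump_writes(">");
}

int main(void)
{
   tag_begin("call");
   dump_writes("clear");
   tag_end("call");
   dump_writes("\n");   /* prints: <call>clear</call> */
   return 0;
}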
-AUTOMAKE_OPTIONS = subdir-objects - include Makefile.sources include $(top_srcdir)/src/gallium/Automake.inc @@ -30,10 +28,10 @@ SIM_LDFLAGS = -lsimpenrose endif AM_CFLAGS = \ + -I$(top_builddir)/src/glsl/nir \ $(LIBDRM_CFLAGS) \ $(GALLIUM_DRIVER_CFLAGS) \ $(SIM_CFLAGS) \ - -I$(top_srcdir)/src/mesa/ \ $() noinst_LTLIBRARIES = libvc4.la diff --git a/src/gallium/drivers/vc4/Makefile.sources b/src/gallium/drivers/vc4/Makefile.sources index 1eb029e67e7..6fb40c20562 100644 --- a/src/gallium/drivers/vc4/Makefile.sources +++ b/src/gallium/drivers/vc4/Makefile.sources @@ -19,6 +19,8 @@ C_SOURCES := \ vc4_fence.c \ vc4_formats.c \ vc4_job.c \ + vc4_nir_lower_blend.c \ + vc4_nir_lower_io.c \ vc4_opt_algebraic.c \ vc4_opt_constant_folding.c \ vc4_opt_copy_propagation.c \ @@ -49,4 +51,5 @@ C_SOURCES := \ vc4_state.c \ vc4_tiling.c \ vc4_tiling.h \ + vc4_uniforms.c \ $() diff --git a/src/gallium/drivers/vc4/kernel/vc4_drv.h b/src/gallium/drivers/vc4/kernel/vc4_drv.h index 1fd8aa9fb28..ffc973735ae 100644 --- a/src/gallium/drivers/vc4/kernel/vc4_drv.h +++ b/src/gallium/drivers/vc4/kernel/vc4_drv.h @@ -26,17 +26,6 @@ #include "vc4_simulator_validate.h" -enum vc4_bo_mode { - VC4_MODE_UNDECIDED, - VC4_MODE_RENDER, - VC4_MODE_SHADER, -}; - -struct vc4_bo_exec_state { - struct drm_gem_cma_object *bo; - enum vc4_bo_mode mode; -}; - struct vc4_exec_info { /* Sequence number for this bin/render job. */ uint64_t seqno; @@ -47,7 +36,7 @@ struct vc4_exec_info { /* This is the array of BOs that were looked up at the start of exec. * Command validation will use indices into this array. */ - struct vc4_bo_exec_state *bo; + struct drm_gem_cma_object **bo; uint32_t bo_count; /* List of other BOs used in the job that need to be released @@ -72,7 +61,6 @@ struct vc4_exec_info { * command lists. */ struct vc4_shader_state { - uint8_t packet; uint32_t addr; /* Maximum vertex index referenced by any primitive using this * shader state. @@ -88,6 +76,7 @@ struct vc4_exec_info { bool found_tile_binning_mode_config_packet; bool found_start_tile_binning_packet; bool found_increment_semaphore_packet; + bool found_flush; uint8_t bin_tiles_x, bin_tiles_y; struct drm_gem_cma_object *tile_bo; uint32_t tile_alloc_offset; @@ -99,6 +88,9 @@ struct vc4_exec_info { uint32_t ct0ca, ct0ea; uint32_t ct1ca, ct1ea; + /* Pointer to the unvalidated bin CL (if present). */ + void *bin_u; + /* Pointers to the shader recs. 
These paddr gets incremented as CL * packets are relocated in validate_gl_shader_state, and the vaddrs * (u and v) get incremented and size decremented as the shader recs @@ -168,10 +160,8 @@ vc4_validate_shader_recs(struct drm_device *dev, struct vc4_exec_info *exec); struct vc4_validated_shader_info * vc4_validate_shader(struct drm_gem_cma_object *shader_obj); -bool vc4_use_bo(struct vc4_exec_info *exec, - uint32_t hindex, - enum vc4_bo_mode mode, - struct drm_gem_cma_object **obj); +struct drm_gem_cma_object *vc4_use_bo(struct vc4_exec_info *exec, + uint32_t hindex); int vc4_get_rcl(struct drm_device *dev, struct vc4_exec_info *exec); diff --git a/src/gallium/drivers/vc4/kernel/vc4_gem.c b/src/gallium/drivers/vc4/kernel/vc4_gem.c index e4b7fea5968..93f9ec7ed9b 100644 --- a/src/gallium/drivers/vc4/kernel/vc4_gem.c +++ b/src/gallium/drivers/vc4/kernel/vc4_gem.c @@ -112,6 +112,8 @@ vc4_get_bcl(struct drm_device *dev, struct vc4_exec_info *exec) exec->ct0ca = exec->exec_bo->paddr + bin_offset; + exec->bin_u = bin; + exec->shader_rec_v = exec->exec_bo->vaddr + shader_rec_offset; exec->shader_rec_p = exec->exec_bo->paddr + shader_rec_offset; exec->shader_rec_size = args->shader_rec_size; diff --git a/src/gallium/drivers/vc4/kernel/vc4_packet.h b/src/gallium/drivers/vc4/kernel/vc4_packet.h index 88cfc0fa9f0..771e2b78761 100644 --- a/src/gallium/drivers/vc4/kernel/vc4_packet.h +++ b/src/gallium/drivers/vc4/kernel/vc4_packet.h @@ -88,16 +88,22 @@ enum vc4_packet { #define VC4_PACKET_START_TILE_BINNING_SIZE 1 #define VC4_PACKET_INCREMENT_SEMAPHORE_SIZE 1 #define VC4_PACKET_WAIT_ON_SEMAPHORE_SIZE 1 +#define VC4_PACKET_BRANCH_SIZE 5 #define VC4_PACKET_BRANCH_TO_SUB_LIST_SIZE 5 #define VC4_PACKET_STORE_MS_TILE_BUFFER_SIZE 1 #define VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF_SIZE 1 +#define VC4_PACKET_STORE_FULL_RES_TILE_BUFFER_SIZE 5 +#define VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER_SIZE 5 #define VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE 7 #define VC4_PACKET_LOAD_TILE_BUFFER_GENERAL_SIZE 7 #define VC4_PACKET_GL_INDEXED_PRIMITIVE_SIZE 14 #define VC4_PACKET_GL_ARRAY_PRIMITIVE_SIZE 10 +#define VC4_PACKET_COMPRESSED_PRIMITIVE_SIZE 1 +#define VC4_PACKET_CLIPPED_COMPRESSED_PRIMITIVE_SIZE 1 #define VC4_PACKET_PRIMITIVE_LIST_FORMAT_SIZE 2 #define VC4_PACKET_GL_SHADER_STATE_SIZE 5 #define VC4_PACKET_NV_SHADER_STATE_SIZE 5 +#define VC4_PACKET_VG_SHADER_STATE_SIZE 5 #define VC4_PACKET_CONFIGURATION_BITS_SIZE 4 #define VC4_PACKET_FLAT_SHADE_FLAGS_SIZE 5 #define VC4_PACKET_POINT_SIZE_SIZE 5 @@ -106,6 +112,7 @@ enum vc4_packet { #define VC4_PACKET_DEPTH_OFFSET_SIZE 5 #define VC4_PACKET_CLIP_WINDOW_SIZE 9 #define VC4_PACKET_VIEWPORT_OFFSET_SIZE 5 +#define VC4_PACKET_Z_CLIPPING_SIZE 9 #define VC4_PACKET_CLIPPER_XY_SCALING_SIZE 9 #define VC4_PACKET_CLIPPER_Z_SCALING_SIZE 9 #define VC4_PACKET_TILE_BINNING_MODE_CONFIG_SIZE 16 @@ -136,6 +143,16 @@ enum vc4_packet { /** @{ * + * low bits of VC4_PACKET_STORE_FULL_RES_TILE_BUFFER and + * VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER. 
+ */ +#define VC4_LOADSTORE_FULL_RES_EOF (1 << 3) +#define VC4_LOADSTORE_FULL_RES_DISABLE_CLEAR_ALL (1 << 2) +#define VC4_LOADSTORE_FULL_RES_DISABLE_ZS (1 << 1) +#define VC4_LOADSTORE_FULL_RES_DISABLE_COLOR (1 << 0) + +/** @{ + * * byte 2 of VC4_PACKET_STORE_TILE_BUFFER_GENERAL and * VC4_PACKET_LOAD_TILE_BUFFER_GENERAL (low bits of the address) */ diff --git a/src/gallium/drivers/vc4/kernel/vc4_render_cl.c b/src/gallium/drivers/vc4/kernel/vc4_render_cl.c index e2d907ad91f..b827eb7e9e1 100644 --- a/src/gallium/drivers/vc4/kernel/vc4_render_cl.c +++ b/src/gallium/drivers/vc4/kernel/vc4_render_cl.c @@ -100,7 +100,8 @@ static void emit_tile(struct vc4_exec_info *exec, struct vc4_rcl_setup *setup, uint8_t x, uint8_t y, bool first, bool last) { - bool has_bin = exec->args->bin_cl_size != 0; + struct drm_vc4_submit_cl *args = exec->args; + bool has_bin = args->bin_cl_size != 0; /* Note that the load doesn't actually occur until the * tile coords packet is processed, and only one load @@ -108,10 +109,9 @@ static void emit_tile(struct vc4_exec_info *exec, */ if (setup->color_read) { rcl_u8(setup, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL); - rcl_u16(setup, exec->args->color_read.bits); + rcl_u16(setup, args->color_read.bits); rcl_u32(setup, - setup->color_read->paddr + - exec->args->color_read.offset); + setup->color_read->paddr + args->color_read.offset); } if (setup->zs_read) { @@ -122,9 +122,8 @@ static void emit_tile(struct vc4_exec_info *exec, } rcl_u8(setup, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL); - rcl_u16(setup, exec->args->zs_read.bits); - rcl_u32(setup, - setup->zs_read->paddr + exec->args->zs_read.offset); + rcl_u16(setup, args->zs_read.bits); + rcl_u32(setup, setup->zs_read->paddr + args->zs_read.offset); } /* Clipping depends on tile coordinates having been @@ -147,11 +146,11 @@ static void emit_tile(struct vc4_exec_info *exec, if (setup->zs_write) { rcl_u8(setup, VC4_PACKET_STORE_TILE_BUFFER_GENERAL); - rcl_u16(setup, exec->args->zs_write.bits | + rcl_u16(setup, args->zs_write.bits | (setup->color_ms_write ? VC4_STORE_TILE_BUFFER_DISABLE_COLOR_CLEAR : 0)); rcl_u32(setup, - (setup->zs_write->paddr + exec->args->zs_write.offset) | + (setup->zs_write->paddr + args->zs_write.offset) | ((last && !setup->color_ms_write) ? 
VC4_LOADSTORE_TILE_BUFFER_EOF : 0)); } @@ -172,11 +171,12 @@ static void emit_tile(struct vc4_exec_info *exec, static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec, struct vc4_rcl_setup *setup) { - bool has_bin = exec->args->bin_cl_size != 0; - uint8_t min_x_tile = exec->args->min_x_tile; - uint8_t min_y_tile = exec->args->min_y_tile; - uint8_t max_x_tile = exec->args->max_x_tile; - uint8_t max_y_tile = exec->args->max_y_tile; + struct drm_vc4_submit_cl *args = exec->args; + bool has_bin = args->bin_cl_size != 0; + uint8_t min_x_tile = args->min_x_tile; + uint8_t min_y_tile = args->min_y_tile; + uint8_t max_x_tile = args->max_x_tile; + uint8_t max_y_tile = args->max_y_tile; uint8_t xtiles = max_x_tile - min_x_tile + 1; uint8_t ytiles = max_y_tile - min_y_tile + 1; uint8_t x, y; @@ -185,7 +185,7 @@ static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec, size = VC4_PACKET_TILE_RENDERING_MODE_CONFIG_SIZE; loop_body_size = VC4_PACKET_TILE_COORDINATES_SIZE; - if (exec->args->flags & VC4_SUBMIT_CL_USE_CLEAR_COLOR) { + if (args->flags & VC4_SUBMIT_CL_USE_CLEAR_COLOR) { size += VC4_PACKET_CLEAR_COLORS_SIZE + VC4_PACKET_TILE_COORDINATES_SIZE + VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE; @@ -208,7 +208,7 @@ static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec, } if (setup->zs_write) - loop_body_size += VC4_PACKET_LOAD_TILE_BUFFER_GENERAL_SIZE; + loop_body_size += VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE; if (setup->color_ms_write) { if (setup->zs_write) loop_body_size += VC4_PACKET_TILE_COORDINATES_SIZE; @@ -226,23 +226,23 @@ static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec, rcl_u32(setup, (setup->color_ms_write ? (setup->color_ms_write->paddr + - exec->args->color_ms_write.offset) : + args->color_ms_write.offset) : 0)); - rcl_u16(setup, exec->args->width); - rcl_u16(setup, exec->args->height); - rcl_u16(setup, exec->args->color_ms_write.bits); + rcl_u16(setup, args->width); + rcl_u16(setup, args->height); + rcl_u16(setup, args->color_ms_write.bits); /* The tile buffer gets cleared when the previous tile is stored. If * the clear values changed between frames, then the tile buffer has * stale clear values in it, so we have to do a store in None mode (no * writes) so that we trigger the tile buffer clear. 
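/* A condensed view of the sizing contract in vc4_create_rcl_bo (a sketch, not
 * part of the patch): the render list is emitted into a buffer of a
 * precomputed size, so each branch of the size accounting must mirror an emit
 * path one-to-one.  The zs_write hunk above swaps in the store packet's size;
 * load and store are both 7 bytes per vc4_packet.h, so the old total was
 * still correct in practice -- a latent copy-and-paste rather than an overflow.
 */
static uint32_t rcl_size_for(uint32_t xtiles, uint32_t ytiles,
                             uint32_t loop_body_size)
{
        uint32_t size = VC4_PACKET_TILE_RENDERING_MODE_CONFIG_SIZE;

        size += xtiles * ytiles * loop_body_size; /* one loop body per tile */
        return size; /* every later rcl_u8/u16/u32 must land inside this */
}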
*/ - if (exec->args->flags & VC4_SUBMIT_CL_USE_CLEAR_COLOR) { + if (args->flags & VC4_SUBMIT_CL_USE_CLEAR_COLOR) { rcl_u8(setup, VC4_PACKET_CLEAR_COLORS); - rcl_u32(setup, exec->args->clear_color[0]); - rcl_u32(setup, exec->args->clear_color[1]); - rcl_u32(setup, exec->args->clear_z); - rcl_u8(setup, exec->args->clear_s); + rcl_u32(setup, args->clear_color[0]); + rcl_u32(setup, args->clear_color[1]); + rcl_u32(setup, args->clear_z); + rcl_u8(setup, args->clear_s); vc4_tile_coordinates(setup, 0, 0); @@ -286,7 +286,8 @@ static int vc4_rcl_surface_setup(struct vc4_exec_info *exec, if (surf->hindex == ~0) return 0; - if (!vc4_use_bo(exec, surf->hindex, VC4_MODE_RENDER, obj)) + *obj = vc4_use_bo(exec, surf->hindex); + if (!*obj) return -EINVAL; if (surf->bits & ~(VC4_LOADSTORE_TILE_BUFFER_TILING_MASK | @@ -365,7 +366,8 @@ vc4_rcl_ms_surface_setup(struct vc4_exec_info *exec, if (surf->hindex == ~0) return 0; - if (!vc4_use_bo(exec, surf->hindex, VC4_MODE_RENDER, obj)) + *obj = vc4_use_bo(exec, surf->hindex); + if (!*obj) return -EINVAL; if (tiling > VC4_TILING_FORMAT_LT) { diff --git a/src/gallium/drivers/vc4/kernel/vc4_validate.c b/src/gallium/drivers/vc4/kernel/vc4_validate.c index a0b67a7e50b..b248831113c 100644 --- a/src/gallium/drivers/vc4/kernel/vc4_validate.c +++ b/src/gallium/drivers/vc4/kernel/vc4_validate.c @@ -94,42 +94,42 @@ size_is_lt(uint32_t width, uint32_t height, int cpp) height <= 4 * utile_height(cpp)); } -bool -vc4_use_bo(struct vc4_exec_info *exec, - uint32_t hindex, - enum vc4_bo_mode mode, - struct drm_gem_cma_object **obj) +struct drm_gem_cma_object * +vc4_use_bo(struct vc4_exec_info *exec, uint32_t hindex) { - *obj = NULL; + struct drm_gem_cma_object *obj; + struct drm_vc4_bo *bo; if (hindex >= exec->bo_count) { DRM_ERROR("BO index %d greater than BO count %d\n", hindex, exec->bo_count); - return false; + return NULL; } + obj = exec->bo[hindex]; + bo = to_vc4_bo(&obj->base); - if (exec->bo[hindex].mode != mode) { - if (exec->bo[hindex].mode == VC4_MODE_UNDECIDED) { - exec->bo[hindex].mode = mode; - } else { - DRM_ERROR("BO index %d reused with mode %d vs %d\n", - hindex, exec->bo[hindex].mode, mode); - return false; - } + if (bo->validated_shader) { + DRM_ERROR("Trying to use shader BO as something other than " + "a shader\n"); + return NULL; } - *obj = exec->bo[hindex].bo; - return true; + return obj; +} + +static struct drm_gem_cma_object * +vc4_use_handle(struct vc4_exec_info *exec, uint32_t gem_handles_packet_index) +{ + return vc4_use_bo(exec, exec->bo_index[gem_handles_packet_index]); } static bool -vc4_use_handle(struct vc4_exec_info *exec, - uint32_t gem_handles_packet_index, - enum vc4_bo_mode mode, - struct drm_gem_cma_object **obj) +validate_bin_pos(struct vc4_exec_info *exec, void *untrusted, uint32_t pos) { - return vc4_use_bo(exec, exec->bo_index[gem_handles_packet_index], - mode, obj); + /* Note that the untrusted pointer passed to these functions is + * incremented past the packet byte. 
+ */ + return (untrusted - 1 == exec->bin_u + pos); } static uint32_t @@ -201,14 +201,15 @@ vc4_check_tex_size(struct vc4_exec_info *exec, struct drm_gem_cma_object *fbo, return true; } + static int -validate_flush_all(VALIDATE_ARGS) +validate_flush(VALIDATE_ARGS) { - if (exec->found_increment_semaphore_packet) { - DRM_ERROR("VC4_PACKET_FLUSH_ALL after " - "VC4_PACKET_INCREMENT_SEMAPHORE\n"); + if (!validate_bin_pos(exec, untrusted, exec->args->bin_cl_size - 1)) { + DRM_ERROR("Bin CL must end with VC4_PACKET_FLUSH\n"); return -EINVAL; } + exec->found_flush = true; return 0; } @@ -233,17 +234,13 @@ validate_start_tile_binning(VALIDATE_ARGS) static int validate_increment_semaphore(VALIDATE_ARGS) { - if (exec->found_increment_semaphore_packet) { - DRM_ERROR("Duplicate VC4_PACKET_INCREMENT_SEMAPHORE\n"); + if (!validate_bin_pos(exec, untrusted, exec->args->bin_cl_size - 2)) { + DRM_ERROR("Bin CL must end with " + "VC4_PACKET_INCREMENT_SEMAPHORE\n"); return -EINVAL; } exec->found_increment_semaphore_packet = true; - /* Once we've found the semaphore increment, there should be one FLUSH - * then the end of the command list. The FLUSH actually triggers the - * increment, so we only need to make sure there - */ - return 0; } @@ -257,11 +254,6 @@ validate_indexed_prim_list(VALIDATE_ARGS) uint32_t index_size = (*(uint8_t *)(untrusted + 0) >> 4) ? 2 : 1; struct vc4_shader_state *shader_state; - if (exec->found_increment_semaphore_packet) { - DRM_ERROR("Drawing after VC4_PACKET_INCREMENT_SEMAPHORE\n"); - return -EINVAL; - } - /* Check overflow condition */ if (exec->shader_state_count == 0) { DRM_ERROR("shader state must precede primitives\n"); @@ -272,7 +264,8 @@ validate_indexed_prim_list(VALIDATE_ARGS) if (max_index > shader_state->max_index) shader_state->max_index = max_index; - if (!vc4_use_handle(exec, 0, VC4_MODE_RENDER, &ib)) + ib = vc4_use_handle(exec, 0); + if (!ib) return -EINVAL; if (offset > ib->base.size || @@ -295,11 +288,6 @@ validate_gl_array_primitive(VALIDATE_ARGS) uint32_t max_index; struct vc4_shader_state *shader_state; - if (exec->found_increment_semaphore_packet) { - DRM_ERROR("Drawing after VC4_PACKET_INCREMENT_SEMAPHORE\n"); - return -EINVAL; - } - /* Check overflow condition */ if (exec->shader_state_count == 0) { DRM_ERROR("shader state must precede primitives\n"); @@ -329,7 +317,6 @@ validate_gl_shader_state(VALIDATE_ARGS) return -EINVAL; } - exec->shader_state[i].packet = VC4_PACKET_GL_SHADER_STATE; exec->shader_state[i].addr = *(uint32_t *)untrusted; exec->shader_state[i].max_index = 0; @@ -348,31 +335,6 @@ validate_gl_shader_state(VALIDATE_ARGS) } static int -validate_nv_shader_state(VALIDATE_ARGS) -{ - uint32_t i = exec->shader_state_count++; - - if (i >= exec->shader_state_size) { - DRM_ERROR("More requests for shader states than declared\n"); - return -EINVAL; - } - - exec->shader_state[i].packet = VC4_PACKET_NV_SHADER_STATE; - exec->shader_state[i].addr = *(uint32_t *)untrusted; - - if (exec->shader_state[i].addr & 15) { - DRM_ERROR("NV shader state address 0x%08x misaligned\n", - exec->shader_state[i].addr); - return -EINVAL; - } - - *(uint32_t *)validated = (exec->shader_state[i].addr + - exec->shader_rec_p); - - return 0; -} - -static int validate_tile_binning_config(VALIDATE_ARGS) { struct drm_device *dev = exec->exec_bo->base.dev; @@ -473,8 +435,8 @@ static const struct cmd_info { } cmd_info[] = { VC4_DEFINE_PACKET(VC4_PACKET_HALT, "halt", NULL), VC4_DEFINE_PACKET(VC4_PACKET_NOP, "nop", NULL), - VC4_DEFINE_PACKET(VC4_PACKET_FLUSH, "flush", NULL), - 
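/* Sketch of the new termination rule (names from the patch; the kernel
 * context around vc4_exec_info is assumed): instead of tracking "no draws
 * after the semaphore", the validator pins the two packets to fixed
 * positions.  Since "untrusted" points one byte past the opcode, a packet
 * sits at position pos exactly when untrusted - 1 == bin_u + pos: FLUSH must
 * be the last byte of the bin CL and INCREMENT_SEMAPHORE the byte before it.
 */
static bool bin_cl_terminated(struct vc4_exec_info *exec,
                              void *flush_untrusted, void *sema_untrusted)
{
        uint32_t size = exec->args->bin_cl_size;

        return flush_untrusted - 1 == exec->bin_u + size - 1 &&
               sema_untrusted - 1 == exec->bin_u + size - 2;
}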
VC4_DEFINE_PACKET(VC4_PACKET_FLUSH_ALL, "flush all state", validate_flush_all), + VC4_DEFINE_PACKET(VC4_PACKET_FLUSH, "flush", validate_flush), + VC4_DEFINE_PACKET(VC4_PACKET_FLUSH_ALL, "flush all state", NULL), VC4_DEFINE_PACKET(VC4_PACKET_START_TILE_BINNING, "start tile binning", validate_start_tile_binning), VC4_DEFINE_PACKET(VC4_PACKET_INCREMENT_SEMAPHORE, "increment semaphore", validate_increment_semaphore), @@ -488,7 +450,7 @@ static const struct cmd_info { VC4_DEFINE_PACKET(VC4_PACKET_PRIMITIVE_LIST_FORMAT, "primitive list format", NULL), VC4_DEFINE_PACKET(VC4_PACKET_GL_SHADER_STATE, "GL Shader State", validate_gl_shader_state), - VC4_DEFINE_PACKET(VC4_PACKET_NV_SHADER_STATE, "NV Shader State", validate_nv_shader_state), + /* We don't support validating NV shader states. */ VC4_DEFINE_PACKET(VC4_PACKET_CONFIGURATION_BITS, "configuration bits", NULL), VC4_DEFINE_PACKET(VC4_PACKET_FLAT_SHADE_FLAGS, "flat shade flags", NULL), @@ -525,7 +487,7 @@ vc4_validate_bin_cl(struct drm_device *dev, u8 cmd = *(uint8_t *)src_pkt; const struct cmd_info *info; - if (cmd > ARRAY_SIZE(cmd_info)) { + if (cmd >= ARRAY_SIZE(cmd_info)) { DRM_ERROR("0x%08x: packet %d out of bounds\n", src_offset, cmd); return -EINVAL; @@ -580,8 +542,16 @@ vc4_validate_bin_cl(struct drm_device *dev, return -EINVAL; } - if (!exec->found_increment_semaphore_packet) { - DRM_ERROR("Bin CL missing VC4_PACKET_INCREMENT_SEMAPHORE\n"); + /* The bin CL must be ended with INCREMENT_SEMAPHORE and FLUSH. The + * semaphore is used to trigger the render CL to start up, and the + * FLUSH is what caps the bin lists with + * VC4_PACKET_RETURN_FROM_SUB_LIST (so they jump back to the main + * render CL when they get called to) and actually triggers the queued + * semaphore increment. + */ + if (!exec->found_increment_semaphore_packet || !exec->found_flush) { + DRM_ERROR("Bin CL missing VC4_PACKET_INCREMENT_SEMAPHORE + " + "VC4_PACKET_FLUSH\n"); return -EINVAL; } @@ -612,18 +582,19 @@ reloc_tex(struct vc4_exec_info *exec, uint32_t cube_map_stride = 0; enum vc4_texture_data_type type; - if (!vc4_use_bo(exec, texture_handle_index, VC4_MODE_RENDER, &tex)) + tex = vc4_use_bo(exec, texture_handle_index); + if (!tex) return false; if (sample->is_direct) { uint32_t remaining_size = tex->base.size - p0; if (p0 > tex->base.size - 4) { DRM_ERROR("UBO offset greater than UBO size\n"); - return false; + goto fail; } if (p1 > remaining_size - 4) { DRM_ERROR("UBO clamp would allow reads outside of UBO\n"); - return false; + goto fail; } *validated_p0 = tex->paddr + p0; return true; @@ -642,14 +613,14 @@ reloc_tex(struct vc4_exec_info *exec, VC4_TEX_P2_PTYPE_CUBE_MAP_STRIDE) { if (cube_map_stride) { DRM_ERROR("Cube map stride set twice\n"); - return false; + goto fail; } cube_map_stride = p3 & VC4_TEX_P2_CMST_MASK; } if (!cube_map_stride) { DRM_ERROR("Cube map stride not set\n"); - return false; + goto fail; } } @@ -683,7 +654,7 @@ reloc_tex(struct vc4_exec_info *exec, case VC4_TEXTURE_TYPE_YUV422R: default: DRM_ERROR("Texture format %d unsupported\n", type); - return false; + goto fail; } utile_w = utile_width(cpp); utile_h = utile_height(cpp); @@ -699,7 +670,7 @@ reloc_tex(struct vc4_exec_info *exec, if (!vc4_check_tex_size(exec, tex, offset + cube_map_stride * 5, tiling_format, width, height, cpp)) { - return false; + goto fail; } /* The mipmap levels are stored before the base of the texture. 
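/* The same off-by-one is fixed three times in this patch (cmd_info above,
 * packet_info in vc4_cl_dump.c, vc4_format_table in vc4_formats.c): with N
 * entries the valid indices are 0..N-1, so a `>` bounds check lets index N
 * through to read one past the end.  In miniature (self-contained):
 */
#include <stdio.h>
#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

int main(void)
{
        const char *names[] = { "halt", "nop", "flush" };
        unsigned cmd = 3;                  /* one past the end */

        if (cmd > ARRAY_SIZE(names))       /* old check: 3 > 3 is false, bug */
                return 1;
        if (cmd >= ARRAY_SIZE(names))      /* new check: 3 >= 3 rejects it */
                return 1;
        printf("%s\n", names[cmd]);        /* only reached for valid cmd */
        return 0;
}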
Make @@ -740,7 +711,7 @@ reloc_tex(struct vc4_exec_info *exec, i, level_width, level_height, aligned_width, aligned_height, level_size, offset); - return false; + goto fail; } offset -= level_size; @@ -749,54 +720,37 @@ reloc_tex(struct vc4_exec_info *exec, *validated_p0 = tex->paddr + p0; return true; + fail: + DRM_INFO("Texture p0 at %d: 0x%08x\n", sample->p_offset[0], p0); + DRM_INFO("Texture p1 at %d: 0x%08x\n", sample->p_offset[1], p1); + DRM_INFO("Texture p2 at %d: 0x%08x\n", sample->p_offset[2], p2); + DRM_INFO("Texture p3 at %d: 0x%08x\n", sample->p_offset[3], p3); + return false; } static int -validate_shader_rec(struct drm_device *dev, - struct vc4_exec_info *exec, - struct vc4_shader_state *state) +validate_gl_shader_rec(struct drm_device *dev, + struct vc4_exec_info *exec, + struct vc4_shader_state *state) { uint32_t *src_handles; void *pkt_u, *pkt_v; - enum shader_rec_reloc_type { - RELOC_CODE, - RELOC_VBO, - }; - struct shader_rec_reloc { - enum shader_rec_reloc_type type; - uint32_t offset; - }; - static const struct shader_rec_reloc gl_relocs[] = { - { RELOC_CODE, 4 }, /* fs */ - { RELOC_CODE, 16 }, /* vs */ - { RELOC_CODE, 28 }, /* cs */ + static const uint32_t shader_reloc_offsets[] = { + 4, /* fs */ + 16, /* vs */ + 28, /* cs */ }; - static const struct shader_rec_reloc nv_relocs[] = { - { RELOC_CODE, 4 }, /* fs */ - { RELOC_VBO, 12 } - }; - const struct shader_rec_reloc *relocs; - struct drm_gem_cma_object *bo[ARRAY_SIZE(gl_relocs) + 8]; - uint32_t nr_attributes = 0, nr_fixed_relocs, nr_relocs, packet_size; + uint32_t shader_reloc_count = ARRAY_SIZE(shader_reloc_offsets); + struct drm_gem_cma_object *bo[shader_reloc_count + 8]; + uint32_t nr_attributes, nr_relocs, packet_size; int i; - struct vc4_validated_shader_info *validated_shader = NULL; - - if (state->packet == VC4_PACKET_NV_SHADER_STATE) { - relocs = nv_relocs; - nr_fixed_relocs = ARRAY_SIZE(nv_relocs); - packet_size = 16; - } else { - relocs = gl_relocs; - nr_fixed_relocs = ARRAY_SIZE(gl_relocs); - - nr_attributes = state->addr & 0x7; - if (nr_attributes == 0) - nr_attributes = 8; - packet_size = gl_shader_rec_size(state->addr); - } - nr_relocs = nr_fixed_relocs + nr_attributes; + nr_attributes = state->addr & 0x7; + if (nr_attributes == 0) + nr_attributes = 8; + packet_size = gl_shader_rec_size(state->addr); + nr_relocs = ARRAY_SIZE(shader_reloc_offsets) + nr_attributes; if (nr_relocs * 4 > exec->shader_rec_size) { DRM_ERROR("overflowed shader recs reading %d handles " "from %d bytes left\n", @@ -826,21 +780,30 @@ validate_shader_rec(struct drm_device *dev, exec->shader_rec_v += roundup(packet_size, 16); exec->shader_rec_size -= packet_size; - for (i = 0; i < nr_relocs; i++) { - enum vc4_bo_mode mode; - - if (i < nr_fixed_relocs && relocs[i].type == RELOC_CODE) - mode = VC4_MODE_SHADER; - else - mode = VC4_MODE_RENDER; + if (!(*(uint16_t *)pkt_u & VC4_SHADER_FLAG_FS_SINGLE_THREAD)) { + DRM_ERROR("Multi-threaded fragment shaders not supported.\n"); + return -EINVAL; + } - if (!vc4_use_bo(exec, src_handles[i], mode, &bo[i])) { - return false; + for (i = 0; i < shader_reloc_count; i++) { + if (src_handles[i] > exec->bo_count) { + DRM_ERROR("Shader handle %d too big\n", src_handles[i]); + return -EINVAL; } + + bo[i] = exec->bo[src_handles[i]]; + if (!bo[i]) + return -EINVAL; + } + for (i = shader_reloc_count; i < nr_relocs; i++) { + bo[i] = vc4_use_bo(exec, src_handles[i]); + if (!bo[i]) + return -EINVAL; } - for (i = 0; i < nr_fixed_relocs; i++) { - uint32_t o = relocs[i].offset; + for (i = 0; i < 
shader_reloc_count; i++) { + struct vc4_validated_shader_info *validated_shader; + uint32_t o = shader_reloc_offsets[i]; uint32_t src_offset = *(uint32_t *)(pkt_u + o); uint32_t *texture_handles_u; void *uniform_data_u; @@ -848,58 +811,50 @@ validate_shader_rec(struct drm_device *dev, *(uint32_t *)(pkt_v + o) = bo[i]->paddr + src_offset; - switch (relocs[i].type) { - case RELOC_CODE: - if (src_offset != 0) { - DRM_ERROR("Shaders must be at offset 0 of " - "the BO.\n"); - goto fail; - } + if (src_offset != 0) { + DRM_ERROR("Shaders must be at offset 0 of " + "the BO.\n"); + return -EINVAL; + } - kfree(validated_shader); - validated_shader = vc4_validate_shader(bo[i]); - if (!validated_shader) - goto fail; + validated_shader = to_vc4_bo(&bo[i]->base)->validated_shader; + if (!validated_shader) + return -EINVAL; - if (validated_shader->uniforms_src_size > - exec->uniforms_size) { - DRM_ERROR("Uniforms src buffer overflow\n"); - goto fail; - } + if (validated_shader->uniforms_src_size > + exec->uniforms_size) { + DRM_ERROR("Uniforms src buffer overflow\n"); + return -EINVAL; + } - texture_handles_u = exec->uniforms_u; - uniform_data_u = (texture_handles_u + - validated_shader->num_texture_samples); - - memcpy(exec->uniforms_v, uniform_data_u, - validated_shader->uniforms_size); - - for (tex = 0; - tex < validated_shader->num_texture_samples; - tex++) { - if (!reloc_tex(exec, - uniform_data_u, - &validated_shader->texture_samples[tex], - texture_handles_u[tex])) { - goto fail; - } - } + texture_handles_u = exec->uniforms_u; + uniform_data_u = (texture_handles_u + + validated_shader->num_texture_samples); - *(uint32_t *)(pkt_v + o + 4) = exec->uniforms_p; + memcpy(exec->uniforms_v, uniform_data_u, + validated_shader->uniforms_size); - exec->uniforms_u += validated_shader->uniforms_src_size; - exec->uniforms_v += validated_shader->uniforms_size; - exec->uniforms_p += validated_shader->uniforms_size; + for (tex = 0; + tex < validated_shader->num_texture_samples; + tex++) { + if (!reloc_tex(exec, + uniform_data_u, + &validated_shader->texture_samples[tex], + texture_handles_u[tex])) { + return -EINVAL; + } + } - break; + *(uint32_t *)(pkt_v + o + 4) = exec->uniforms_p; - case RELOC_VBO: - break; - } + exec->uniforms_u += validated_shader->uniforms_src_size; + exec->uniforms_v += validated_shader->uniforms_size; + exec->uniforms_p += validated_shader->uniforms_size; } for (i = 0; i < nr_attributes; i++) { - struct drm_gem_cma_object *vbo = bo[nr_fixed_relocs + i]; + struct drm_gem_cma_object *vbo = + bo[ARRAY_SIZE(shader_reloc_offsets) + i]; uint32_t o = 36 + i * 8; uint32_t offset = *(uint32_t *)(pkt_u + o + 0); uint32_t attr_size = *(uint8_t *)(pkt_u + o + 4) + 1; @@ -929,13 +884,7 @@ validate_shader_rec(struct drm_device *dev, *(uint32_t *)(pkt_v + o) = vbo->paddr + offset; } - kfree(validated_shader); - return 0; - -fail: - kfree(validated_shader); - return -EINVAL; } int @@ -946,7 +895,7 @@ vc4_validate_shader_recs(struct drm_device *dev, int ret = 0; for (i = 0; i < exec->shader_state_count; i++) { - ret = validate_shader_rec(dev, exec, &exec->shader_state[i]); + ret = validate_gl_shader_rec(dev, exec, &exec->shader_state[i]); if (ret) return ret; } diff --git a/src/gallium/drivers/vc4/vc4_blit.c b/src/gallium/drivers/vc4/vc4_blit.c index d29e2c9c318..e52a1941730 100644 --- a/src/gallium/drivers/vc4/vc4_blit.c +++ b/src/gallium/drivers/vc4/vc4_blit.c @@ -94,7 +94,7 @@ vc4_render_blit(struct pipe_context *ctx, struct pipe_blit_info *info) struct vc4_context *vc4 = vc4_context(ctx); if 
(!util_blitter_is_blit_supported(vc4->blitter, info)) { - fprintf(stderr, "blit unsupported %s -> %s", + fprintf(stderr, "blit unsupported %s -> %s\n", util_format_short_name(info->src.resource->format), util_format_short_name(info->dst.resource->format)); return false; @@ -135,7 +135,7 @@ vc4_blit(struct pipe_context *pctx, const struct pipe_blit_info *blit_info) info.dst.resource->nr_samples <= 1 && !util_format_is_depth_or_stencil(info.src.resource->format) && !util_format_is_pure_integer(info.src.resource->format)) { - fprintf(stderr, "color resolve unimplemented"); + fprintf(stderr, "color resolve unimplemented\n"); return; } @@ -147,7 +147,7 @@ vc4_blit(struct pipe_context *pctx, const struct pipe_blit_info *blit_info) } if (info.mask & PIPE_MASK_S) { - fprintf(stderr, "cannot blit stencil, skipping"); + fprintf(stderr, "cannot blit stencil, skipping\n"); info.mask &= ~PIPE_MASK_S; } diff --git a/src/gallium/drivers/vc4/vc4_bufmgr.c b/src/gallium/drivers/vc4/vc4_bufmgr.c index cbdb9e89cf6..f7b41f5816d 100644 --- a/src/gallium/drivers/vc4/vc4_bufmgr.c +++ b/src/gallium/drivers/vc4/vc4_bufmgr.c @@ -1,5 +1,5 @@ /* - * Copyright © 2014 Broadcom + * Copyright © 2014-2015 Broadcom * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -94,7 +94,7 @@ vc4_bo_from_cache(struct vc4_screen *screen, uint32_t size, const char *name) * allocate something new instead, since we assume that the * user will proceed to CPU map it and fill it with stuff. */ - if (!vc4_bo_wait(bo, 0)) { + if (!vc4_bo_wait(bo, 0, NULL)) { pipe_mutex_unlock(cache->lock); return NULL; } @@ -381,15 +381,57 @@ vc4_bo_get_dmabuf(struct vc4_bo *bo) } struct vc4_bo * -vc4_bo_alloc_mem(struct vc4_screen *screen, const void *data, uint32_t size, - const char *name) +vc4_bo_alloc_shader(struct vc4_screen *screen, const void *data, uint32_t size) { - void *map; struct vc4_bo *bo; + int ret; + + bo = CALLOC_STRUCT(vc4_bo); + if (!bo) + return NULL; + + pipe_reference_init(&bo->reference, 1); + bo->screen = screen; + bo->size = align(size, 4096); + bo->name = "code"; + bo->private = false; /* Make sure it doesn't go back to the cache. 
*/ + + if (!using_vc4_simulator) { + struct drm_vc4_create_shader_bo create = { + .size = size, + .data = (uintptr_t)data, + }; + + ret = drmIoctl(screen->fd, DRM_IOCTL_VC4_CREATE_SHADER_BO, + &create); + bo->handle = create.handle; + } else { + struct drm_mode_create_dumb create; + memset(&create, 0, sizeof(create)); + + create.width = 128; + create.bpp = 8; + create.height = (size + 127) / 128; + + ret = drmIoctl(screen->fd, DRM_IOCTL_MODE_CREATE_DUMB, &create); + bo->handle = create.handle; + assert(create.size >= size); + + vc4_bo_map(bo); + memcpy(bo->map, data, size); + } + if (ret != 0) { + fprintf(stderr, "create shader ioctl failure\n"); + abort(); + } + + screen->bo_count++; + screen->bo_size += bo->size; + if (dump_stats) { + fprintf(stderr, "Allocated shader %dkb:\n", size / 1024); + vc4_bo_dump_stats(screen); + } - bo = vc4_bo_alloc(screen, size, name); - map = vc4_bo_map(bo); - memcpy(map, data, size); return bo; } @@ -413,63 +455,91 @@ vc4_bo_flink(struct vc4_bo *bo, uint32_t *name) return true; } +static int vc4_wait_seqno_ioctl(int fd, uint64_t seqno, uint64_t timeout_ns) +{ + if (using_vc4_simulator) + return 0; + + struct drm_vc4_wait_seqno wait = { + .seqno = seqno, + .timeout_ns = timeout_ns, + }; + int ret = drmIoctl(fd, DRM_IOCTL_VC4_WAIT_SEQNO, &wait); + if (ret == -1) + return -errno; + else + return 0; + +} + bool -vc4_wait_seqno(struct vc4_screen *screen, uint64_t seqno, uint64_t timeout_ns) +vc4_wait_seqno(struct vc4_screen *screen, uint64_t seqno, uint64_t timeout_ns, + const char *reason) { if (screen->finished_seqno >= seqno) return true; - struct drm_vc4_wait_seqno wait; - memset(&wait, 0, sizeof(wait)); - wait.seqno = seqno; - wait.timeout_ns = timeout_ns; - - int ret; - if (!using_vc4_simulator) - ret = drmIoctl(screen->fd, DRM_IOCTL_VC4_WAIT_SEQNO, &wait); - else { - wait.seqno = screen->finished_seqno; - ret = 0; + if (unlikely(vc4_debug & VC4_DEBUG_PERF) && timeout_ns && reason) { + if (vc4_wait_seqno_ioctl(screen->fd, seqno, 0) == -ETIME) { + fprintf(stderr, "Blocking on seqno %lld for %s\n", + (long long)seqno, reason); + } } - if (ret == 0) { - screen->finished_seqno = wait.seqno; - return true; - } + int ret = vc4_wait_seqno_ioctl(screen->fd, seqno, timeout_ns); + if (ret) { + if (ret != -ETIME) { + fprintf(stderr, "wait failed: %d\n", ret); + abort(); + } - if (errno != ETIME) { - fprintf(stderr, "wait failed: %d\n", ret); - abort(); + return false; } - return false; + screen->finished_seqno = seqno; + return true; +} + +static int vc4_wait_bo_ioctl(int fd, uint32_t handle, uint64_t timeout_ns) +{ + if (using_vc4_simulator) + return 0; + + struct drm_vc4_wait_bo wait = { + .handle = handle, + .timeout_ns = timeout_ns, + }; + int ret = drmIoctl(fd, DRM_IOCTL_VC4_WAIT_BO, &wait); + if (ret == -1) + return -errno; + else + return 0; + } bool -vc4_bo_wait(struct vc4_bo *bo, uint64_t timeout_ns) +vc4_bo_wait(struct vc4_bo *bo, uint64_t timeout_ns, const char *reason) { struct vc4_screen *screen = bo->screen; - struct drm_vc4_wait_bo wait; - memset(&wait, 0, sizeof(wait)); - wait.handle = bo->handle; - wait.timeout_ns = timeout_ns; - - int ret; - if (!using_vc4_simulator) - ret = drmIoctl(screen->fd, DRM_IOCTL_VC4_WAIT_BO, &wait); - else - ret = 0; + if (unlikely(vc4_debug & VC4_DEBUG_PERF) && timeout_ns && reason) { + if (vc4_wait_bo_ioctl(screen->fd, bo->handle, 0) == -ETIME) { + fprintf(stderr, "Blocking on %s BO for %s\n", + bo->name, reason); + } + } - if (ret == 0) - return true; + int ret = vc4_wait_bo_ioctl(screen->fd, bo->handle, timeout_ns); + 
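/* The wrapper convention above in isolation (a sketch; struct and ioctl
 * names here are placeholders): drmIoctl() returns -1 and sets errno on
 * failure, and normalizing that to 0-or-negative-errno lets callers probe
 * with a zero timeout, which is how the VC4_DEBUG_PERF "Blocking on ..."
 * messages detect a stall without actually waiting.
 */
#include <errno.h>
#include <stdint.h>
#include <xf86drm.h>                    /* drmIoctl() from libdrm */

struct hypothetical_wait { uint64_t timeout_ns; };

static int wait_ioctl(int fd, unsigned long req, uint64_t timeout_ns)
{
        struct hypothetical_wait wait = { .timeout_ns = timeout_ns };

        if (drmIoctl(fd, req, &wait) == -1)
                return -errno;          /* -ETIME when the wait timed out */
        return 0;
}
/* wait_ioctl(fd, req, 0) == -ETIME  =>  a real wait would have blocked. */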
if (ret) { + if (ret != -ETIME) { + fprintf(stderr, "wait failed: %d\n", ret); + abort(); + } - if (errno != ETIME) { - fprintf(stderr, "wait failed: %d\n", ret); - abort(); + return false; } - return false; + return true; } void * @@ -515,7 +585,7 @@ vc4_bo_map(struct vc4_bo *bo) { void *map = vc4_bo_map_unsynchronized(bo); - bool ok = vc4_bo_wait(bo, PIPE_TIMEOUT_INFINITE); + bool ok = vc4_bo_wait(bo, PIPE_TIMEOUT_INFINITE, "bo map"); if (!ok) { fprintf(stderr, "BO wait for map failed\n"); abort(); diff --git a/src/gallium/drivers/vc4/vc4_bufmgr.h b/src/gallium/drivers/vc4/vc4_bufmgr.h index 7320695ca8e..b77506e242a 100644 --- a/src/gallium/drivers/vc4/vc4_bufmgr.h +++ b/src/gallium/drivers/vc4/vc4_bufmgr.h @@ -58,8 +58,8 @@ struct vc4_bo { struct vc4_bo *vc4_bo_alloc(struct vc4_screen *screen, uint32_t size, const char *name); -struct vc4_bo *vc4_bo_alloc_mem(struct vc4_screen *screen, const void *data, - uint32_t size, const char *name); +struct vc4_bo *vc4_bo_alloc_shader(struct vc4_screen *screen, const void *data, + uint32_t size); void vc4_bo_last_unreference(struct vc4_bo *bo); void vc4_bo_last_unreference_locked_timed(struct vc4_bo *bo, time_t time); struct vc4_bo *vc4_bo_open_name(struct vc4_screen *screen, uint32_t name, @@ -113,10 +113,11 @@ void * vc4_bo_map_unsynchronized(struct vc4_bo *bo); bool -vc4_bo_wait(struct vc4_bo *bo, uint64_t timeout_ns); +vc4_bo_wait(struct vc4_bo *bo, uint64_t timeout_ns, const char *reason); bool -vc4_wait_seqno(struct vc4_screen *screen, uint64_t seqno, uint64_t timeout_ns); +vc4_wait_seqno(struct vc4_screen *screen, uint64_t seqno, uint64_t timeout_ns, + const char *reason); void vc4_bufmgr_destroy(struct pipe_screen *pscreen); diff --git a/src/gallium/drivers/vc4/vc4_cl.c b/src/gallium/drivers/vc4/vc4_cl.c index 0700e885cbf..ced4f2dfa86 100644 --- a/src/gallium/drivers/vc4/vc4_cl.c +++ b/src/gallium/drivers/vc4/vc4_cl.c @@ -36,11 +36,12 @@ vc4_init_cl(struct vc4_context *vc4, struct vc4_cl *cl) void cl_ensure_space(struct vc4_cl *cl, uint32_t space) { - if ((cl->next - cl->base) + space <= cl->size) + uint32_t offset = cl_offset(cl); + + if (offset + space <= cl->size) return; uint32_t size = MAX2(cl->size + space, cl->size * 2); - uint32_t offset = cl->next -cl->base; cl->base = reralloc(ralloc_parent(cl->base), cl->base, uint8_t, size); cl->size = size; @@ -60,15 +61,20 @@ vc4_gem_hindex(struct vc4_context *vc4, struct vc4_bo *bo) uint32_t hindex; uint32_t *current_handles = vc4->bo_handles.base; - for (hindex = 0; - hindex < (vc4->bo_handles.next - vc4->bo_handles.base) / 4; - hindex++) { + for (hindex = 0; hindex < cl_offset(&vc4->bo_handles) / 4; hindex++) { if (current_handles[hindex] == bo->handle) return hindex; } - cl_u32(&vc4->bo_handles, bo->handle); - cl_ptr(&vc4->bo_pointers, vc4_bo_reference(bo)); + struct vc4_cl_out *out; + + out = cl_start(&vc4->bo_handles); + cl_u32(&out, bo->handle); + cl_end(&vc4->bo_handles, out); + + out = cl_start(&vc4->bo_pointers); + cl_ptr(&out, vc4_bo_reference(bo)); + cl_end(&vc4->bo_pointers, out); return hindex; } diff --git a/src/gallium/drivers/vc4/vc4_cl.h b/src/gallium/drivers/vc4/vc4_cl.h index 4a50e790942..bf4be0efc29 100644 --- a/src/gallium/drivers/vc4/vc4_cl.h +++ b/src/gallium/drivers/vc4/vc4_cl.h @@ -33,12 +33,20 @@ struct vc4_bo; +/** + * Undefined structure, used for typechecking that you're passing the pointers + * to these functions correctly. 
+ */ +struct vc4_cl_out; + struct vc4_cl { void *base; - void *next; + struct vc4_cl_out *next; + struct vc4_cl_out *reloc_next; uint32_t size; - uint32_t reloc_next; +#ifdef DEBUG uint32_t reloc_count; +#endif }; void vc4_init_cl(struct vc4_context *vc4, struct vc4_cl *cl); @@ -49,135 +57,149 @@ uint32_t vc4_gem_hindex(struct vc4_context *vc4, struct vc4_bo *bo); struct PACKED unaligned_16 { uint16_t x; }; struct PACKED unaligned_32 { uint32_t x; }; -static inline void -put_unaligned_32(void *ptr, uint32_t val) +static inline uint32_t cl_offset(struct vc4_cl *cl) { - struct unaligned_32 *p = ptr; - p->x = val; + return (char *)cl->next - (char *)cl->base; } static inline void -put_unaligned_16(void *ptr, uint16_t val) +cl_advance(struct vc4_cl_out **cl, uint32_t n) { - struct unaligned_16 *p = ptr; - p->x = val; + (*cl) = (struct vc4_cl_out *)((char *)(*cl) + n); } -static inline void -cl_u8(struct vc4_cl *cl, uint8_t n) +static inline struct vc4_cl_out * +cl_start(struct vc4_cl *cl) { - assert((cl->next - cl->base) + 1 <= cl->size); - - *(uint8_t *)cl->next = n; - cl->next++; + return cl->next; } static inline void -cl_u16(struct vc4_cl *cl, uint16_t n) +cl_end(struct vc4_cl *cl, struct vc4_cl_out *next) { - assert((cl->next - cl->base) + 2 <= cl->size); + cl->next = next; + assert(cl_offset(cl) <= cl->size); +} - put_unaligned_16(cl->next, n); - cl->next += 2; + +static inline void +put_unaligned_32(struct vc4_cl_out *ptr, uint32_t val) +{ + struct unaligned_32 *p = (void *)ptr; + p->x = val; } static inline void -cl_u32(struct vc4_cl *cl, uint32_t n) +put_unaligned_16(struct vc4_cl_out *ptr, uint16_t val) { - assert((cl->next - cl->base) + 4 <= cl->size); + struct unaligned_16 *p = (void *)ptr; + p->x = val; +} - put_unaligned_32(cl->next, n); - cl->next += 4; +static inline void +cl_u8(struct vc4_cl_out **cl, uint8_t n) +{ + *(uint8_t *)(*cl) = n; + cl_advance(cl, 1); } static inline void -cl_aligned_u32(struct vc4_cl *cl, uint32_t n) +cl_u16(struct vc4_cl_out **cl, uint16_t n) { - assert((cl->next - cl->base) + 4 <= cl->size); + put_unaligned_16(*cl, n); + cl_advance(cl, 2); +} - *(uint32_t *)cl->next = n; - cl->next += 4; +static inline void +cl_u32(struct vc4_cl_out **cl, uint32_t n) +{ + put_unaligned_32(*cl, n); + cl_advance(cl, 4); } static inline void -cl_ptr(struct vc4_cl *cl, void *ptr) +cl_aligned_u32(struct vc4_cl_out **cl, uint32_t n) { - assert((cl->next - cl->base) + sizeof(void *) <= cl->size); + *(uint32_t *)(*cl) = n; + cl_advance(cl, 4); +} - *(void **)cl->next = ptr; - cl->next += sizeof(void *); +static inline void +cl_ptr(struct vc4_cl_out **cl, void *ptr) +{ + *(struct vc4_cl_out **)(*cl) = ptr; + cl_advance(cl, sizeof(void *)); } static inline void -cl_f(struct vc4_cl *cl, float f) +cl_f(struct vc4_cl_out **cl, float f) { cl_u32(cl, fui(f)); } static inline void -cl_aligned_f(struct vc4_cl *cl, float f) +cl_aligned_f(struct vc4_cl_out **cl, float f) { cl_aligned_u32(cl, fui(f)); } static inline void -cl_start_reloc(struct vc4_cl *cl, uint32_t n) +cl_start_reloc(struct vc4_cl *cl, struct vc4_cl_out **out, uint32_t n) { assert(n == 1 || n == 2); +#ifdef DEBUG assert(cl->reloc_count == 0); cl->reloc_count = n; +#endif - cl_u8(cl, VC4_PACKET_GEM_HANDLES); - cl->reloc_next = cl->next - cl->base; - cl_u32(cl, 0); /* Space where hindex will be written. */ - cl_u32(cl, 0); /* Space where hindex will be written. */ + cl_u8(out, VC4_PACKET_GEM_HANDLES); + cl->reloc_next = *out; + cl_u32(out, 0); /* Space where hindex will be written. 
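/* Usage sketch of the reworked emit path (mirrors vc4_emit_state later in
 * this patch; assumes cl_ensure_space() was already called for the bytes
 * emitted): callers copy the write pointer into a local cursor with
 * cl_start(), emit through it, and commit with cl_end(), which also
 * bounds-checks.  Keeping the cursor in a local helps the compiler hold it
 * in a register across emits, and because struct vc4_cl_out is never
 * defined, passing the wrong pointer type to these helpers is a compile
 * error rather than silent corruption.
 */
static void emit_point_size(struct vc4_context *vc4, float size)
{
        struct vc4_cl_out *bcl = cl_start(&vc4->bcl);   /* grab cursor */

        cl_u8(&bcl, VC4_PACKET_POINT_SIZE);
        cl_f(&bcl, size);
        cl_end(&vc4->bcl, bcl);       /* write cursor back, assert bounds */
}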
*/ + cl_u32(out, 0); /* Space where hindex will be written. */ } -static inline void +static inline struct vc4_cl_out * cl_start_shader_reloc(struct vc4_cl *cl, uint32_t n) { +#ifdef DEBUG assert(cl->reloc_count == 0); cl->reloc_count = n; - cl->reloc_next = cl->next - cl->base; +#endif + cl->reloc_next = cl->next; + + /* Reserve the space where hindex will be written. */ + cl_advance(&cl->next, n * 4); - /* Space where hindex will be written. */ - cl->next += n * 4; + return cl->next; } static inline void -cl_reloc_hindex(struct vc4_cl *cl, uint32_t hindex, uint32_t offset) +cl_reloc(struct vc4_context *vc4, struct vc4_cl *cl, struct vc4_cl_out **cl_out, + struct vc4_bo *bo, uint32_t offset) { - *(uint32_t *)(cl->base + cl->reloc_next) = hindex; - cl->reloc_next += 4; + *(uint32_t *)cl->reloc_next = vc4_gem_hindex(vc4, bo); + cl_advance(&cl->reloc_next, 4); +#ifdef DEBUG cl->reloc_count--; +#endif - cl_u32(cl, offset); + cl_u32(cl_out, offset); } static inline void -cl_aligned_reloc_hindex(struct vc4_cl *cl, uint32_t hindex, uint32_t offset) +cl_aligned_reloc(struct vc4_context *vc4, struct vc4_cl *cl, + struct vc4_cl_out **cl_out, + struct vc4_bo *bo, uint32_t offset) { - *(uint32_t *)(cl->base + cl->reloc_next) = hindex; - cl->reloc_next += 4; + *(uint32_t *)cl->reloc_next = vc4_gem_hindex(vc4, bo); + cl_advance(&cl->reloc_next, 4); +#ifdef DEBUG cl->reloc_count--; +#endif - cl_aligned_u32(cl, offset); -} - -static inline void -cl_reloc(struct vc4_context *vc4, struct vc4_cl *cl, - struct vc4_bo *bo, uint32_t offset) -{ - cl_reloc_hindex(cl, vc4_gem_hindex(vc4, bo), offset); -} - -static inline void -cl_aligned_reloc(struct vc4_context *vc4, struct vc4_cl *cl, - struct vc4_bo *bo, uint32_t offset) -{ - cl_aligned_reloc_hindex(cl, vc4_gem_hindex(vc4, bo), offset); + cl_aligned_u32(cl_out, offset); } void cl_ensure_space(struct vc4_cl *cl, uint32_t size); diff --git a/src/gallium/drivers/vc4/vc4_cl_dump.c b/src/gallium/drivers/vc4/vc4_cl_dump.c index 69055081daa..6d748010baf 100644 --- a/src/gallium/drivers/vc4/vc4_cl_dump.c +++ b/src/gallium/drivers/vc4/vc4_cl_dump.c @@ -34,7 +34,7 @@ dump_float(void *cl, uint32_t offset, uint32_t hw_offset) void *f = cl + offset; fprintf(stderr, "0x%08x 0x%08x: %f (0x%08x)\n", - offset, hw_offset, *(float *)f, *(uint32_t *)f); + offset, hw_offset, uif(*(uint32_t *)f), *(uint32_t *)f); } static void @@ -47,7 +47,33 @@ dump_VC4_PACKET_BRANCH_TO_SUB_LIST(void *cl, uint32_t offset, uint32_t hw_offset } static void -dump_VC4_PACKET_STORE_TILE_BUFFER_GENERAL(void *cl, uint32_t offset, uint32_t hw_offset) +dump_loadstore_full(void *cl, uint32_t offset, uint32_t hw_offset) +{ + uint32_t bits = *(uint32_t *)(cl + offset); + + fprintf(stderr, "0x%08x 0x%08x: addr 0x%08x%s%s%s%s\n", + offset, hw_offset, + bits & ~0xf, + (bits & VC4_LOADSTORE_FULL_RES_DISABLE_CLEAR_ALL) ? "" : " clear", + (bits & VC4_LOADSTORE_FULL_RES_DISABLE_ZS) ? "" : " zs", + (bits & VC4_LOADSTORE_FULL_RES_DISABLE_COLOR) ? "" : " color", + (bits & VC4_LOADSTORE_FULL_RES_EOF) ? 
" eof" : ""); +} + +static void +dump_VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER(void *cl, uint32_t offset, uint32_t hw_offset) +{ + dump_loadstore_full(cl, offset, hw_offset); +} + +static void +dump_VC4_PACKET_STORE_FULL_RES_TILE_BUFFER(void *cl, uint32_t offset, uint32_t hw_offset) +{ + dump_loadstore_full(cl, offset, hw_offset); +} + +static void +dump_loadstore_general(void *cl, uint32_t offset, uint32_t hw_offset) { uint8_t *bytes = cl + offset; uint32_t *addr = cl + offset + 2; @@ -125,6 +151,18 @@ dump_VC4_PACKET_STORE_TILE_BUFFER_GENERAL(void *cl, uint32_t offset, uint32_t hw } static void +dump_VC4_PACKET_STORE_TILE_BUFFER_GENERAL(void *cl, uint32_t offset, uint32_t hw_offset) +{ + dump_loadstore_general(cl, offset, hw_offset); +} + +static void +dump_VC4_PACKET_LOAD_TILE_BUFFER_GENERAL(void *cl, uint32_t offset, uint32_t hw_offset) +{ + dump_loadstore_general(cl, offset, hw_offset); +} + +static void dump_VC4_PACKET_FLAT_SHADE_FLAGS(void *cl, uint32_t offset, uint32_t hw_offset) { uint32_t *bits = cl + offset; @@ -291,63 +329,63 @@ dump_VC4_PACKET_GEM_HANDLES(void *cl, uint32_t offset, uint32_t hw_offset) offset, hw_offset, handles[0], handles[1]); } -#define PACKET_DUMP(name, size) [name] = { #name, size, dump_##name } -#define PACKET(name, size) [name] = { #name, size, NULL } +#define PACKET_DUMP(name) [name] = { #name, name ## _SIZE, dump_##name } +#define PACKET(name) [name] = { #name, name ## _SIZE, NULL } static const struct packet_info { const char *name; uint8_t size; void (*dump_func)(void *cl, uint32_t offset, uint32_t hw_offset); } packet_info[] = { - PACKET(VC4_PACKET_HALT, 1), - PACKET(VC4_PACKET_NOP, 1), - - PACKET(VC4_PACKET_FLUSH, 1), - PACKET(VC4_PACKET_FLUSH_ALL, 1), - PACKET(VC4_PACKET_START_TILE_BINNING, 1), - PACKET(VC4_PACKET_INCREMENT_SEMAPHORE, 1), - PACKET(VC4_PACKET_WAIT_ON_SEMAPHORE, 1), - - PACKET(VC4_PACKET_BRANCH, 5), - PACKET_DUMP(VC4_PACKET_BRANCH_TO_SUB_LIST, 5), - - PACKET(VC4_PACKET_STORE_MS_TILE_BUFFER, 1), - PACKET(VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF, 1), - PACKET(VC4_PACKET_STORE_FULL_RES_TILE_BUFFER, 5), - PACKET(VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER, 5), - PACKET_DUMP(VC4_PACKET_STORE_TILE_BUFFER_GENERAL, 7), - PACKET(VC4_PACKET_LOAD_TILE_BUFFER_GENERAL, 7), - - PACKET(VC4_PACKET_GL_INDEXED_PRIMITIVE, 14), - PACKET(VC4_PACKET_GL_ARRAY_PRIMITIVE, 10), - - PACKET(VC4_PACKET_COMPRESSED_PRIMITIVE, 48), - PACKET(VC4_PACKET_CLIPPED_COMPRESSED_PRIMITIVE, 49), - - PACKET(VC4_PACKET_PRIMITIVE_LIST_FORMAT, 2), - - PACKET(VC4_PACKET_GL_SHADER_STATE, 5), - PACKET(VC4_PACKET_NV_SHADER_STATE, 5), - PACKET(VC4_PACKET_VG_SHADER_STATE, 5), - - PACKET(VC4_PACKET_CONFIGURATION_BITS, 4), - PACKET_DUMP(VC4_PACKET_FLAT_SHADE_FLAGS, 5), - PACKET_DUMP(VC4_PACKET_POINT_SIZE, 5), - PACKET_DUMP(VC4_PACKET_LINE_WIDTH, 5), - PACKET(VC4_PACKET_RHT_X_BOUNDARY, 3), - PACKET(VC4_PACKET_DEPTH_OFFSET, 5), - PACKET(VC4_PACKET_CLIP_WINDOW, 9), - PACKET_DUMP(VC4_PACKET_VIEWPORT_OFFSET, 5), - PACKET(VC4_PACKET_Z_CLIPPING, 9), - PACKET_DUMP(VC4_PACKET_CLIPPER_XY_SCALING, 9), - PACKET_DUMP(VC4_PACKET_CLIPPER_Z_SCALING, 9), - - PACKET_DUMP(VC4_PACKET_TILE_BINNING_MODE_CONFIG, 16), - PACKET_DUMP(VC4_PACKET_TILE_RENDERING_MODE_CONFIG, 11), - PACKET(VC4_PACKET_CLEAR_COLORS, 14), - PACKET_DUMP(VC4_PACKET_TILE_COORDINATES, 3), - - PACKET_DUMP(VC4_PACKET_GEM_HANDLES, 9), + PACKET(VC4_PACKET_HALT), + PACKET(VC4_PACKET_NOP), + + PACKET(VC4_PACKET_FLUSH), + PACKET(VC4_PACKET_FLUSH_ALL), + PACKET(VC4_PACKET_START_TILE_BINNING), + PACKET(VC4_PACKET_INCREMENT_SEMAPHORE), + 
PACKET(VC4_PACKET_WAIT_ON_SEMAPHORE), + + PACKET(VC4_PACKET_BRANCH), + PACKET_DUMP(VC4_PACKET_BRANCH_TO_SUB_LIST), + + PACKET(VC4_PACKET_STORE_MS_TILE_BUFFER), + PACKET(VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF), + PACKET_DUMP(VC4_PACKET_STORE_FULL_RES_TILE_BUFFER), + PACKET_DUMP(VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER), + PACKET_DUMP(VC4_PACKET_STORE_TILE_BUFFER_GENERAL), + PACKET_DUMP(VC4_PACKET_LOAD_TILE_BUFFER_GENERAL), + + PACKET(VC4_PACKET_GL_INDEXED_PRIMITIVE), + PACKET(VC4_PACKET_GL_ARRAY_PRIMITIVE), + + PACKET(VC4_PACKET_COMPRESSED_PRIMITIVE), + PACKET(VC4_PACKET_CLIPPED_COMPRESSED_PRIMITIVE), + + PACKET(VC4_PACKET_PRIMITIVE_LIST_FORMAT), + + PACKET(VC4_PACKET_GL_SHADER_STATE), + PACKET(VC4_PACKET_NV_SHADER_STATE), + PACKET(VC4_PACKET_VG_SHADER_STATE), + + PACKET(VC4_PACKET_CONFIGURATION_BITS), + PACKET_DUMP(VC4_PACKET_FLAT_SHADE_FLAGS), + PACKET_DUMP(VC4_PACKET_POINT_SIZE), + PACKET_DUMP(VC4_PACKET_LINE_WIDTH), + PACKET(VC4_PACKET_RHT_X_BOUNDARY), + PACKET(VC4_PACKET_DEPTH_OFFSET), + PACKET(VC4_PACKET_CLIP_WINDOW), + PACKET_DUMP(VC4_PACKET_VIEWPORT_OFFSET), + PACKET(VC4_PACKET_Z_CLIPPING), + PACKET_DUMP(VC4_PACKET_CLIPPER_XY_SCALING), + PACKET_DUMP(VC4_PACKET_CLIPPER_Z_SCALING), + + PACKET_DUMP(VC4_PACKET_TILE_BINNING_MODE_CONFIG), + PACKET_DUMP(VC4_PACKET_TILE_RENDERING_MODE_CONFIG), + PACKET(VC4_PACKET_CLEAR_COLORS), + PACKET_DUMP(VC4_PACKET_TILE_COORDINATES), + + PACKET_DUMP(VC4_PACKET_GEM_HANDLES), }; void @@ -359,7 +397,7 @@ vc4_dump_cl(void *cl, uint32_t size, bool is_render) while (offset < size) { uint8_t header = cmds[offset]; - if (header > ARRAY_SIZE(packet_info) || + if (header >= ARRAY_SIZE(packet_info) || !packet_info[header].name) { fprintf(stderr, "0x%08x 0x%08x: Unknown packet 0x%02x (%d)!\n", offset, hw_offset, header, header); diff --git a/src/gallium/drivers/vc4/vc4_context.c b/src/gallium/drivers/vc4/vc4_context.c index 630f8e68896..fff63158c9d 100644 --- a/src/gallium/drivers/vc4/vc4_context.c +++ b/src/gallium/drivers/vc4/vc4_context.c @@ -61,9 +61,11 @@ vc4_flush(struct pipe_context *pctx) * FLUSH completes. */ cl_ensure_space(&vc4->bcl, 8); - cl_u8(&vc4->bcl, VC4_PACKET_INCREMENT_SEMAPHORE); + struct vc4_cl_out *bcl = cl_start(&vc4->bcl); + cl_u8(&bcl, VC4_PACKET_INCREMENT_SEMAPHORE); /* The FLUSH caps all of our bin lists with a VC4_PACKET_RETURN. */ - cl_u8(&vc4->bcl, VC4_PACKET_FLUSH); + cl_u8(&bcl, VC4_PACKET_FLUSH); + cl_end(&vc4->bcl, bcl); if (cbuf && (vc4->resolve & PIPE_CLEAR_COLOR0)) { pipe_surface_reference(&vc4->color_write, cbuf); @@ -103,8 +105,10 @@ vc4_pipe_flush(struct pipe_context *pctx, struct pipe_fence_handle **fence, vc4_flush(pctx); if (fence) { + struct pipe_screen *screen = pctx->screen; struct vc4_fence *f = vc4_fence_create(vc4->screen, vc4->last_emit_seqno); + screen->fence_reference(screen, fence, NULL); *fence = (struct pipe_fence_handle *)f; } } @@ -126,8 +130,7 @@ vc4_cl_references_bo(struct pipe_context *pctx, struct vc4_bo *bo) * they match. 
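/* What the PACKET()/PACKET_DUMP() rework above buys (an illustrative,
 * self-contained miniature): token pasting derives each table entry's size
 * from the packet's _SIZE define, so the dump table can no longer drift out
 * of sync with vc4_packet.h.
 */
#include <stdio.h>

#define THING_A       3
#define THING_A_SIZE  5
#define ENTRY(name)   [name] = { #name, name ## _SIZE }

static const struct { const char *name; int size; } table[] = {
        ENTRY(THING_A),         /* expands to [3] = { "THING_A", 5 } */
};

int main(void)
{
        printf("%s: %d bytes\n", table[THING_A].name, table[THING_A].size);
        return 0;
}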
*/ struct vc4_bo **referenced_bos = vc4->bo_pointers.base; - for (int i = 0; i < (vc4->bo_handles.next - - vc4->bo_handles.base) / 4; i++) { + for (int i = 0; i < cl_offset(&vc4->bo_handles) / 4; i++) { if (referenced_bos[i] == bo) { return true; } diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h index d5d6be16f6e..654c46f3c0d 100644 --- a/src/gallium/drivers/vc4/vc4_context.h +++ b/src/gallium/drivers/vc4/vc4_context.h @@ -67,7 +67,20 @@ #define VC4_DIRTY_CLIP (1 << 20) #define VC4_DIRTY_UNCOMPILED_VS (1 << 21) #define VC4_DIRTY_UNCOMPILED_FS (1 << 22) -#define VC4_DIRTY_COMPILED_FS (1 << 24) +#define VC4_DIRTY_COMPILED_CS (1 << 23) +#define VC4_DIRTY_COMPILED_VS (1 << 24) +#define VC4_DIRTY_COMPILED_FS (1 << 25) + +struct vc4_sampler_view { + struct pipe_sampler_view base; + uint32_t texture_p0; + uint32_t texture_p1; +}; + +struct vc4_sampler_state { + struct pipe_sampler_state base; + uint32_t texture_p1; +}; struct vc4_texture_stateobj { struct pipe_sampler_view *textures[PIPE_MAX_SAMPLERS]; @@ -121,6 +134,12 @@ struct vc4_compiled_shader { struct vc4_ubo_range *ubo_ranges; uint32_t num_ubo_ranges; uint32_t ubo_size; + /** + * VC4_DIRTY_* flags that, when set in vc4->dirty, mean that the + * uniforms have to be rewritten (and therefore the shader state + * reemitted). + */ + uint32_t uniform_dirty_bits; /** bitmask of which inputs are color inputs, for flat shade handling. */ uint32_t color_inputs; @@ -238,6 +257,11 @@ struct vc4_context { */ bool draw_call_queued; + /** Maximum index buffer valid for the current shader_rec. */ + uint32_t max_index; + /** Last index bias baked into the current shader_rec. */ + uint32_t last_index_bias; + struct primconvert_context *primconvert; struct hash_table *fs_cache, *vs_cache; @@ -246,6 +270,7 @@ struct vc4_context { struct ra_regs *regs; unsigned int reg_class_any; + unsigned int reg_class_r4_or_a; unsigned int reg_class_a; uint8_t prim_mode; @@ -326,6 +351,18 @@ vc4_context(struct pipe_context *pcontext) return (struct vc4_context *)pcontext; } +static inline struct vc4_sampler_view * +vc4_sampler_view(struct pipe_sampler_view *psview) +{ + return (struct vc4_sampler_view *)psview; +} + +static inline struct vc4_sampler_state * +vc4_sampler_state(struct pipe_sampler_state *psampler) +{ + return (struct vc4_sampler_state *)psampler; +} + struct pipe_context *vc4_context_create(struct pipe_screen *pscreen, void *priv); void vc4_draw_init(struct pipe_context *pctx); @@ -337,6 +374,7 @@ void vc4_simulator_init(struct vc4_screen *screen); int vc4_simulator_flush(struct vc4_context *vc4, struct drm_vc4_submit_cl *args); +void vc4_set_shader_uniform_dirty_flags(struct vc4_compiled_shader *shader); void vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader, struct vc4_constbuf_stateobj *cb, diff --git a/src/gallium/drivers/vc4/vc4_draw.c b/src/gallium/drivers/vc4/vc4_draw.c index 5e6d70d6f33..a4e5e092b1a 100644 --- a/src/gallium/drivers/vc4/vc4_draw.c +++ b/src/gallium/drivers/vc4/vc4_draw.c @@ -71,37 +71,40 @@ vc4_start_draw(struct vc4_context *vc4) uint32_t height = vc4->framebuffer.height; uint32_t tilew = align(width, 64) / 64; uint32_t tileh = align(height, 64) / 64; + struct vc4_cl_out *bcl = cl_start(&vc4->bcl); // Tile state data is 48 bytes per tile, I think it can be thrown away // as soon as binning is finished. 
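/* How the new uniform_dirty_bits and last_index_bias fields are consumed at
 * draw time (condensed from vc4_draw_vbo later in this patch): the GL shader
 * record is re-emitted only when state it actually bakes in has changed.
 * index_bias is folded into each attribute's base address (stride *
 * index_bias), so it is tracked on the side rather than as a dirty flag.
 */
static bool need_new_shader_rec(struct vc4_context *vc4,
                                const struct pipe_draw_info *info)
{
        uint32_t deps = VC4_DIRTY_VTXBUF | VC4_DIRTY_VTXSTATE |
                        VC4_DIRTY_PRIM_MODE | VC4_DIRTY_RASTERIZER |
                        VC4_DIRTY_COMPILED_CS | VC4_DIRTY_COMPILED_VS |
                        VC4_DIRTY_COMPILED_FS |
                        vc4->prog.cs->uniform_dirty_bits |
                        vc4->prog.vs->uniform_dirty_bits |
                        vc4->prog.fs->uniform_dirty_bits;

        return (vc4->dirty & deps) ||
               vc4->last_index_bias != info->index_bias;
}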
- cl_u8(&vc4->bcl, VC4_PACKET_TILE_BINNING_MODE_CONFIG); - cl_u32(&vc4->bcl, 0); /* tile alloc addr, filled by kernel */ - cl_u32(&vc4->bcl, 0); /* tile alloc size, filled by kernel */ - cl_u32(&vc4->bcl, 0); /* tile state addr, filled by kernel */ - cl_u8(&vc4->bcl, tilew); - cl_u8(&vc4->bcl, tileh); - cl_u8(&vc4->bcl, 0); /* flags, filled by kernel. */ + cl_u8(&bcl, VC4_PACKET_TILE_BINNING_MODE_CONFIG); + cl_u32(&bcl, 0); /* tile alloc addr, filled by kernel */ + cl_u32(&bcl, 0); /* tile alloc size, filled by kernel */ + cl_u32(&bcl, 0); /* tile state addr, filled by kernel */ + cl_u8(&bcl, tilew); + cl_u8(&bcl, tileh); + cl_u8(&bcl, 0); /* flags, filled by kernel. */ /* START_TILE_BINNING resets the statechange counters in the hardware, * which are what is used when a primitive is binned to a tile to * figure out what new state packets need to be written to that tile's * command list. */ - cl_u8(&vc4->bcl, VC4_PACKET_START_TILE_BINNING); + cl_u8(&bcl, VC4_PACKET_START_TILE_BINNING); /* Reset the current compressed primitives format. This gets modified * by VC4_PACKET_GL_INDEXED_PRIMITIVE and * VC4_PACKET_GL_ARRAY_PRIMITIVE, so it needs to be reset at the start * of every tile. */ - cl_u8(&vc4->bcl, VC4_PACKET_PRIMITIVE_LIST_FORMAT); - cl_u8(&vc4->bcl, (VC4_PRIMITIVE_LIST_FORMAT_16_INDEX | - VC4_PRIMITIVE_LIST_FORMAT_TYPE_TRIANGLES)); + cl_u8(&bcl, VC4_PACKET_PRIMITIVE_LIST_FORMAT); + cl_u8(&bcl, (VC4_PRIMITIVE_LIST_FORMAT_16_INDEX | + VC4_PRIMITIVE_LIST_FORMAT_TYPE_TRIANGLES)); vc4->needs_flush = true; vc4->draw_call_queued = true; vc4->draw_width = width; vc4->draw_height = height; + + cl_end(&vc4->bcl, bcl); } static void @@ -119,96 +122,67 @@ vc4_update_shadow_textures(struct pipe_context *pctx, } static void -vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) +vc4_emit_gl_shader_state(struct vc4_context *vc4, const struct pipe_draw_info *info) { - struct vc4_context *vc4 = vc4_context(pctx); - - if (info->mode >= PIPE_PRIM_QUADS) { - util_primconvert_save_index_buffer(vc4->primconvert, &vc4->indexbuf); - util_primconvert_save_rasterizer_state(vc4->primconvert, &vc4->rasterizer->base); - util_primconvert_draw_vbo(vc4->primconvert, info); - perf_debug("Fallback conversion for %d %s vertices\n", - info->count, u_prim_name(info->mode)); - return; - } - - /* Before setting up the draw, do any fixup blits necessary. */ - vc4_update_shadow_textures(pctx, &vc4->verttex); - vc4_update_shadow_textures(pctx, &vc4->fragtex); - - vc4_get_draw_cl_space(vc4); - + /* VC4_DIRTY_VTXSTATE */ struct vc4_vertex_stateobj *vtx = vc4->vtx; + /* VC4_DIRTY_VTXBUF */ struct vc4_vertexbuf_stateobj *vertexbuf = &vc4->vertexbuf; - if (vc4->prim_mode != info->mode) { - vc4->prim_mode = info->mode; - vc4->dirty |= VC4_DIRTY_PRIM_MODE; - } - - vc4_start_draw(vc4); - vc4_update_compiled_shaders(vc4, info->mode); - - vc4_emit_state(pctx); - vc4->dirty = 0; - - vc4_write_uniforms(vc4, vc4->prog.fs, - &vc4->constbuf[PIPE_SHADER_FRAGMENT], - &vc4->fragtex); - vc4_write_uniforms(vc4, vc4->prog.vs, - &vc4->constbuf[PIPE_SHADER_VERTEX], - &vc4->verttex); - vc4_write_uniforms(vc4, vc4->prog.cs, - &vc4->constbuf[PIPE_SHADER_VERTEX], - &vc4->verttex); - /* The simulator throws a fit if VS or CS don't read an attribute, so * we emit a dummy read. */ uint32_t num_elements_emit = MAX2(vtx->num_elements, 1); /* Emit the shader record. 
*/ - cl_start_shader_reloc(&vc4->shader_rec, 3 + num_elements_emit); - cl_u16(&vc4->shader_rec, + struct vc4_cl_out *shader_rec = + cl_start_shader_reloc(&vc4->shader_rec, 3 + num_elements_emit); + /* VC4_DIRTY_PRIM_MODE | VC4_DIRTY_RASTERIZER */ + cl_u16(&shader_rec, VC4_SHADER_FLAG_ENABLE_CLIPPING | + VC4_SHADER_FLAG_FS_SINGLE_THREAD | ((info->mode == PIPE_PRIM_POINTS && vc4->rasterizer->base.point_size_per_vertex) ? VC4_SHADER_FLAG_VS_POINT_SIZE : 0)); - cl_u8(&vc4->shader_rec, 0); /* fs num uniforms (unused) */ - cl_u8(&vc4->shader_rec, vc4->prog.fs->num_inputs); - cl_reloc(vc4, &vc4->shader_rec, vc4->prog.fs->bo, 0); - cl_u32(&vc4->shader_rec, 0); /* UBO offset written by kernel */ - - cl_u16(&vc4->shader_rec, 0); /* vs num uniforms */ - cl_u8(&vc4->shader_rec, vc4->prog.vs->vattrs_live); - cl_u8(&vc4->shader_rec, vc4->prog.vs->vattr_offsets[8]); - cl_reloc(vc4, &vc4->shader_rec, vc4->prog.vs->bo, 0); - cl_u32(&vc4->shader_rec, 0); /* UBO offset written by kernel */ - - cl_u16(&vc4->shader_rec, 0); /* cs num uniforms */ - cl_u8(&vc4->shader_rec, vc4->prog.cs->vattrs_live); - cl_u8(&vc4->shader_rec, vc4->prog.cs->vattr_offsets[8]); - cl_reloc(vc4, &vc4->shader_rec, vc4->prog.cs->bo, 0); - cl_u32(&vc4->shader_rec, 0); /* UBO offset written by kernel */ + + /* VC4_DIRTY_COMPILED_FS */ + cl_u8(&shader_rec, 0); /* fs num uniforms (unused) */ + cl_u8(&shader_rec, vc4->prog.fs->num_inputs); + cl_reloc(vc4, &vc4->shader_rec, &shader_rec, vc4->prog.fs->bo, 0); + cl_u32(&shader_rec, 0); /* UBO offset written by kernel */ + + /* VC4_DIRTY_COMPILED_VS */ + cl_u16(&shader_rec, 0); /* vs num uniforms */ + cl_u8(&shader_rec, vc4->prog.vs->vattrs_live); + cl_u8(&shader_rec, vc4->prog.vs->vattr_offsets[8]); + cl_reloc(vc4, &vc4->shader_rec, &shader_rec, vc4->prog.vs->bo, 0); + cl_u32(&shader_rec, 0); /* UBO offset written by kernel */ + + /* VC4_DIRTY_COMPILED_CS */ + cl_u16(&shader_rec, 0); /* cs num uniforms */ + cl_u8(&shader_rec, vc4->prog.cs->vattrs_live); + cl_u8(&shader_rec, vc4->prog.cs->vattr_offsets[8]); + cl_reloc(vc4, &vc4->shader_rec, &shader_rec, vc4->prog.cs->bo, 0); + cl_u32(&shader_rec, 0); /* UBO offset written by kernel */ uint32_t max_index = 0xffff; - uint32_t vpm_offset = 0; for (int i = 0; i < vtx->num_elements; i++) { struct pipe_vertex_element *elem = &vtx->pipe[i]; struct pipe_vertex_buffer *vb = &vertexbuf->vb[elem->vertex_buffer_index]; struct vc4_resource *rsc = vc4_resource(vb->buffer); - uint32_t offset = vb->buffer_offset + elem->src_offset; + /* not vc4->dirty tracked: vc4->last_index_bias */ + uint32_t offset = (vb->buffer_offset + + elem->src_offset + + vb->stride * info->index_bias); uint32_t vb_size = rsc->bo->size - offset; uint32_t elem_size = util_format_get_blocksize(elem->src_format); - cl_reloc(vc4, &vc4->shader_rec, rsc->bo, offset); - cl_u8(&vc4->shader_rec, elem_size - 1); - cl_u8(&vc4->shader_rec, vb->stride); - cl_u8(&vc4->shader_rec, vc4->prog.vs->vattr_offsets[i]); - cl_u8(&vc4->shader_rec, vc4->prog.cs->vattr_offsets[i]); - - vpm_offset += align(elem_size, 4); + cl_reloc(vc4, &vc4->shader_rec, &shader_rec, rsc->bo, offset); + cl_u8(&shader_rec, elem_size - 1); + cl_u8(&shader_rec, vb->stride); + cl_u8(&shader_rec, vc4->prog.vs->vattr_offsets[i]); + cl_u8(&shader_rec, vc4->prog.cs->vattr_offsets[i]); if (vb->stride > 0) { max_index = MIN2(max_index, @@ -219,25 +193,89 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) if (vtx->num_elements == 0) { assert(num_elements_emit == 1); struct vc4_bo *bo = vc4_bo_alloc(vc4->screen, 
4096, "scratch VBO"); - cl_reloc(vc4, &vc4->shader_rec, bo, 0); - cl_u8(&vc4->shader_rec, 16 - 1); /* element size */ - cl_u8(&vc4->shader_rec, 0); /* stride */ - cl_u8(&vc4->shader_rec, 0); /* VS VPM offset */ - cl_u8(&vc4->shader_rec, 0); /* CS VPM offset */ + cl_reloc(vc4, &vc4->shader_rec, &shader_rec, bo, 0); + cl_u8(&shader_rec, 16 - 1); /* element size */ + cl_u8(&shader_rec, 0); /* stride */ + cl_u8(&shader_rec, 0); /* VS VPM offset */ + cl_u8(&shader_rec, 0); /* CS VPM offset */ vc4_bo_unreference(&bo); } + cl_end(&vc4->shader_rec, shader_rec); + struct vc4_cl_out *bcl = cl_start(&vc4->bcl); /* the actual draw call. */ - cl_u8(&vc4->bcl, VC4_PACKET_GL_SHADER_STATE); + cl_u8(&bcl, VC4_PACKET_GL_SHADER_STATE); assert(vtx->num_elements <= 8); /* Note that number of attributes == 0 in the packet means 8 * attributes. This field also contains the offset into shader_rec. */ - cl_u32(&vc4->bcl, num_elements_emit & 0x7); + cl_u32(&bcl, num_elements_emit & 0x7); + cl_end(&vc4->bcl, bcl); + + vc4_write_uniforms(vc4, vc4->prog.fs, + &vc4->constbuf[PIPE_SHADER_FRAGMENT], + &vc4->fragtex); + vc4_write_uniforms(vc4, vc4->prog.vs, + &vc4->constbuf[PIPE_SHADER_VERTEX], + &vc4->verttex); + vc4_write_uniforms(vc4, vc4->prog.cs, + &vc4->constbuf[PIPE_SHADER_VERTEX], + &vc4->verttex); + + vc4->last_index_bias = info->index_bias; + vc4->max_index = max_index; +} + +static void +vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) +{ + struct vc4_context *vc4 = vc4_context(pctx); + + if (info->mode >= PIPE_PRIM_QUADS) { + util_primconvert_save_index_buffer(vc4->primconvert, &vc4->indexbuf); + util_primconvert_save_rasterizer_state(vc4->primconvert, &vc4->rasterizer->base); + util_primconvert_draw_vbo(vc4->primconvert, info); + perf_debug("Fallback conversion for %d %s vertices\n", + info->count, u_prim_name(info->mode)); + return; + } + + /* Before setting up the draw, do any fixup blits necessary. */ + vc4_update_shadow_textures(pctx, &vc4->verttex); + vc4_update_shadow_textures(pctx, &vc4->fragtex); + + vc4_get_draw_cl_space(vc4); + + if (vc4->prim_mode != info->mode) { + vc4->prim_mode = info->mode; + vc4->dirty |= VC4_DIRTY_PRIM_MODE; + } + + vc4_start_draw(vc4); + vc4_update_compiled_shaders(vc4, info->mode); + + vc4_emit_state(pctx); + + if ((vc4->dirty & (VC4_DIRTY_VTXBUF | + VC4_DIRTY_VTXSTATE | + VC4_DIRTY_PRIM_MODE | + VC4_DIRTY_RASTERIZER | + VC4_DIRTY_COMPILED_CS | + VC4_DIRTY_COMPILED_VS | + VC4_DIRTY_COMPILED_FS | + vc4->prog.cs->uniform_dirty_bits | + vc4->prog.vs->uniform_dirty_bits | + vc4->prog.fs->uniform_dirty_bits)) || + vc4->last_index_bias != info->index_bias) { + vc4_emit_gl_shader_state(vc4, info); + } + + vc4->dirty = 0; /* Note that the primitive type fields match with OpenGL/gallium * definitions, up to but not including QUADS. */ + struct vc4_cl_out *bcl = cl_start(&vc4->bcl); if (info->indexed) { uint32_t offset = vc4->indexbuf.offset; uint32_t index_size = vc4->indexbuf.index_size; @@ -251,25 +289,26 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) } struct vc4_resource *rsc = vc4_resource(prsc); - cl_start_reloc(&vc4->bcl, 1); - cl_u8(&vc4->bcl, VC4_PACKET_GL_INDEXED_PRIMITIVE); - cl_u8(&vc4->bcl, + cl_start_reloc(&vc4->bcl, &bcl, 1); + cl_u8(&bcl, VC4_PACKET_GL_INDEXED_PRIMITIVE); + cl_u8(&bcl, info->mode | (index_size == 2 ? 
VC4_INDEX_BUFFER_U16: VC4_INDEX_BUFFER_U8)); - cl_u32(&vc4->bcl, info->count); - cl_reloc(vc4, &vc4->bcl, rsc->bo, offset); - cl_u32(&vc4->bcl, max_index); + cl_u32(&bcl, info->count); + cl_reloc(vc4, &vc4->bcl, &bcl, rsc->bo, offset); + cl_u32(&bcl, vc4->max_index); if (vc4->indexbuf.index_size == 4) pipe_resource_reference(&prsc, NULL); } else { - cl_u8(&vc4->bcl, VC4_PACKET_GL_ARRAY_PRIMITIVE); - cl_u8(&vc4->bcl, info->mode); - cl_u32(&vc4->bcl, info->count); - cl_u32(&vc4->bcl, info->start); + cl_u8(&bcl, VC4_PACKET_GL_ARRAY_PRIMITIVE); + cl_u8(&bcl, info->mode); + cl_u32(&bcl, info->count); + cl_u32(&bcl, info->start); } + cl_end(&vc4->bcl, bcl); if (vc4->zsa && vc4->zsa->base.depth.enabled) { vc4->resolve |= PIPE_CLEAR_DEPTH; diff --git a/src/gallium/drivers/vc4/vc4_drm.h b/src/gallium/drivers/vc4/vc4_drm.h index 5f1ee4fa125..863ef8da8fb 100644 --- a/src/gallium/drivers/vc4/vc4_drm.h +++ b/src/gallium/drivers/vc4/vc4_drm.h @@ -31,12 +31,14 @@ #define DRM_VC4_WAIT_BO 0x02 #define DRM_VC4_CREATE_BO 0x03 #define DRM_VC4_MMAP_BO 0x04 +#define DRM_VC4_CREATE_SHADER_BO 0x05 #define DRM_IOCTL_VC4_SUBMIT_CL DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_SUBMIT_CL, struct drm_vc4_submit_cl) #define DRM_IOCTL_VC4_WAIT_SEQNO DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_WAIT_SEQNO, struct drm_vc4_wait_seqno) #define DRM_IOCTL_VC4_WAIT_BO DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_WAIT_BO, struct drm_vc4_wait_bo) #define DRM_IOCTL_VC4_CREATE_BO DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_CREATE_BO, struct drm_vc4_create_bo) #define DRM_IOCTL_VC4_MMAP_BO DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_MMAP_BO, struct drm_vc4_mmap_bo) +#define DRM_IOCTL_VC4_CREATE_SHADER_BO DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_CREATE_SHADER_BO, struct drm_vc4_create_shader_bo) struct drm_vc4_submit_rcl_surface { uint32_t hindex; /* Handle index, or ~0 if not present. */ @@ -183,6 +185,29 @@ struct drm_vc4_create_bo { }; /** + * struct drm_vc4_create_shader_bo - ioctl argument for creating VC4 + * shader BOs. + * + * Since allowing a shader to be overwritten while it's also being + * executed from would allow privilege escalation, shaders must be + * created using this ioctl, and they can't be mmapped later. + */ +struct drm_vc4_create_shader_bo { + /* Size of the data argument. */ + uint32_t size; + /* Flags, currently must be 0. */ + uint32_t flags; + + /* Pointer to the data. */ + uint64_t data; + + /** Returned GEM handle for the BO. */ + uint32_t handle; + /* Pad, must be 0. */ + uint32_t pad; +}; + +/** * struct drm_vc4_mmap_bo - ioctl argument for mapping VC4 BOs. * * This doesn't actually perform an mmap.
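/* A userspace call sketch for the new ioctl (mirrors vc4_bo_alloc_shader
 * earlier in this patch; error handling trimmed).  data is carried as a
 * uint64_t so the struct has the same layout for 32- and 64-bit callers.
 */
#include <stdint.h>
#include <xf86drm.h>
#include "vc4_drm.h"

static uint32_t create_shader_handle(int fd, const void *code, uint32_t size)
{
        struct drm_vc4_create_shader_bo create = {
                .size = size,
                .data = (uintptr_t)code,
        };

        if (drmIoctl(fd, DRM_IOCTL_VC4_CREATE_SHADER_BO, &create) != 0)
                return 0;               /* 0 is never a valid GEM handle */
        return create.handle;
}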
Instead, it returns the diff --git a/src/gallium/drivers/vc4/vc4_emit.c b/src/gallium/drivers/vc4/vc4_emit.c index d2b54fccf91..ba064ff889b 100644 --- a/src/gallium/drivers/vc4/vc4_emit.c +++ b/src/gallium/drivers/vc4/vc4_emit.c @@ -28,23 +28,24 @@ vc4_emit_state(struct pipe_context *pctx) { struct vc4_context *vc4 = vc4_context(pctx); + struct vc4_cl_out *bcl = cl_start(&vc4->bcl); if (vc4->dirty & (VC4_DIRTY_SCISSOR | VC4_DIRTY_VIEWPORT)) { float *vpscale = vc4->viewport.scale; float *vptranslate = vc4->viewport.translate; - float vp_minx = -fabs(vpscale[0]) + vptranslate[0]; - float vp_maxx = fabs(vpscale[0]) + vptranslate[0]; - float vp_miny = -fabs(vpscale[1]) + vptranslate[1]; - float vp_maxy = fabs(vpscale[1]) + vptranslate[1]; + float vp_minx = -fabsf(vpscale[0]) + vptranslate[0]; + float vp_maxx = fabsf(vpscale[0]) + vptranslate[0]; + float vp_miny = -fabsf(vpscale[1]) + vptranslate[1]; + float vp_maxy = fabsf(vpscale[1]) + vptranslate[1]; uint32_t minx = MAX2(vc4->scissor.minx, vp_minx); uint32_t miny = MAX2(vc4->scissor.miny, vp_miny); uint32_t maxx = MIN2(vc4->scissor.maxx, vp_maxx); uint32_t maxy = MIN2(vc4->scissor.maxy, vp_maxy); - cl_u8(&vc4->bcl, VC4_PACKET_CLIP_WINDOW); - cl_u16(&vc4->bcl, minx); - cl_u16(&vc4->bcl, miny); - cl_u16(&vc4->bcl, maxx - minx); - cl_u16(&vc4->bcl, maxy - miny); + cl_u8(&bcl, VC4_PACKET_CLIP_WINDOW); + cl_u16(&bcl, minx); + cl_u16(&bcl, miny); + cl_u16(&bcl, maxx - minx); + cl_u16(&bcl, maxy - miny); vc4->draw_min_x = MIN2(vc4->draw_min_x, minx); vc4->draw_min_y = MIN2(vc4->draw_min_y, miny); @@ -53,47 +54,49 @@ vc4_emit_state(struct pipe_context *pctx) } if (vc4->dirty & (VC4_DIRTY_RASTERIZER | VC4_DIRTY_ZSA)) { - cl_u8(&vc4->bcl, VC4_PACKET_CONFIGURATION_BITS); - cl_u8(&vc4->bcl, + cl_u8(&bcl, VC4_PACKET_CONFIGURATION_BITS); + cl_u8(&bcl, vc4->rasterizer->config_bits[0] | vc4->zsa->config_bits[0]); - cl_u8(&vc4->bcl, + cl_u8(&bcl, vc4->rasterizer->config_bits[1] | vc4->zsa->config_bits[1]); - cl_u8(&vc4->bcl, + cl_u8(&bcl, vc4->rasterizer->config_bits[2] | vc4->zsa->config_bits[2]); } if (vc4->dirty & VC4_DIRTY_RASTERIZER) { - cl_u8(&vc4->bcl, VC4_PACKET_DEPTH_OFFSET); - cl_u16(&vc4->bcl, vc4->rasterizer->offset_factor); - cl_u16(&vc4->bcl, vc4->rasterizer->offset_units); + cl_u8(&bcl, VC4_PACKET_DEPTH_OFFSET); + cl_u16(&bcl, vc4->rasterizer->offset_factor); + cl_u16(&bcl, vc4->rasterizer->offset_units); - cl_u8(&vc4->bcl, VC4_PACKET_POINT_SIZE); - cl_f(&vc4->bcl, vc4->rasterizer->point_size); + cl_u8(&bcl, VC4_PACKET_POINT_SIZE); + cl_f(&bcl, vc4->rasterizer->point_size); - cl_u8(&vc4->bcl, VC4_PACKET_LINE_WIDTH); - cl_f(&vc4->bcl, vc4->rasterizer->base.line_width); + cl_u8(&bcl, VC4_PACKET_LINE_WIDTH); + cl_f(&bcl, vc4->rasterizer->base.line_width); } if (vc4->dirty & VC4_DIRTY_VIEWPORT) { - cl_u8(&vc4->bcl, VC4_PACKET_CLIPPER_XY_SCALING); - cl_f(&vc4->bcl, vc4->viewport.scale[0] * 16.0f); - cl_f(&vc4->bcl, vc4->viewport.scale[1] * 16.0f); + cl_u8(&bcl, VC4_PACKET_CLIPPER_XY_SCALING); + cl_f(&bcl, vc4->viewport.scale[0] * 16.0f); + cl_f(&bcl, vc4->viewport.scale[1] * 16.0f); - cl_u8(&vc4->bcl, VC4_PACKET_CLIPPER_Z_SCALING); - cl_f(&vc4->bcl, vc4->viewport.translate[2]); - cl_f(&vc4->bcl, vc4->viewport.scale[2]); + cl_u8(&bcl, VC4_PACKET_CLIPPER_Z_SCALING); + cl_f(&bcl, vc4->viewport.translate[2]); + cl_f(&bcl, vc4->viewport.scale[2]); - cl_u8(&vc4->bcl, VC4_PACKET_VIEWPORT_OFFSET); - cl_u16(&vc4->bcl, 16 * vc4->viewport.translate[0]); - cl_u16(&vc4->bcl, 16 * vc4->viewport.translate[1]); + cl_u8(&bcl, VC4_PACKET_VIEWPORT_OFFSET); + 
cl_u16(&bcl, 16 * vc4->viewport.translate[0]); + cl_u16(&bcl, 16 * vc4->viewport.translate[1]); } if (vc4->dirty & VC4_DIRTY_FLAT_SHADE_FLAGS) { - cl_u8(&vc4->bcl, VC4_PACKET_FLAT_SHADE_FLAGS); - cl_u32(&vc4->bcl, vc4->rasterizer->base.flatshade ? + cl_u8(&bcl, VC4_PACKET_FLAT_SHADE_FLAGS); + cl_u32(&bcl, vc4->rasterizer->base.flatshade ? vc4->prog.fs->color_inputs : 0); } + + cl_end(&vc4->bcl, bcl); } diff --git a/src/gallium/drivers/vc4/vc4_fence.c b/src/gallium/drivers/vc4/vc4_fence.c index f2ee91de61a..b6fb2a8a460 100644 --- a/src/gallium/drivers/vc4/vc4_fence.c +++ b/src/gallium/drivers/vc4/vc4_fence.c @@ -60,16 +60,6 @@ vc4_fence_reference(struct pipe_screen *pscreen, } static boolean -vc4_fence_signalled(struct pipe_screen *pscreen, - struct pipe_fence_handle *pf) -{ - struct vc4_screen *screen = vc4_screen(pscreen); - struct vc4_fence *f = (struct vc4_fence *)pf; - - return vc4_wait_seqno(screen, f->seqno, 0); -} - -static boolean vc4_fence_finish(struct pipe_screen *pscreen, struct pipe_fence_handle *pf, uint64_t timeout_ns) @@ -77,7 +67,7 @@ vc4_fence_finish(struct pipe_screen *pscreen, struct vc4_screen *screen = vc4_screen(pscreen); struct vc4_fence *f = (struct vc4_fence *)pf; - return vc4_wait_seqno(screen, f->seqno, timeout_ns); + return vc4_wait_seqno(screen, f->seqno, timeout_ns, "fence wait"); } struct vc4_fence * @@ -98,6 +88,5 @@ void vc4_fence_init(struct vc4_screen *screen) { screen->base.fence_reference = vc4_fence_reference; - screen->base.fence_signalled = vc4_fence_signalled; screen->base.fence_finish = vc4_fence_finish; } diff --git a/src/gallium/drivers/vc4/vc4_formats.c b/src/gallium/drivers/vc4/vc4_formats.c index 004bac70c67..ffce61237de 100644 --- a/src/gallium/drivers/vc4/vc4_formats.c +++ b/src/gallium/drivers/vc4/vc4_formats.c @@ -108,7 +108,7 @@ static const struct vc4_format vc4_format_table[] = { static const struct vc4_format * get_format(enum pipe_format f) { - if (f > ARRAY_SIZE(vc4_format_table) || + if (f >= ARRAY_SIZE(vc4_format_table) || !vc4_format_table[f].present) return NULL; else diff --git a/src/gallium/drivers/vc4/vc4_job.c b/src/gallium/drivers/vc4/vc4_job.c index dcade15443a..7ebd9f160eb 100644 --- a/src/gallium/drivers/vc4/vc4_job.c +++ b/src/gallium/drivers/vc4/vc4_job.c @@ -44,8 +44,7 @@ void vc4_job_reset(struct vc4_context *vc4) { struct vc4_bo **referenced_bos = vc4->bo_pointers.base; - for (int i = 0; i < (vc4->bo_handles.next - - vc4->bo_handles.base) / 4; i++) { + for (int i = 0; i < cl_offset(&vc4->bo_handles) / 4; i++) { vc4_bo_unreference(&referenced_bos[i]); } vc4_reset_cl(&vc4->bcl); @@ -145,7 +144,7 @@ vc4_job_submit(struct vc4_context *vc4) { if (vc4_debug & VC4_DEBUG_CL) { fprintf(stderr, "BCL:\n"); - vc4_dump_cl(vc4->bcl.base, vc4->bcl.next - vc4->bcl.base, false); + vc4_dump_cl(vc4->bcl.base, cl_offset(&vc4->bcl), false); } struct drm_vc4_submit_cl submit; @@ -164,15 +163,14 @@ vc4_job_submit(struct vc4_context *vc4) vc4->zs_write, true, true); submit.bo_handles = (uintptr_t)vc4->bo_handles.base; - submit.bo_handle_count = (vc4->bo_handles.next - - vc4->bo_handles.base) / 4; + submit.bo_handle_count = cl_offset(&vc4->bo_handles) / 4; submit.bin_cl = (uintptr_t)vc4->bcl.base; - submit.bin_cl_size = vc4->bcl.next - vc4->bcl.base; + submit.bin_cl_size = cl_offset(&vc4->bcl); submit.shader_rec = (uintptr_t)vc4->shader_rec.base; - submit.shader_rec_size = vc4->shader_rec.next - vc4->shader_rec.base; + submit.shader_rec_size = cl_offset(&vc4->shader_rec); submit.shader_rec_count = vc4->shader_rec_count; submit.uniforms = 
(uintptr_t)vc4->uniforms.base; - submit.uniforms_size = vc4->uniforms.next - vc4->uniforms.base; + submit.uniforms_size = cl_offset(&vc4->uniforms); assert(vc4->draw_min_x != ~0 && vc4->draw_min_y != ~0); submit.min_x_tile = vc4->draw_min_x / 64; @@ -207,7 +205,7 @@ vc4_job_submit(struct vc4_context *vc4) if (vc4_debug & VC4_DEBUG_ALWAYS_SYNC) { if (!vc4_wait_seqno(vc4->screen, vc4->last_emit_seqno, - PIPE_TIMEOUT_INFINITE)) { + PIPE_TIMEOUT_INFINITE, "sync")) { fprintf(stderr, "Wait failed.\n"); abort(); } diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c new file mode 100644 index 00000000000..a372a6c0cdc --- /dev/null +++ b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c @@ -0,0 +1,431 @@ +/* + * Copyright © 2015 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** + * Implements most of the fixed function fragment pipeline in shader code. + * + * VC4 doesn't have any hardware support for blending, alpha test, logic ops, + * or color mask. Instead, you read the current contents of the destination + * from the tile buffer after having waited for the scoreboard (which is + * handled by vc4_qpu_emit.c), then do math using your output color and that + * destination value, and update the output color appropriately. + */ + +/** + * Lowers fixed-function blending to a load of the destination color and a + * series of ALU operations before the store of the output. + */ +#include "util/u_format.h" +#include "vc4_qir.h" +#include "glsl/nir/nir_builder.h" +#include "vc4_context.h" + +/** Emits a load of the previous fragment color from the tile buffer. 
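+ *
+ * A sketch of the contract, inferred from the code below: the load is a
+ * single-component nir_intrinsic_load_input at the special
+ * VC4_NIR_TLB_COLOR_READ_INPUT slot, and its result is the packed 32-bit
+ * destination color. vc4_nir_lower_blend_instr() then unpacks it with
+ * nir_unpack_unorm_4x8() and unswizzles the channels into RGBA order.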
*/ +static nir_ssa_def * +vc4_nir_get_dst_color(nir_builder *b) +{ + nir_intrinsic_instr *load = + nir_intrinsic_instr_create(b->shader, + nir_intrinsic_load_input); + load->num_components = 1; + load->const_index[0] = VC4_NIR_TLB_COLOR_READ_INPUT; + nir_ssa_dest_init(&load->instr, &load->dest, 1, NULL); + nir_builder_instr_insert(b, &load->instr); + return &load->dest.ssa; +} + +static nir_ssa_def * +vc4_nir_srgb_decode(nir_builder *b, nir_ssa_def *srgb) +{ + nir_ssa_def *is_low = nir_flt(b, srgb, nir_imm_float(b, 0.04045)); + nir_ssa_def *low = nir_fmul(b, srgb, nir_imm_float(b, 1.0 / 12.92)); + nir_ssa_def *high = nir_fpow(b, + nir_fmul(b, + nir_fadd(b, srgb, + nir_imm_float(b, 0.055)), + nir_imm_float(b, 1.0 / 1.055)), + nir_imm_float(b, 2.4)); + + return nir_bcsel(b, is_low, low, high); +} + +static nir_ssa_def * +vc4_nir_srgb_encode(nir_builder *b, nir_ssa_def *linear) +{ + nir_ssa_def *is_low = nir_flt(b, linear, nir_imm_float(b, 0.0031308)); + nir_ssa_def *low = nir_fmul(b, linear, nir_imm_float(b, 12.92)); + nir_ssa_def *high = nir_fsub(b, + nir_fmul(b, + nir_imm_float(b, 1.055), + nir_fpow(b, + linear, + nir_imm_float(b, 0.41666))), + nir_imm_float(b, 0.055)); + + return nir_bcsel(b, is_low, low, high); +} + +static nir_ssa_def * +vc4_blend_channel(nir_builder *b, + nir_ssa_def **src, + nir_ssa_def **dst, + unsigned factor, + int channel) +{ + switch(factor) { + case PIPE_BLENDFACTOR_ONE: + return nir_imm_float(b, 1.0); + case PIPE_BLENDFACTOR_SRC_COLOR: + return src[channel]; + case PIPE_BLENDFACTOR_SRC_ALPHA: + return src[3]; + case PIPE_BLENDFACTOR_DST_ALPHA: + return dst[3]; + case PIPE_BLENDFACTOR_DST_COLOR: + return dst[channel]; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + if (channel != 3) { + return nir_fmin(b, + src[3], + nir_fsub(b, + nir_imm_float(b, 1.0), + dst[3])); + } else { + return nir_imm_float(b, 1.0); + } + case PIPE_BLENDFACTOR_CONST_COLOR: + return vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_X + channel); + case PIPE_BLENDFACTOR_CONST_ALPHA: + return vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_W); + case PIPE_BLENDFACTOR_ZERO: + return nir_imm_float(b, 0.0); + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + return nir_fsub(b, nir_imm_float(b, 1.0), src[channel]); + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + return nir_fsub(b, nir_imm_float(b, 1.0), src[3]); + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + return nir_fsub(b, nir_imm_float(b, 1.0), dst[3]); + case PIPE_BLENDFACTOR_INV_DST_COLOR: + return nir_fsub(b, nir_imm_float(b, 1.0), dst[channel]); + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + return nir_fsub(b, nir_imm_float(b, 1.0), + vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_X + channel)); + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + return nir_fsub(b, nir_imm_float(b, 1.0), + vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_W)); + + default: + case PIPE_BLENDFACTOR_SRC1_COLOR: + case PIPE_BLENDFACTOR_SRC1_ALPHA: + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + /* Unsupported. 
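+ * The SRC1 factors would require dual-source blending, which this
+ * hardware doesn't provide, so a factor of 1.0 is returned as a safe
+ * fallback.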
*/ + fprintf(stderr, "Unknown blend factor %d\n", factor); + return nir_imm_float(b, 1.0); + } +} + +static nir_ssa_def * +vc4_blend_func(nir_builder *b, nir_ssa_def *src, nir_ssa_def *dst, + unsigned func) +{ + switch (func) { + case PIPE_BLEND_ADD: + return nir_fadd(b, src, dst); + case PIPE_BLEND_SUBTRACT: + return nir_fsub(b, src, dst); + case PIPE_BLEND_REVERSE_SUBTRACT: + return nir_fsub(b, dst, src); + case PIPE_BLEND_MIN: + return nir_fmin(b, src, dst); + case PIPE_BLEND_MAX: + return nir_fmax(b, src, dst); + + default: + /* Unsupported. */ + fprintf(stderr, "Unknown blend func %d\n", func); + return src; + + } +} + +static void +vc4_do_blending(struct vc4_compile *c, nir_builder *b, nir_ssa_def **result, + nir_ssa_def **src_color, nir_ssa_def **dst_color) +{ + struct pipe_rt_blend_state *blend = &c->fs_key->blend; + + if (!blend->blend_enable) { + for (int i = 0; i < 4; i++) + result[i] = src_color[i]; + return; + } + + /* Clamp the src color to [0, 1]. Dest is already clamped. */ + for (int i = 0; i < 4; i++) + src_color[i] = nir_fsat(b, src_color[i]); + + nir_ssa_def *src_blend[4], *dst_blend[4]; + for (int i = 0; i < 4; i++) { + int src_factor = ((i != 3) ? blend->rgb_src_factor : + blend->alpha_src_factor); + int dst_factor = ((i != 3) ? blend->rgb_dst_factor : + blend->alpha_dst_factor); + src_blend[i] = nir_fmul(b, src_color[i], + vc4_blend_channel(b, + src_color, dst_color, + src_factor, i)); + dst_blend[i] = nir_fmul(b, dst_color[i], + vc4_blend_channel(b, + src_color, dst_color, + dst_factor, i)); + } + + for (int i = 0; i < 4; i++) { + result[i] = vc4_blend_func(b, src_blend[i], dst_blend[i], + ((i != 3) ? blend->rgb_func : + blend->alpha_func)); + } +} + +static nir_ssa_def * +vc4_logicop(nir_builder *b, int logicop_func, + nir_ssa_def *src, nir_ssa_def *dst) +{ + switch (logicop_func) { + case PIPE_LOGICOP_CLEAR: + return nir_imm_int(b, 0); + case PIPE_LOGICOP_NOR: + return nir_inot(b, nir_ior(b, src, dst)); + case PIPE_LOGICOP_AND_INVERTED: + return nir_iand(b, nir_inot(b, src), dst); + case PIPE_LOGICOP_COPY_INVERTED: + return nir_inot(b, src); + case PIPE_LOGICOP_AND_REVERSE: + return nir_iand(b, src, nir_inot(b, dst)); + case PIPE_LOGICOP_INVERT: + return nir_inot(b, dst); + case PIPE_LOGICOP_XOR: + return nir_ixor(b, src, dst); + case PIPE_LOGICOP_NAND: + return nir_inot(b, nir_iand(b, src, dst)); + case PIPE_LOGICOP_AND: + return nir_iand(b, src, dst); + case PIPE_LOGICOP_EQUIV: + return nir_inot(b, nir_ixor(b, src, dst)); + case PIPE_LOGICOP_NOOP: + return dst; + case PIPE_LOGICOP_OR_INVERTED: + return nir_ior(b, nir_inot(b, src), dst); + case PIPE_LOGICOP_OR_REVERSE: + return nir_ior(b, src, nir_inot(b, dst)); + case PIPE_LOGICOP_OR: + return nir_ior(b, src, dst); + case PIPE_LOGICOP_SET: + return nir_imm_int(b, ~0); + default: + fprintf(stderr, "Unknown logic op %d\n", logicop_func); + /* FALLTHROUGH */ + case PIPE_LOGICOP_COPY: + return src; + } +} + +static nir_ssa_def * +vc4_nir_pipe_compare_func(nir_builder *b, int func, + nir_ssa_def *src0, nir_ssa_def *src1) +{ + switch (func) { + default: + fprintf(stderr, "Unknown compare func %d\n", func); + /* FALLTHROUGH */ + case PIPE_FUNC_NEVER: + return nir_imm_int(b, 0); + case PIPE_FUNC_ALWAYS: + return nir_imm_int(b, ~0); + case PIPE_FUNC_EQUAL: + return nir_feq(b, src0, src1); + case PIPE_FUNC_NOTEQUAL: + return nir_fne(b, src0, src1); + case PIPE_FUNC_GREATER: + return nir_flt(b, src1, src0); + case PIPE_FUNC_GEQUAL: + return nir_fge(b, src0, src1); + case PIPE_FUNC_LESS: + return nir_flt(b, src0, src1); + 
case PIPE_FUNC_LEQUAL: + return nir_fge(b, src1, src0); + } +} + +static void +vc4_nir_emit_alpha_test_discard(struct vc4_compile *c, nir_builder *b, + nir_ssa_def *alpha) +{ + if (!c->fs_key->alpha_test) + return; + + nir_ssa_def *alpha_ref = + vc4_nir_get_state_uniform(b, QUNIFORM_ALPHA_REF); + nir_ssa_def *condition = + vc4_nir_pipe_compare_func(b, c->fs_key->alpha_test_func, + alpha, alpha_ref); + + nir_intrinsic_instr *discard = + nir_intrinsic_instr_create(b->shader, + nir_intrinsic_discard_if); + discard->num_components = 1; + discard->src[0] = nir_src_for_ssa(nir_inot(b, condition)); + nir_builder_instr_insert(b, &discard->instr); +} + +static void +vc4_nir_lower_blend_instr(struct vc4_compile *c, nir_builder *b, + nir_intrinsic_instr *intr) +{ + enum pipe_format color_format = c->fs_key->color_format; + const uint8_t *format_swiz = vc4_get_format_swizzle(color_format); + + /* Pull out the float src/dst color components. */ + nir_ssa_def *packed_dst_color = vc4_nir_get_dst_color(b); + nir_ssa_def *dst_vec4 = nir_unpack_unorm_4x8(b, packed_dst_color); + nir_ssa_def *src_color[4], *unpacked_dst_color[4]; + for (unsigned i = 0; i < 4; i++) { + src_color[i] = nir_swizzle(b, intr->src[0].ssa, &i, 1, false); + unpacked_dst_color[i] = nir_swizzle(b, dst_vec4, &i, 1, false); + } + + /* Unswizzle the destination color. */ + nir_ssa_def *dst_color[4]; + for (unsigned i = 0; i < 4; i++) { + dst_color[i] = vc4_nir_get_swizzled_channel(b, + unpacked_dst_color, + format_swiz[i]); + } + + vc4_nir_emit_alpha_test_discard(c, b, src_color[3]); + + /* Turn dst color to linear. */ + if (util_format_is_srgb(color_format)) { + for (int i = 0; i < 3; i++) + dst_color[i] = vc4_nir_srgb_decode(b, dst_color[i]); + } + + nir_ssa_def *blend_color[4]; + vc4_do_blending(c, b, blend_color, src_color, dst_color); + + /* sRGB encode the output color */ + if (util_format_is_srgb(color_format)) { + for (int i = 0; i < 3; i++) + blend_color[i] = vc4_nir_srgb_encode(b, blend_color[i]); + } + + nir_ssa_def *swizzled_outputs[4]; + for (int i = 0; i < 4; i++) { + swizzled_outputs[i] = + vc4_nir_get_swizzled_channel(b, blend_color, + format_swiz[i]); + } + + nir_ssa_def *packed_color = + nir_pack_unorm_4x8(b, + nir_vec4(b, + swizzled_outputs[0], + swizzled_outputs[1], + swizzled_outputs[2], + swizzled_outputs[3])); + + packed_color = vc4_logicop(b, c->fs_key->logicop_func, + packed_color, packed_dst_color); + + /* If the bit isn't set in the color mask, then just return the + * original dst color, instead. + */ + uint32_t colormask = 0xffffffff; + for (int i = 0; i < 4; i++) { + if (format_swiz[i] < 4 && + !(c->fs_key->blend.colormask & (1 << format_swiz[i]))) { + colormask &= ~(0xff << (i * 8)); + } + } + packed_color = nir_ior(b, + nir_iand(b, packed_color, + nir_imm_int(b, colormask)), + nir_iand(b, packed_dst_color, + nir_imm_int(b, ~colormask))); + + /* Turn the old vec4 output into a store of the packed color. 
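+ * The store_output intrinsic itself stays in place; only its source is
+ * rewritten to the single packed dword, and num_components drops from 4
+ * to 1 to match.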
*/ + nir_instr_rewrite_src(&intr->instr, &intr->src[0], + nir_src_for_ssa(packed_color)); + intr->num_components = 1; +} + +static bool +vc4_nir_lower_blend_block(nir_block *block, void *state) +{ + struct vc4_compile *c = state; + + nir_foreach_instr(block, instr) { + if (instr->type != nir_instr_type_intrinsic) + continue; + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_store_output) + continue; + + nir_variable *output_var = NULL; + foreach_list_typed(nir_variable, var, node, &c->s->outputs) { + if (var->data.driver_location == intr->const_index[0]) { + output_var = var; + break; + } + } + assert(output_var); + unsigned semantic_name = output_var->data.location; + + if (semantic_name != TGSI_SEMANTIC_COLOR) + continue; + + nir_function_impl *impl = + nir_cf_node_get_function(&block->cf_node); + nir_builder b; + nir_builder_init(&b, impl); + nir_builder_insert_before_instr(&b, &intr->instr); + vc4_nir_lower_blend_instr(c, &b, intr); + } + return true; +} + +void +vc4_nir_lower_blend(struct vc4_compile *c) +{ + nir_foreach_overload(c->s, overload) { + if (overload->impl) { + nir_foreach_block(overload->impl, + vc4_nir_lower_blend_block, c); + + nir_metadata_preserve(overload->impl, + nir_metadata_block_index | + nir_metadata_dominance); + } + } +} diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_io.c b/src/gallium/drivers/vc4/vc4_nir_lower_io.c new file mode 100644 index 00000000000..229d41147d8 --- /dev/null +++ b/src/gallium/drivers/vc4/vc4_nir_lower_io.c @@ -0,0 +1,291 @@ +/* + * Copyright © 2015 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "vc4_qir.h" +#include "tgsi/tgsi_info.h" +#include "glsl/nir/nir_builder.h" + +/** + * Walks the NIR generated by TGSI-to-NIR to lower its io intrinsics into + * something amenable to the VC4 architecture. + * + * Currently, it splits inputs, outputs, and uniforms into scalars, drops any + * non-position outputs in coordinate shaders, and fixes up the addressing on + * indirect uniform loads. + */ + +static void +replace_intrinsic_with_vec4(nir_builder *b, nir_intrinsic_instr *intr, + nir_ssa_def **comps) +{ + + /* Batch things back together into a vec4. This will get split by the + * later ALU scalarization pass. + */ + nir_ssa_def *vec = nir_vec4(b, comps[0], comps[1], comps[2], comps[3]); + + /* Replace the old intrinsic with a reference to our reconstructed + * vec4.
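+ * All uses of the old intrinsic's destination are rewritten to point at
+ * the reconstructed vec4, and the old intrinsic is then removed.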
+ */ + nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(vec), + ralloc_parent(b->impl)); + nir_instr_remove(&intr->instr); +} + +static void +vc4_nir_lower_input(struct vc4_compile *c, nir_builder *b, + nir_intrinsic_instr *intr) +{ + nir_builder_insert_before_instr(b, &intr->instr); + + if (c->stage == QSTAGE_FRAG && intr->const_index[0] == + VC4_NIR_TLB_COLOR_READ_INPUT) { + /* This doesn't need any lowering. */ + return; + } + + nir_variable *input_var = NULL; + foreach_list_typed(nir_variable, var, node, &c->s->inputs) { + if (var->data.driver_location == intr->const_index[0]) { + input_var = var; + break; + } + } + assert(input_var); + int semantic_name = input_var->data.location; + int semantic_index = input_var->data.index; + + /* All TGSI-to-NIR inputs are vec4. */ + assert(intr->num_components == 4); + + /* Generate scalar loads equivalent to the original VEC4. */ + nir_ssa_def *dests[4]; + for (unsigned i = 0; i < intr->num_components; i++) { + nir_intrinsic_instr *intr_comp = + nir_intrinsic_instr_create(c->s, nir_intrinsic_load_input); + intr_comp->num_components = 1; + intr_comp->const_index[0] = intr->const_index[0] * 4 + i; + nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, NULL); + nir_builder_instr_insert(b, &intr_comp->instr); + + dests[i] = &intr_comp->dest.ssa; + } + + switch (c->stage) { + case QSTAGE_FRAG: + switch (semantic_name) { + case TGSI_SEMANTIC_FACE: + dests[0] = nir_fsub(b, + nir_imm_float(b, 1.0), + nir_fmul(b, + nir_i2f(b, dests[0]), + nir_imm_float(b, 2.0))); + dests[1] = nir_imm_float(b, 0.0); + dests[2] = nir_imm_float(b, 0.0); + dests[3] = nir_imm_float(b, 1.0); + break; + case TGSI_SEMANTIC_GENERIC: + if (c->fs_key->point_sprite_mask & + (1 << semantic_index)) { + if (!c->fs_key->is_points) { + dests[0] = nir_imm_float(b, 0.0); + dests[1] = nir_imm_float(b, 0.0); + } + if (c->fs_key->point_coord_upper_left) { + dests[1] = nir_fsub(b, + nir_imm_float(b, 1.0), + dests[1]); + } + dests[2] = nir_imm_float(b, 0.0); + dests[3] = nir_imm_float(b, 1.0); + } + break; + } + break; + case QSTAGE_COORD: + case QSTAGE_VERT: + break; + } + + replace_intrinsic_with_vec4(b, intr, dests); +} + +static void +vc4_nir_lower_output(struct vc4_compile *c, nir_builder *b, + nir_intrinsic_instr *intr) +{ + nir_variable *output_var = NULL; + foreach_list_typed(nir_variable, var, node, &c->s->outputs) { + if (var->data.driver_location == intr->const_index[0]) { + output_var = var; + break; + } + } + assert(output_var); + unsigned semantic_name = output_var->data.location; + + if (c->stage == QSTAGE_COORD && + (semantic_name != TGSI_SEMANTIC_POSITION && + semantic_name != TGSI_SEMANTIC_PSIZE)) { + nir_instr_remove(&intr->instr); + return; + } + + /* Color output is lowered by vc4_nir_lower_blend(). */ + if (c->stage == QSTAGE_FRAG && semantic_name == TGSI_SEMANTIC_COLOR) { + intr->const_index[0] *= 4; + return; + } + + /* All TGSI-to-NIR outputs are VEC4. 
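+ * The store is split into four single-component store_output
+ * intrinsics, one per channel, with const_index[0] scaled by 4 so that
+ * each scalar channel gets its own driver location.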
*/ + assert(intr->num_components == 4); + + nir_builder_insert_before_instr(b, &intr->instr); + + for (unsigned i = 0; i < intr->num_components; i++) { + nir_intrinsic_instr *intr_comp = + nir_intrinsic_instr_create(c->s, nir_intrinsic_store_output); + intr_comp->num_components = 1; + intr_comp->const_index[0] = intr->const_index[0] * 4 + i; + + assert(intr->src[0].is_ssa); + intr_comp->src[0] = nir_src_for_ssa(nir_swizzle(b, + intr->src[0].ssa, + &i, 1, false)); + nir_builder_instr_insert(b, &intr_comp->instr); + } + + nir_instr_remove(&intr->instr); +} + +static void +vc4_nir_lower_uniform(struct vc4_compile *c, nir_builder *b, + nir_intrinsic_instr *intr) +{ + /* All TGSI-to-NIR uniform loads are vec4, but we may create dword + * loads in our lowering passes. + */ + if (intr->num_components == 1) + return; + assert(intr->num_components == 4); + + nir_builder_insert_before_instr(b, &intr->instr); + + /* Generate scalar loads equivalent to the original VEC4. */ + nir_ssa_def *dests[4]; + for (unsigned i = 0; i < intr->num_components; i++) { + nir_intrinsic_instr *intr_comp = + nir_intrinsic_instr_create(c->s, intr->intrinsic); + intr_comp->num_components = 1; + nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, NULL); + + if (intr->intrinsic == nir_intrinsic_load_uniform_indirect) { + /* Convert the variable TGSI register index to a byte + * offset. + */ + intr_comp->src[0] = + nir_src_for_ssa(nir_ishl(b, + intr->src[0].ssa, + nir_imm_int(b, 4))); + + /* Convert the offset to be a byte index, too. */ + intr_comp->const_index[0] = (intr->const_index[0] * 16 + + i * 4); + } else { + /* We want a dword index for non-indirect uniform + * loads. + */ + intr_comp->const_index[0] = (intr->const_index[0] * 4 + + i); + } + + dests[i] = &intr_comp->dest.ssa; + + nir_builder_instr_insert(b, &intr_comp->instr); + } + + replace_intrinsic_with_vec4(b, intr, dests); +} + +static void +vc4_nir_lower_io_instr(struct vc4_compile *c, nir_builder *b, + struct nir_instr *instr) +{ + if (instr->type != nir_instr_type_intrinsic) + return; + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + + switch (intr->intrinsic) { + case nir_intrinsic_load_input: + vc4_nir_lower_input(c, b, intr); + break; + + case nir_intrinsic_store_output: + vc4_nir_lower_output(c, b, intr); + break; + + case nir_intrinsic_load_uniform: + case nir_intrinsic_load_uniform_indirect: + vc4_nir_lower_uniform(c, b, intr); + break; + + default: + break; + } +} + +static bool +vc4_nir_lower_io_block(nir_block *block, void *arg) +{ + struct vc4_compile *c = arg; + nir_function_impl *impl = + nir_cf_node_get_function(&block->cf_node); + + nir_builder b; + nir_builder_init(&b, impl); + + nir_foreach_instr_safe(block, instr) + vc4_nir_lower_io_instr(c, &b, instr); + + return true; +} + +static bool +vc4_nir_lower_io_impl(struct vc4_compile *c, nir_function_impl *impl) +{ + nir_foreach_block(impl, vc4_nir_lower_io_block, c); + + nir_metadata_preserve(impl, nir_metadata_block_index | + nir_metadata_dominance); + + return true; +} + +void +vc4_nir_lower_io(struct vc4_compile *c) +{ + nir_foreach_overload(c->s, overload) { + if (overload->impl) + vc4_nir_lower_io_impl(c, overload->impl); + } +} diff --git a/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c b/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c index d6d2fbf257f..a755de9aa41 100644 --- a/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c +++ b/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c @@ -67,10 +67,7 @@ qir_opt_copy_propagation(struct vc4_compile *c) if 
(inst->op == QOP_MOV && inst->dst.file == QFILE_TEMP && - inst->src[0].file != QFILE_VPM && - !(inst->src[0].file == QFILE_TEMP && - (c->defs[inst->src[0].index]->op == QOP_TEX_RESULT || - c->defs[inst->src[0].index]->op == QOP_TLB_COLOR_READ))) { + inst->src[0].file != QFILE_VPM) { movs[inst->dst.index] = inst->src[0]; } } diff --git a/src/gallium/drivers/vc4/vc4_opt_cse.c b/src/gallium/drivers/vc4/vc4_opt_cse.c index 92c8260eb59..0e5480ea781 100644 --- a/src/gallium/drivers/vc4/vc4_opt_cse.c +++ b/src/gallium/drivers/vc4/vc4_opt_cse.c @@ -46,8 +46,7 @@ struct inst_key { struct qreg src[4]; /** * If the instruction depends on the flags, how many SFs have been - * seen before this instruction, or if it depends on r4, how many r4 - * writes have been seen. + * seen before this instruction. */ uint32_t implicit_arg_update_count; }; @@ -63,8 +62,7 @@ inst_key_equals(const void *a, const void *b) static struct qinst * vc4_find_cse(struct vc4_compile *c, struct hash_table *ht, - struct qinst *inst, uint32_t sf_count, - uint32_t r4_count) + struct qinst *inst, uint32_t sf_count) { if (inst->dst.file != QFILE_TEMP || inst->op == QOP_MOV || @@ -79,8 +77,6 @@ vc4_find_cse(struct vc4_compile *c, struct hash_table *ht, qir_get_op_nsrc(inst->op) * sizeof(key.src[0])); if (qir_depends_on_flags(inst)) key.implicit_arg_update_count = sf_count; - if (qir_reads_r4(inst)) - key.implicit_arg_update_count = r4_count; uint32_t hash = _mesa_hash_data(&key, sizeof(key)); struct hash_entry *entry = @@ -121,7 +117,7 @@ bool qir_opt_cse(struct vc4_compile *c) { bool progress = false; - uint32_t sf_count = 0, r4_count = 0; + uint32_t sf_count = 0; struct hash_table *ht = _mesa_hash_table_create(NULL, NULL, inst_key_equals); @@ -130,15 +126,15 @@ qir_opt_cse(struct vc4_compile *c) list_for_each_entry(struct qinst, inst, &c->instructions, link) { if (qir_has_side_effects(c, inst) || - qir_has_side_effect_reads(c, inst)) { + qir_has_side_effect_reads(c, inst) || + inst->op == QOP_TLB_COLOR_READ) { continue; } if (inst->sf) { sf_count++; } else { - struct qinst *cse = vc4_find_cse(c, ht, inst, - sf_count, r4_count); + struct qinst *cse = vc4_find_cse(c, ht, inst, sf_count); if (cse) { inst->src[0] = cse->dst; for (int i = 1; i < qir_get_op_nsrc(inst->op); @@ -154,9 +150,6 @@ qir_opt_cse(struct vc4_compile *c) } } } - - if (qir_writes_r4(inst)) - r4_count++; } ralloc_free(ht); diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index ba47c51d9bd..13c472152d8 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -23,21 +23,19 @@ */ #include <inttypes.h> -#include "pipe/p_state.h" #include "util/u_format.h" #include "util/u_hash.h" #include "util/u_math.h" #include "util/u_memory.h" -#include "util/u_pack_color.h" -#include "util/format_srgb.h" #include "util/ralloc.h" #include "util/hash_table.h" #include "tgsi/tgsi_dump.h" #include "tgsi/tgsi_info.h" #include "tgsi/tgsi_lowering.h" #include "tgsi/tgsi_parse.h" +#include "glsl/nir/nir.h" +#include "glsl/nir/nir_builder.h" #include "nir/tgsi_to_nir.h" - #include "vc4_context.h" #include "vc4_qpu.h" #include "vc4_qir.h" @@ -45,51 +43,8 @@ #include "simpenrose/simpenrose.h" #endif -struct vc4_key { - struct vc4_uncompiled_shader *shader_state; - struct { - enum pipe_format format; - unsigned compare_mode:1; - unsigned compare_func:3; - unsigned wrap_s:3; - unsigned wrap_t:3; - uint8_t swizzle[4]; - } tex[VC4_MAX_TEXTURE_SAMPLERS]; - uint8_t ucp_enables; -}; - -struct vc4_fs_key { - struct 
vc4_key base; - enum pipe_format color_format; - bool depth_enabled; - bool stencil_enabled; - bool stencil_twoside; - bool stencil_full_writemasks; - bool is_points; - bool is_lines; - bool alpha_test; - bool point_coord_upper_left; - bool light_twoside; - uint8_t alpha_test_func; - uint8_t logicop_func; - uint32_t point_sprite_mask; - - struct pipe_rt_blend_state blend; -}; - -struct vc4_vs_key { - struct vc4_key base; - - /** - * This is a proxy for the array of FS input semantics, which is - * larger than we would want to put in the key. - */ - uint64_t compiled_fs_id; - - enum pipe_format attr_formats[8]; - bool is_coord; - bool per_vertex_point_size; -}; +static struct qreg +ntq_get_src(struct vc4_compile *c, nir_src src, int i); static void resize_qreg_array(struct vc4_compile *c, @@ -113,10 +68,10 @@ resize_qreg_array(struct vc4_compile *c, } static struct qreg -indirect_uniform_load(struct vc4_compile *c, - struct qreg indirect_offset, - unsigned offset) +indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr) { + struct qreg indirect_offset = ntq_get_src(c, intr->src[0], 0); + uint32_t offset = intr->const_index[0]; struct vc4_compiler_ubo_range *range = NULL; unsigned i; for (i = 0; i < c->num_uniform_ranges; i++) { @@ -138,10 +93,6 @@ indirect_uniform_load(struct vc4_compile *c, }; offset -= range->src_offset; - /* Translate the user's TGSI register index from the TGSI register - * base to a byte offset. - */ - indirect_offset = qir_SHL(c, indirect_offset, qir_uniform_ui(c, 4)); /* Adjust for where we stored the TGSI register base. */ indirect_offset = qir_ADD(c, indirect_offset, @@ -155,24 +106,70 @@ indirect_uniform_load(struct vc4_compile *c, range->size - 4))); qir_TEX_DIRECT(c, indirect_offset, qir_uniform(c, QUNIFORM_UBO_ADDR, 0)); - struct qreg r4 = qir_TEX_RESULT(c); c->num_texture_samples++; - return qir_MOV(c, r4); + return qir_TEX_RESULT(c); } -static struct qreg * -ntq_get_dest(struct vc4_compile *c, nir_dest dest) +nir_ssa_def *vc4_nir_get_state_uniform(struct nir_builder *b, + enum quniform_contents contents) { - assert(!dest.is_ssa); - nir_register *reg = dest.reg.reg; - struct hash_entry *entry = _mesa_hash_table_search(c->def_ht, reg); - assert(reg->num_array_elems == 0); - assert(dest.reg.base_offset == 0); + nir_intrinsic_instr *intr = + nir_intrinsic_instr_create(b->shader, + nir_intrinsic_load_uniform); + intr->const_index[0] = VC4_NIR_STATE_UNIFORM_OFFSET + contents; + intr->num_components = 1; + nir_ssa_dest_init(&intr->instr, &intr->dest, 1, NULL); + nir_builder_instr_insert(b, &intr->instr); + return &intr->dest.ssa; +} - struct qreg *qregs = entry->data; +nir_ssa_def * +vc4_nir_get_swizzled_channel(nir_builder *b, nir_ssa_def **srcs, int swiz) +{ + switch (swiz) { + default: + case UTIL_FORMAT_SWIZZLE_NONE: + fprintf(stderr, "warning: unknown swizzle\n"); + /* FALLTHROUGH */ + case UTIL_FORMAT_SWIZZLE_0: + return nir_imm_float(b, 0.0); + case UTIL_FORMAT_SWIZZLE_1: + return nir_imm_float(b, 1.0); + case UTIL_FORMAT_SWIZZLE_X: + case UTIL_FORMAT_SWIZZLE_Y: + case UTIL_FORMAT_SWIZZLE_Z: + case UTIL_FORMAT_SWIZZLE_W: + return srcs[swiz]; + } +} + +static struct qreg * +ntq_init_ssa_def(struct vc4_compile *c, nir_ssa_def *def) +{ + struct qreg *qregs = ralloc_array(c->def_ht, struct qreg, + def->num_components); + _mesa_hash_table_insert(c->def_ht, def, qregs); return qregs; } +static struct qreg * +ntq_get_dest(struct vc4_compile *c, nir_dest *dest) +{ + if (dest->is_ssa) { + struct qreg *qregs = ntq_init_ssa_def(c, &dest->ssa); + for (int 
i = 0; i < dest->ssa.num_components; i++) + qregs[i] = c->undef; + return qregs; + } else { + nir_register *reg = dest->reg.reg; + assert(dest->reg.base_offset == 0); + assert(reg->num_array_elems == 0); + struct hash_entry *entry = + _mesa_hash_table_search(c->def_ht, reg); + return entry->data; + } +} + static struct qreg ntq_get_src(struct vc4_compile *c, nir_src src, int i) { @@ -282,22 +279,6 @@ qir_srgb_decode(struct vc4_compile *c, struct qreg srgb) } static struct qreg -qir_srgb_encode(struct vc4_compile *c, struct qreg linear) -{ - struct qreg low = qir_FMUL(c, linear, qir_uniform_f(c, 12.92)); - struct qreg high = qir_FSUB(c, - qir_FMUL(c, - qir_uniform_f(c, 1.055), - qir_POW(c, - linear, - qir_uniform_f(c, 0.41666))), - qir_uniform_f(c, 0.055)); - - qir_SF(c, qir_FSUB(c, linear, qir_uniform_f(c, 0.0031308))); - return qir_SEL_X_Y_NS(c, low, high); -} - -static struct qreg ntq_umul(struct vc4_compile *c, struct qreg src0, struct qreg src1) { struct qreg src0_hi = qir_SHR(c, src0, @@ -410,13 +391,13 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr) qir_TEX_S(c, s, texture_u[next_texture_u++]); c->num_texture_samples++; - struct qreg r4 = qir_TEX_RESULT(c); + struct qreg tex = qir_TEX_RESULT(c); enum pipe_format format = c->key->tex[unit].format; struct qreg unpacked[4]; if (util_format_is_depth_or_stencil(format)) { - struct qreg depthf = qir_ITOF(c, qir_SHR(c, r4, + struct qreg depthf = qir_ITOF(c, qir_SHR(c, tex, qir_uniform_ui(c, 8))); struct qreg normalized = qir_FMUL(c, depthf, qir_uniform_f(c, 1.0f/0xffffff)); @@ -468,7 +449,7 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr) unpacked[i] = depth_output; } else { for (int i = 0; i < 4; i++) - unpacked[i] = qir_R4_UNPACK(c, r4, i); + unpacked[i] = qir_UNPACK_8_F(c, tex, i); } const uint8_t *format_swiz = vc4_get_format_swizzle(format); @@ -484,7 +465,7 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr) texture_output[i]); } - struct qreg *dest = ntq_get_dest(c, instr->dest); + struct qreg *dest = ntq_get_dest(c, &instr->dest); for (int i = 0; i < 4; i++) { dest[i] = get_swizzled_channel(c, texture_output, c->key->tex[unit].swizzle[i]); @@ -558,7 +539,7 @@ ntq_fsin(struct vc4_compile *c, struct qreg src) struct qreg scaled_x = qir_FMUL(c, src, - qir_uniform_f(c, 1.0f / (M_PI * 2.0f))); + qir_uniform_f(c, 1.0 / (M_PI * 2.0))); struct qreg x = qir_FADD(c, ntq_ffract(c, scaled_x), @@ -756,26 +737,6 @@ emit_fragcoord_input(struct vc4_compile *c, int attr) c->inputs[attr * 4 + 3] = qir_RCP(c, qir_FRAG_W(c)); } -static void -emit_point_coord_input(struct vc4_compile *c, int attr) -{ - if (c->point_x.file == QFILE_NULL) { - c->point_x = qir_uniform_f(c, 0.0); - c->point_y = qir_uniform_f(c, 0.0); - } - - c->inputs[attr * 4 + 0] = c->point_x; - if (c->fs_key->point_coord_upper_left) { - c->inputs[attr * 4 + 1] = qir_FSUB(c, - qir_uniform_f(c, 1.0), - c->point_y); - } else { - c->inputs[attr * 4 + 1] = c->point_y; - } - c->inputs[attr * 4 + 2] = qir_uniform_f(c, 0.0); - c->inputs[attr * 4 + 3] = qir_uniform_f(c, 1.0); -} - static struct qreg emit_fragment_varying(struct vc4_compile *c, uint8_t semantic, uint8_t index, uint8_t swizzle) @@ -817,19 +778,6 @@ emit_fragment_input(struct vc4_compile *c, int attr, } static void -emit_face_input(struct vc4_compile *c, int attr) -{ - c->inputs[attr * 4 + 0] = qir_FSUB(c, - qir_uniform_f(c, 1.0), - qir_FMUL(c, - qir_ITOF(c, qir_FRAG_REV_FLAG(c)), - qir_uniform_f(c, 2.0))); - c->inputs[attr * 4 + 1] = qir_uniform_f(c, 0.0); - c->inputs[attr * 4 + 2] = 
qir_uniform_f(c, 0.0); - c->inputs[attr * 4 + 3] = qir_uniform_f(c, 1.0); -} - -static void add_output(struct vc4_compile *c, uint32_t decl_offset, uint8_t semantic_name, @@ -884,12 +832,38 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr) for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) srcs[i] = ntq_get_src(c, instr->src[i].src, instr->src[i].swizzle[0]); - struct qreg *dest = ntq_get_dest(c, instr->dest.dest); + struct qreg *dest = ntq_get_dest(c, &instr->dest.dest); for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) dest[i] = srcs[i]; return; } + if (instr->op == nir_op_pack_unorm_4x8) { + struct qreg result; + for (int i = 0; i < 4; i++) { + struct qreg src = ntq_get_src(c, instr->src[0].src, + instr->src[0].swizzle[i]); + if (i == 0) + result = qir_PACK_8888_F(c, src); + else + result = qir_PACK_8_F(c, result, src, i); + } + struct qreg *dest = ntq_get_dest(c, &instr->dest.dest); + *dest = result; + return; + } + + if (instr->op == nir_op_unpack_unorm_4x8) { + struct qreg src = ntq_get_src(c, instr->src[0].src, + instr->src[0].swizzle[0]); + struct qreg *dest = ntq_get_dest(c, &instr->dest.dest); + for (int i = 0; i < 4; i++) { + if (instr->dest.write_mask & (1 << i)) + dest[i] = qir_UNPACK_8_F(c, src, i); + } + return; + } + /* General case: We can just grab the one used channel per src. */ struct qreg src[nir_op_infos[instr->op].num_inputs]; for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { @@ -898,7 +872,7 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr) /* Pick the channel to store the output in. */ assert(!instr->dest.saturate); - struct qreg *dest = ntq_get_dest(c, instr->dest.dest); + struct qreg *dest = ntq_get_dest(c, &instr->dest.dest); assert(util_is_power_of_two(instr->dest.write_mask)); dest += ffs(instr->dest.write_mask) - 1; @@ -1092,167 +1066,6 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr) } } -static struct qreg -vc4_blend_channel(struct vc4_compile *c, - struct qreg *dst, - struct qreg *src, - struct qreg val, - unsigned factor, - int channel) -{ - switch(factor) { - case PIPE_BLENDFACTOR_ONE: - return val; - case PIPE_BLENDFACTOR_SRC_COLOR: - return qir_FMUL(c, val, src[channel]); - case PIPE_BLENDFACTOR_SRC_ALPHA: - return qir_FMUL(c, val, src[3]); - case PIPE_BLENDFACTOR_DST_ALPHA: - return qir_FMUL(c, val, dst[3]); - case PIPE_BLENDFACTOR_DST_COLOR: - return qir_FMUL(c, val, dst[channel]); - case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: - if (channel != 3) { - return qir_FMUL(c, - val, - qir_FMIN(c, - src[3], - qir_FSUB(c, - qir_uniform_f(c, 1.0), - dst[3]))); - } else { - return val; - } - case PIPE_BLENDFACTOR_CONST_COLOR: - return qir_FMUL(c, val, - qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR, - channel)); - case PIPE_BLENDFACTOR_CONST_ALPHA: - return qir_FMUL(c, val, - qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR, 3)); - case PIPE_BLENDFACTOR_ZERO: - return qir_uniform_f(c, 0.0); - case PIPE_BLENDFACTOR_INV_SRC_COLOR: - return qir_FMUL(c, val, qir_FSUB(c, qir_uniform_f(c, 1.0), - src[channel])); - case PIPE_BLENDFACTOR_INV_SRC_ALPHA: - return qir_FMUL(c, val, qir_FSUB(c, qir_uniform_f(c, 1.0), - src[3])); - case PIPE_BLENDFACTOR_INV_DST_ALPHA: - return qir_FMUL(c, val, qir_FSUB(c, qir_uniform_f(c, 1.0), - dst[3])); - case PIPE_BLENDFACTOR_INV_DST_COLOR: - return qir_FMUL(c, val, qir_FSUB(c, qir_uniform_f(c, 1.0), - dst[channel])); - case PIPE_BLENDFACTOR_INV_CONST_COLOR: - return qir_FMUL(c, val, - qir_FSUB(c, qir_uniform_f(c, 1.0), - qir_uniform(c, - QUNIFORM_BLEND_CONST_COLOR, - channel))); - 
case PIPE_BLENDFACTOR_INV_CONST_ALPHA: - return qir_FMUL(c, val, - qir_FSUB(c, qir_uniform_f(c, 1.0), - qir_uniform(c, - QUNIFORM_BLEND_CONST_COLOR, - 3))); - - default: - case PIPE_BLENDFACTOR_SRC1_COLOR: - case PIPE_BLENDFACTOR_SRC1_ALPHA: - case PIPE_BLENDFACTOR_INV_SRC1_COLOR: - case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: - /* Unsupported. */ - fprintf(stderr, "Unknown blend factor %d\n", factor); - return val; - } -} - -static struct qreg -vc4_blend_func(struct vc4_compile *c, - struct qreg src, struct qreg dst, - unsigned func) -{ - switch (func) { - case PIPE_BLEND_ADD: - return qir_FADD(c, src, dst); - case PIPE_BLEND_SUBTRACT: - return qir_FSUB(c, src, dst); - case PIPE_BLEND_REVERSE_SUBTRACT: - return qir_FSUB(c, dst, src); - case PIPE_BLEND_MIN: - return qir_FMIN(c, src, dst); - case PIPE_BLEND_MAX: - return qir_FMAX(c, src, dst); - - default: - /* Unsupported. */ - fprintf(stderr, "Unknown blend func %d\n", func); - return src; - - } -} - -/** - * Implements fixed function blending in shader code. - * - * VC4 doesn't have any hardware support for blending. Instead, you read the - * current contents of the destination from the tile buffer after having - * waited for the scoreboard (which is handled by vc4_qpu_emit.c), then do - * math using your output color and that destination value, and update the - * output color appropriately. - */ -static void -vc4_blend(struct vc4_compile *c, struct qreg *result, - struct qreg *dst_color, struct qreg *src_color) -{ - struct pipe_rt_blend_state *blend = &c->fs_key->blend; - - if (!blend->blend_enable) { - for (int i = 0; i < 4; i++) - result[i] = src_color[i]; - return; - } - - struct qreg clamped_src[4]; - struct qreg clamped_dst[4]; - for (int i = 0; i < 4; i++) { - clamped_src[i] = qir_SAT(c, src_color[i]); - clamped_dst[i] = qir_SAT(c, dst_color[i]); - } - src_color = clamped_src; - dst_color = clamped_dst; - - struct qreg src_blend[4], dst_blend[4]; - for (int i = 0; i < 3; i++) { - src_blend[i] = vc4_blend_channel(c, - dst_color, src_color, - src_color[i], - blend->rgb_src_factor, i); - dst_blend[i] = vc4_blend_channel(c, - dst_color, src_color, - dst_color[i], - blend->rgb_dst_factor, i); - } - src_blend[3] = vc4_blend_channel(c, - dst_color, src_color, - src_color[3], - blend->alpha_src_factor, 3); - dst_blend[3] = vc4_blend_channel(c, - dst_color, src_color, - dst_color[3], - blend->alpha_dst_factor, 3); - - for (int i = 0; i < 3; i++) { - result[i] = vc4_blend_func(c, - src_blend[i], dst_blend[i], - blend->rgb_func); - } - result[3] = vc4_blend_func(c, - src_blend[3], dst_blend[3], - blend->alpha_func); -} - static void clip_distance_discard(struct vc4_compile *c) { @@ -1276,167 +1089,15 @@ clip_distance_discard(struct vc4_compile *c) } static void -alpha_test_discard(struct vc4_compile *c) -{ - struct qreg src_alpha; - struct qreg alpha_ref = qir_uniform(c, QUNIFORM_ALPHA_REF, 0); - - if (!c->fs_key->alpha_test) - return; - - if (c->output_color_index != -1) - src_alpha = c->outputs[c->output_color_index + 3]; - else - src_alpha = qir_uniform_f(c, 1.0); - - if (c->discard.file == QFILE_NULL) - c->discard = qir_uniform_ui(c, 0); - - switch (c->fs_key->alpha_test_func) { - case PIPE_FUNC_NEVER: - c->discard = qir_uniform_ui(c, ~0); - break; - case PIPE_FUNC_ALWAYS: - break; - case PIPE_FUNC_EQUAL: - qir_SF(c, qir_FSUB(c, src_alpha, alpha_ref)); - c->discard = qir_SEL_X_Y_ZS(c, c->discard, - qir_uniform_ui(c, ~0)); - break; - case PIPE_FUNC_NOTEQUAL: - qir_SF(c, qir_FSUB(c, src_alpha, alpha_ref)); - c->discard = qir_SEL_X_Y_ZC(c, 
c->discard, - qir_uniform_ui(c, ~0)); - break; - case PIPE_FUNC_GREATER: - qir_SF(c, qir_FSUB(c, src_alpha, alpha_ref)); - c->discard = qir_SEL_X_Y_NC(c, c->discard, - qir_uniform_ui(c, ~0)); - break; - case PIPE_FUNC_GEQUAL: - qir_SF(c, qir_FSUB(c, alpha_ref, src_alpha)); - c->discard = qir_SEL_X_Y_NS(c, c->discard, - qir_uniform_ui(c, ~0)); - break; - case PIPE_FUNC_LESS: - qir_SF(c, qir_FSUB(c, src_alpha, alpha_ref)); - c->discard = qir_SEL_X_Y_NS(c, c->discard, - qir_uniform_ui(c, ~0)); - break; - case PIPE_FUNC_LEQUAL: - qir_SF(c, qir_FSUB(c, alpha_ref, src_alpha)); - c->discard = qir_SEL_X_Y_NC(c, c->discard, - qir_uniform_ui(c, ~0)); - break; - } -} - -static struct qreg -vc4_logicop(struct vc4_compile *c, struct qreg src, struct qreg dst) -{ - switch (c->fs_key->logicop_func) { - case PIPE_LOGICOP_CLEAR: - return qir_uniform_f(c, 0.0); - case PIPE_LOGICOP_NOR: - return qir_NOT(c, qir_OR(c, src, dst)); - case PIPE_LOGICOP_AND_INVERTED: - return qir_AND(c, qir_NOT(c, src), dst); - case PIPE_LOGICOP_COPY_INVERTED: - return qir_NOT(c, src); - case PIPE_LOGICOP_AND_REVERSE: - return qir_AND(c, src, qir_NOT(c, dst)); - case PIPE_LOGICOP_INVERT: - return qir_NOT(c, dst); - case PIPE_LOGICOP_XOR: - return qir_XOR(c, src, dst); - case PIPE_LOGICOP_NAND: - return qir_NOT(c, qir_AND(c, src, dst)); - case PIPE_LOGICOP_AND: - return qir_AND(c, src, dst); - case PIPE_LOGICOP_EQUIV: - return qir_NOT(c, qir_XOR(c, src, dst)); - case PIPE_LOGICOP_NOOP: - return dst; - case PIPE_LOGICOP_OR_INVERTED: - return qir_OR(c, qir_NOT(c, src), dst); - case PIPE_LOGICOP_OR_REVERSE: - return qir_OR(c, src, qir_NOT(c, dst)); - case PIPE_LOGICOP_OR: - return qir_OR(c, src, dst); - case PIPE_LOGICOP_SET: - return qir_uniform_ui(c, ~0); - case PIPE_LOGICOP_COPY: - default: - return src; - } -} - -static void emit_frag_end(struct vc4_compile *c) { clip_distance_discard(c); - alpha_test_discard(c); - - enum pipe_format color_format = c->fs_key->color_format; - const uint8_t *format_swiz = vc4_get_format_swizzle(color_format); - struct qreg tlb_read_color[4] = { c->undef, c->undef, c->undef, c->undef }; - struct qreg dst_color[4] = { c->undef, c->undef, c->undef, c->undef }; - struct qreg linear_dst_color[4] = { c->undef, c->undef, c->undef, c->undef }; - struct qreg packed_dst_color = c->undef; - - if (c->fs_key->blend.blend_enable || - c->fs_key->blend.colormask != 0xf || - c->fs_key->logicop_func != PIPE_LOGICOP_COPY) { - struct qreg r4 = qir_TLB_COLOR_READ(c); - for (int i = 0; i < 4; i++) - tlb_read_color[i] = qir_R4_UNPACK(c, r4, i); - for (int i = 0; i < 4; i++) { - dst_color[i] = get_swizzled_channel(c, - tlb_read_color, - format_swiz[i]); - if (util_format_is_srgb(color_format) && i != 3) { - linear_dst_color[i] = - qir_srgb_decode(c, dst_color[i]); - } else { - linear_dst_color[i] = dst_color[i]; - } - } - /* Save the packed value for logic ops. Can't reuse r4 - * because other things might smash it (like sRGB) - */ - packed_dst_color = qir_MOV(c, r4); - } - - struct qreg blend_color[4]; - struct qreg undef_array[4] = { - c->undef, c->undef, c->undef, c->undef - }; - vc4_blend(c, blend_color, linear_dst_color, - (c->output_color_index != -1 ? - c->outputs + c->output_color_index : - undef_array)); - - if (util_format_is_srgb(color_format)) { - for (int i = 0; i < 3; i++) - blend_color[i] = qir_srgb_encode(c, blend_color[i]); - } - - /* Debug: Sometimes you're getting a black output and just want to see - * if the FS is getting executed at all. Spam magenta into the color - * output. 
- */ - if (0) { - blend_color[0] = qir_uniform_f(c, 1.0); - blend_color[1] = qir_uniform_f(c, 0.0); - blend_color[2] = qir_uniform_f(c, 1.0); - blend_color[3] = qir_uniform_f(c, 0.5); - } - - struct qreg swizzled_outputs[4]; - for (int i = 0; i < 4; i++) { - swizzled_outputs[i] = get_swizzled_channel(c, blend_color, - format_swiz[i]); + struct qreg color; + if (c->output_color_index != -1) { + color = c->outputs[c->output_color_index]; + } else { + color = qir_uniform_ui(c, 0); } if (c->discard.file != QFILE_NULL) @@ -1463,47 +1124,7 @@ emit_frag_end(struct vc4_compile *c) qir_TLB_Z_WRITE(c, z); } - struct qreg packed_color = c->undef; - for (int i = 0; i < 4; i++) { - if (swizzled_outputs[i].file == QFILE_NULL) - continue; - if (packed_color.file == QFILE_NULL) { - packed_color = qir_PACK_8888_F(c, swizzled_outputs[i]); - } else { - packed_color = qir_PACK_8_F(c, - packed_color, - swizzled_outputs[i], - i); - } - } - - if (packed_color.file == QFILE_NULL) - packed_color = qir_uniform_ui(c, 0); - - if (c->fs_key->logicop_func != PIPE_LOGICOP_COPY) { - packed_color = vc4_logicop(c, packed_color, packed_dst_color); - } - - /* If the bit isn't set in the color mask, then just return the - * original dst color, instead. - */ - uint32_t colormask = 0xffffffff; - for (int i = 0; i < 4; i++) { - if (format_swiz[i] < 4 && - !(c->fs_key->blend.colormask & (1 << format_swiz[i]))) { - colormask &= ~(0xff << (i * 8)); - } - } - if (colormask != 0xffffffff) { - packed_color = qir_OR(c, - qir_AND(c, packed_color, - qir_uniform_ui(c, colormask)), - qir_AND(c, packed_dst_color, - qir_uniform_ui(c, ~colormask))); - } - - qir_emit(c, qir_inst(QOP_TLB_COLOR_WRITE, c->undef, - packed_color, c->undef)); + qir_TLB_COLOR_WRITE(c, color); } static void @@ -1695,6 +1316,7 @@ vc4_optimize_nir(struct nir_shader *s) progress = nir_opt_peephole_select(s) || progress; progress = nir_opt_algebraic(s) || progress; progress = nir_opt_constant_folding(s) || progress; + progress = nir_opt_undef(s) || progress; } while (progress); } @@ -1736,6 +1358,7 @@ ntq_setup_inputs(struct vc4_compile *c) unsigned loc = var->data.driver_location; assert(array_len == 1); + (void)array_len; resize_qreg_array(c, &c->inputs, &c->inputs_array_size, (loc + 1) * 4); @@ -1743,11 +1366,12 @@ ntq_setup_inputs(struct vc4_compile *c) if (semantic_name == TGSI_SEMANTIC_POSITION) { emit_fragcoord_input(c, loc); } else if (semantic_name == TGSI_SEMANTIC_FACE) { - emit_face_input(c, loc); + c->inputs[loc * 4 + 0] = qir_FRAG_REV_FLAG(c); } else if (semantic_name == TGSI_SEMANTIC_GENERIC && (c->fs_key->point_sprite_mask & (1 << semantic_index))) { - emit_point_coord_input(c, loc); + c->inputs[loc * 4 + 0] = c->point_x; + c->inputs[loc * 4 + 1] = c->point_y; } else { emit_fragment_input(c, loc, semantic_name, @@ -1770,6 +1394,13 @@ ntq_setup_outputs(struct vc4_compile *c) unsigned loc = var->data.driver_location * 4; assert(array_len == 1); + (void)array_len; + + /* NIR hack to pass through + * TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS */ + if (semantic_name == TGSI_SEMANTIC_COLOR && + semantic_index == -1) + semantic_index = 0; for (int i = 0; i < 4; i++) { add_output(c, @@ -1834,8 +1465,7 @@ ntq_setup_registers(struct vc4_compile *c, struct exec_list *list) static void ntq_emit_load_const(struct vc4_compile *c, nir_load_const_instr *instr) { - struct qreg *qregs = ralloc_array(c->def_ht, struct qreg, - instr->def.num_components); + struct qreg *qregs = ntq_init_ssa_def(c, &instr->def); for (int i = 0; i < instr->def.num_components; i++) qregs[i] = 
qir_uniform_ui(c, instr->value.u[i]); @@ -1843,47 +1473,59 @@ } static void +ntq_emit_ssa_undef(struct vc4_compile *c, nir_ssa_undef_instr *instr) +{ + struct qreg *qregs = ntq_init_ssa_def(c, &instr->def); + + /* QIR needs there to be *some* value, so pick 0 (same as for + * ntq_setup_registers()). + */ + for (int i = 0; i < instr->def.num_components; i++) + qregs[i] = qir_uniform_ui(c, 0); +} + +static void ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr) { const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic]; struct qreg *dest = NULL; if (info->has_dest) { - dest = ntq_get_dest(c, instr->dest); + dest = ntq_get_dest(c, &instr->dest); } switch (instr->intrinsic) { case nir_intrinsic_load_uniform: - for (int i = 0; i < instr->num_components; i++) { - dest[i] = qir_uniform(c, QUNIFORM_UNIFORM, - instr->const_index[0] * 4 + i); + assert(instr->num_components == 1); + if (instr->const_index[0] < VC4_NIR_STATE_UNIFORM_OFFSET) { + *dest = qir_uniform(c, QUNIFORM_UNIFORM, + instr->const_index[0]); + } else { + *dest = qir_uniform(c, instr->const_index[0] - + VC4_NIR_STATE_UNIFORM_OFFSET, + 0); } break; case nir_intrinsic_load_uniform_indirect: - for (int i = 0; i < instr->num_components; i++) { - dest[i] = indirect_uniform_load(c, - ntq_get_src(c, instr->src[0], 0), - (instr->const_index[0] * - 4 + i) * sizeof(float)); - } + *dest = indirect_uniform_load(c, instr); break; case nir_intrinsic_load_input: - for (int i = 0; i < instr->num_components; i++) - dest[i] = c->inputs[instr->const_index[0] * 4 + i]; - + assert(instr->num_components == 1); + if (instr->const_index[0] == VC4_NIR_TLB_COLOR_READ_INPUT) { + *dest = qir_TLB_COLOR_READ(c); + } else { + *dest = c->inputs[instr->const_index[0]]; + } break; case nir_intrinsic_store_output: - for (int i = 0; i < instr->num_components; i++) { - c->outputs[instr->const_index[0] * 4 + i] = - qir_MOV(c, ntq_get_src(c, instr->src[0], i)); - } - c->num_outputs = MAX2(c->num_outputs, - instr->const_index[0] * 4 + - instr->num_components + 1); + assert(instr->num_components == 1); + c->outputs[instr->const_index[0]] = + qir_MOV(c, ntq_get_src(c, instr->src[0], 0)); + c->num_outputs = MAX2(c->num_outputs, instr->const_index[0] + 1); break; case nir_intrinsic_discard: @@ -1927,6 +1569,10 @@ ntq_emit_instr(struct vc4_compile *c, nir_instr *instr) ntq_emit_load_const(c, nir_instr_as_load_const(instr)); break; + case nir_instr_type_ssa_undef: + ntq_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr)); + break; + case nir_instr_type_tex: ntq_emit_tex(c, nir_instr_as_tex(instr)); break; @@ -2084,13 +1730,17 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage, c->s = tgsi_to_nir(tokens, &nir_options); nir_opt_global_to_local(c->s); nir_convert_to_ssa(c->s); + if (stage == QSTAGE_FRAG) + vc4_nir_lower_blend(c); + vc4_nir_lower_io(c); nir_lower_idiv(c->s); + nir_lower_load_const_to_scalar(c->s); vc4_optimize_nir(c->s); nir_remove_dead_variables(c->s); - nir_convert_from_ssa(c->s); + nir_convert_from_ssa(c->s, true); if (vc4_debug & VC4_DEBUG_SHADERDB) { fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d NIR instructions\n", @@ -2187,6 +1837,8 @@ copy_uniform_state_to_shader(struct vc4_compiled_shader *shader, memcpy(uinfo->contents, c->uniform_contents, count * sizeof(*uinfo->contents)); uinfo->num_texture_samples = c->num_texture_samples; + + vc4_set_shader_uniform_dirty_flags(shader); } static struct vc4_compiled_shader * @@ -2259,9 +1911,8 @@
vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage, } copy_uniform_state_to_shader(shader, c); - shader->bo = vc4_bo_alloc_mem(vc4->screen, c->qpu_insts, - c->qpu_inst_count * sizeof(uint64_t), - "code"); + shader->bo = vc4_bo_alloc_shader(vc4->screen, c->qpu_insts, + c->qpu_inst_count * sizeof(uint64_t)); /* Copy the compiler UBO range state to the compiled shader, dropping * out arrays that were never referenced by an indirect load. @@ -2288,10 +1939,12 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage, } } if (shader->ubo_size) { - fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d UBO uniforms\n", - qir_get_stage_name(c->stage), - c->program_id, c->variant_id, - shader->ubo_size / 4); + if (vc4_debug & VC4_DEBUG_SHADERDB) { + fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d UBO uniforms\n", + qir_get_stage_name(c->stage), + c->program_id, c->variant_id, + shader->ubo_size / 4); + } } qir_compile_destroy(c); @@ -2421,9 +2074,20 @@ vc4_update_compiled_vs(struct vc4_context *vc4, uint8_t prim_mode) (prim_mode == PIPE_PRIM_POINTS && vc4->rasterizer->base.point_size_per_vertex); - vc4->prog.vs = vc4_get_compiled_shader(vc4, QSTAGE_VERT, &key->base); + struct vc4_compiled_shader *vs = + vc4_get_compiled_shader(vc4, QSTAGE_VERT, &key->base); + if (vs != vc4->prog.vs) { + vc4->prog.vs = vs; + vc4->dirty |= VC4_DIRTY_COMPILED_VS; + } + key->is_coord = true; - vc4->prog.cs = vc4_get_compiled_shader(vc4, QSTAGE_COORD, &key->base); + struct vc4_compiled_shader *cs = + vc4_get_compiled_shader(vc4, QSTAGE_COORD, &key->base); + if (cs != vc4->prog.cs) { + vc4->prog.cs = cs; + vc4->dirty |= VC4_DIRTY_COMPILED_CS; + } } void @@ -2490,305 +2154,6 @@ vc4_shader_state_delete(struct pipe_context *pctx, void *hwcso) free(so); } -static uint32_t translate_wrap(uint32_t p_wrap, bool using_nearest) -{ - switch (p_wrap) { - case PIPE_TEX_WRAP_REPEAT: - return 0; - case PIPE_TEX_WRAP_CLAMP_TO_EDGE: - return 1; - case PIPE_TEX_WRAP_MIRROR_REPEAT: - return 2; - case PIPE_TEX_WRAP_CLAMP_TO_BORDER: - return 3; - case PIPE_TEX_WRAP_CLAMP: - return (using_nearest ? 
1 : 3); - default: - fprintf(stderr, "Unknown wrap mode %d\n", p_wrap); - assert(!"not reached"); - return 0; - } -} - -static void -write_texture_p0(struct vc4_context *vc4, - struct vc4_texture_stateobj *texstate, - uint32_t unit) -{ - struct pipe_sampler_view *texture = texstate->textures[unit]; - struct vc4_resource *rsc = vc4_resource(texture->texture); - - cl_reloc(vc4, &vc4->uniforms, rsc->bo, - VC4_SET_FIELD(rsc->slices[0].offset >> 12, VC4_TEX_P0_OFFSET) | - VC4_SET_FIELD(texture->u.tex.last_level - - texture->u.tex.first_level, VC4_TEX_P0_MIPLVLS) | - VC4_SET_FIELD(texture->target == PIPE_TEXTURE_CUBE, - VC4_TEX_P0_CMMODE) | - VC4_SET_FIELD(rsc->vc4_format & 15, VC4_TEX_P0_TYPE)); -} - -static void -write_texture_p1(struct vc4_context *vc4, - struct vc4_texture_stateobj *texstate, - uint32_t unit) -{ - struct pipe_sampler_view *texture = texstate->textures[unit]; - struct vc4_resource *rsc = vc4_resource(texture->texture); - struct pipe_sampler_state *sampler = texstate->samplers[unit]; - static const uint8_t minfilter_map[6] = { - VC4_TEX_P1_MINFILT_NEAR_MIP_NEAR, - VC4_TEX_P1_MINFILT_LIN_MIP_NEAR, - VC4_TEX_P1_MINFILT_NEAR_MIP_LIN, - VC4_TEX_P1_MINFILT_LIN_MIP_LIN, - VC4_TEX_P1_MINFILT_NEAREST, - VC4_TEX_P1_MINFILT_LINEAR, - }; - static const uint32_t magfilter_map[] = { - [PIPE_TEX_FILTER_NEAREST] = VC4_TEX_P1_MAGFILT_NEAREST, - [PIPE_TEX_FILTER_LINEAR] = VC4_TEX_P1_MAGFILT_LINEAR, - }; - - bool either_nearest = - (sampler->mag_img_filter == PIPE_TEX_MIPFILTER_NEAREST || - sampler->min_img_filter == PIPE_TEX_MIPFILTER_NEAREST); - - cl_aligned_u32(&vc4->uniforms, - VC4_SET_FIELD(rsc->vc4_format >> 4, VC4_TEX_P1_TYPE4) | - VC4_SET_FIELD(texture->texture->height0 & 2047, - VC4_TEX_P1_HEIGHT) | - VC4_SET_FIELD(texture->texture->width0 & 2047, - VC4_TEX_P1_WIDTH) | - VC4_SET_FIELD(magfilter_map[sampler->mag_img_filter], - VC4_TEX_P1_MAGFILT) | - VC4_SET_FIELD(minfilter_map[sampler->min_mip_filter * 2 + - sampler->min_img_filter], - VC4_TEX_P1_MINFILT) | - VC4_SET_FIELD(translate_wrap(sampler->wrap_s, either_nearest), - VC4_TEX_P1_WRAP_S) | - VC4_SET_FIELD(translate_wrap(sampler->wrap_t, either_nearest), - VC4_TEX_P1_WRAP_T)); -} - -static void -write_texture_p2(struct vc4_context *vc4, - struct vc4_texture_stateobj *texstate, - uint32_t data) -{ - uint32_t unit = data & 0xffff; - struct pipe_sampler_view *texture = texstate->textures[unit]; - struct vc4_resource *rsc = vc4_resource(texture->texture); - - cl_aligned_u32(&vc4->uniforms, - VC4_SET_FIELD(VC4_TEX_P2_PTYPE_CUBE_MAP_STRIDE, - VC4_TEX_P2_PTYPE) | - VC4_SET_FIELD(rsc->cube_map_stride >> 12, VC4_TEX_P2_CMST) | - VC4_SET_FIELD((data >> 16) & 1, VC4_TEX_P2_BSLOD)); -} - - -#define SWIZ(x,y,z,w) { \ - UTIL_FORMAT_SWIZZLE_##x, \ - UTIL_FORMAT_SWIZZLE_##y, \ - UTIL_FORMAT_SWIZZLE_##z, \ - UTIL_FORMAT_SWIZZLE_##w \ -} - -static void -write_texture_border_color(struct vc4_context *vc4, - struct vc4_texture_stateobj *texstate, - uint32_t unit) -{ - struct pipe_sampler_state *sampler = texstate->samplers[unit]; - struct pipe_sampler_view *texture = texstate->textures[unit]; - struct vc4_resource *rsc = vc4_resource(texture->texture); - union util_color uc; - - const struct util_format_description *tex_format_desc = - util_format_description(texture->format); - - float border_color[4]; - for (int i = 0; i < 4; i++) - border_color[i] = sampler->border_color.f[i]; - if (util_format_is_srgb(texture->format)) { - for (int i = 0; i < 3; i++) - border_color[i] = - util_format_linear_to_srgb_float(border_color[i]); - } - - /* Turn the border 
color into the layout of channels that it would - * have when stored as texture contents. - */ - float storage_color[4]; - util_format_unswizzle_4f(storage_color, - border_color, - tex_format_desc->swizzle); - - /* Now, pack so that when the vc4_format-sampled texture contents are - * replaced with our border color, the vc4_get_format_swizzle() - * swizzling will get the right channels. - */ - if (util_format_is_depth_or_stencil(texture->format)) { - uc.ui[0] = util_pack_z(PIPE_FORMAT_Z24X8_UNORM, - sampler->border_color.f[0]) << 8; - } else { - switch (rsc->vc4_format) { - default: - case VC4_TEXTURE_TYPE_RGBA8888: - util_pack_color(storage_color, - PIPE_FORMAT_R8G8B8A8_UNORM, &uc); - break; - case VC4_TEXTURE_TYPE_RGBA4444: - util_pack_color(storage_color, - PIPE_FORMAT_A8B8G8R8_UNORM, &uc); - break; - case VC4_TEXTURE_TYPE_RGB565: - util_pack_color(storage_color, - PIPE_FORMAT_B8G8R8A8_UNORM, &uc); - break; - case VC4_TEXTURE_TYPE_ALPHA: - uc.ui[0] = float_to_ubyte(storage_color[0]) << 24; - break; - case VC4_TEXTURE_TYPE_LUMALPHA: - uc.ui[0] = ((float_to_ubyte(storage_color[1]) << 24) | - (float_to_ubyte(storage_color[0]) << 0)); - break; - } - } - - cl_aligned_u32(&vc4->uniforms, uc.ui[0]); -} - -static uint32_t -get_texrect_scale(struct vc4_texture_stateobj *texstate, - enum quniform_contents contents, - uint32_t data) -{ - struct pipe_sampler_view *texture = texstate->textures[data]; - uint32_t dim; - - if (contents == QUNIFORM_TEXRECT_SCALE_X) - dim = texture->texture->width0; - else - dim = texture->texture->height0; - - return fui(1.0f / dim); -} - -static struct vc4_bo * -vc4_upload_ubo(struct vc4_context *vc4, struct vc4_compiled_shader *shader, - const uint32_t *gallium_uniforms) -{ - if (!shader->ubo_size) - return NULL; - - struct vc4_bo *ubo = vc4_bo_alloc(vc4->screen, shader->ubo_size, "ubo"); - uint32_t *data = vc4_bo_map(ubo); - for (uint32_t i = 0; i < shader->num_ubo_ranges; i++) { - memcpy(data + shader->ubo_ranges[i].dst_offset, - gallium_uniforms + shader->ubo_ranges[i].src_offset, - shader->ubo_ranges[i].size); - } - - return ubo; -} - -void -vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader, - struct vc4_constbuf_stateobj *cb, - struct vc4_texture_stateobj *texstate) -{ - struct vc4_shader_uniform_info *uinfo = &shader->uniforms; - const uint32_t *gallium_uniforms = cb->cb[0].user_buffer; - struct vc4_bo *ubo = vc4_upload_ubo(vc4, shader, gallium_uniforms); - - cl_ensure_space(&vc4->uniforms, (uinfo->count + - uinfo->num_texture_samples) * 4); - - cl_start_shader_reloc(&vc4->uniforms, uinfo->num_texture_samples); - - for (int i = 0; i < uinfo->count; i++) { - - switch (uinfo->contents[i]) { - case QUNIFORM_CONSTANT: - cl_aligned_u32(&vc4->uniforms, uinfo->data[i]); - break; - case QUNIFORM_UNIFORM: - cl_aligned_u32(&vc4->uniforms, - gallium_uniforms[uinfo->data[i]]); - break; - case QUNIFORM_VIEWPORT_X_SCALE: - cl_aligned_f(&vc4->uniforms, vc4->viewport.scale[0] * 16.0f); - break; - case QUNIFORM_VIEWPORT_Y_SCALE: - cl_aligned_f(&vc4->uniforms, vc4->viewport.scale[1] * 16.0f); - break; - - case QUNIFORM_VIEWPORT_Z_OFFSET: - cl_aligned_f(&vc4->uniforms, vc4->viewport.translate[2]); - break; - case QUNIFORM_VIEWPORT_Z_SCALE: - cl_aligned_f(&vc4->uniforms, vc4->viewport.scale[2]); - break; - - case QUNIFORM_USER_CLIP_PLANE: - cl_aligned_f(&vc4->uniforms, - vc4->clip.ucp[uinfo->data[i] / 4][uinfo->data[i] % 4]); - break; - - case QUNIFORM_TEXTURE_CONFIG_P0: - write_texture_p0(vc4, texstate, uinfo->data[i]); - break; - - case 
QUNIFORM_TEXTURE_CONFIG_P1: - write_texture_p1(vc4, texstate, uinfo->data[i]); - break; - - case QUNIFORM_TEXTURE_CONFIG_P2: - write_texture_p2(vc4, texstate, uinfo->data[i]); - break; - - case QUNIFORM_UBO_ADDR: - cl_aligned_reloc(vc4, &vc4->uniforms, ubo, 0); - break; - - case QUNIFORM_TEXTURE_BORDER_COLOR: - write_texture_border_color(vc4, texstate, uinfo->data[i]); - break; - - case QUNIFORM_TEXRECT_SCALE_X: - case QUNIFORM_TEXRECT_SCALE_Y: - cl_aligned_u32(&vc4->uniforms, - get_texrect_scale(texstate, - uinfo->contents[i], - uinfo->data[i])); - break; - - case QUNIFORM_BLEND_CONST_COLOR: - cl_aligned_f(&vc4->uniforms, - CLAMP(vc4->blend_color.color[uinfo->data[i]], 0, 1)); - break; - - case QUNIFORM_STENCIL: - cl_aligned_u32(&vc4->uniforms, - vc4->zsa->stencil_uniforms[uinfo->data[i]] | - (uinfo->data[i] <= 1 ? - (vc4->stencil_ref.ref_value[uinfo->data[i]] << 8) : - 0)); - break; - - case QUNIFORM_ALPHA_REF: - cl_aligned_f(&vc4->uniforms, - vc4->zsa->base.alpha.ref_value); - break; - } -#if 0 - uint32_t written_val = *(uint32_t *)(vc4->uniforms.next - 4); - fprintf(stderr, "%p: %d / 0x%08x (%f)\n", - shader, i, written_val, uif(written_val)); -#endif - } -} - static void vc4_fp_state_bind(struct pipe_context *pctx, void *hwcso) { diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c index 1c96ef4795f..254140a72f5 100644 --- a/src/gallium/drivers/vc4/vc4_qir.c +++ b/src/gallium/drivers/vc4/vc4_qir.c @@ -96,10 +96,6 @@ static const struct qir_op_info qir_op_info[] = { [QOP_TEX_B] = { "tex_b", 0, 2 }, [QOP_TEX_DIRECT] = { "tex_direct", 0, 2 }, [QOP_TEX_RESULT] = { "tex_result", 1, 0, true }, - [QOP_R4_UNPACK_A] = { "r4_unpack_a", 1, 1 }, - [QOP_R4_UNPACK_B] = { "r4_unpack_b", 1, 1 }, - [QOP_R4_UNPACK_C] = { "r4_unpack_c", 1, 1 }, - [QOP_R4_UNPACK_D] = { "r4_unpack_d", 1, 1 }, [QOP_UNPACK_8A_F] = { "unpack_8a_f", 1, 1 }, [QOP_UNPACK_8B_F] = { "unpack_8b_f", 1, 1 }, [QOP_UNPACK_8C_F] = { "unpack_8c_f", 1, 1 }, @@ -234,20 +230,6 @@ qir_writes_r4(struct qinst *inst) } } -bool -qir_reads_r4(struct qinst *inst) -{ - switch (inst->op) { - case QOP_R4_UNPACK_A: - case QOP_R4_UNPACK_B: - case QOP_R4_UNPACK_C: - case QOP_R4_UNPACK_D: - return true; - default: - return false; - } -} - static void qir_print_reg(struct vc4_compile *c, struct qreg reg, bool write) { diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h index 732cfd0b306..cade795c12a 100644 --- a/src/gallium/drivers/vc4/vc4_qir.h +++ b/src/gallium/drivers/vc4/vc4_qir.h @@ -36,6 +36,11 @@ #include "util/list.h" #include "util/u_math.h" +#include "vc4_screen.h" +#include "pipe/p_state.h" + +struct nir_builder; + enum qfile { QFILE_NULL, QFILE_TEMP, @@ -155,10 +160,6 @@ enum qop { * the destination */ QOP_TEX_RESULT, - QOP_R4_UNPACK_A, - QOP_R4_UNPACK_B, - QOP_R4_UNPACK_C, - QOP_R4_UNPACK_D }; struct queued_qpu_inst { @@ -243,7 +244,11 @@ enum quniform_contents { QUNIFORM_TEXTURE_BORDER_COLOR, - QUNIFORM_BLEND_CONST_COLOR, + QUNIFORM_BLEND_CONST_COLOR_X, + QUNIFORM_BLEND_CONST_COLOR_Y, + QUNIFORM_BLEND_CONST_COLOR_Z, + QUNIFORM_BLEND_CONST_COLOR_W, + QUNIFORM_STENCIL, QUNIFORM_ALPHA_REF, @@ -280,6 +285,52 @@ struct vc4_compiler_ubo_range { bool used; }; +struct vc4_key { + struct vc4_uncompiled_shader *shader_state; + struct { + enum pipe_format format; + unsigned compare_mode:1; + unsigned compare_func:3; + unsigned wrap_s:3; + unsigned wrap_t:3; + uint8_t swizzle[4]; + } tex[VC4_MAX_TEXTURE_SAMPLERS]; + uint8_t ucp_enables; +}; + +struct vc4_fs_key { + struct vc4_key base; + enum 
pipe_format color_format; + bool depth_enabled; + bool stencil_enabled; + bool stencil_twoside; + bool stencil_full_writemasks; + bool is_points; + bool is_lines; + bool alpha_test; + bool point_coord_upper_left; + bool light_twoside; + uint8_t alpha_test_func; + uint8_t logicop_func; + uint32_t point_sprite_mask; + + struct pipe_rt_blend_state blend; +}; + +struct vc4_vs_key { + struct vc4_key base; + + /** + * This is a proxy for the array of FS input semantics, which is + * larger than we would want to put in the key. + */ + uint64_t compiled_fs_id; + + enum pipe_format attr_formats[8]; + bool is_coord; + bool per_vertex_point_size; +}; + struct vc4_compile { struct vc4_context *vc4; nir_shader *s; @@ -369,6 +420,16 @@ struct vc4_compile { uint32_t variant_id; }; +/* Special nir_load_input intrinsic index for loading the current TLB + * destination color. + */ +#define VC4_NIR_TLB_COLOR_READ_INPUT 2000000000 + +/* Special offset for nir_load_uniform values to get a QUNIFORM_* + * state-dependent value. + */ +#define VC4_NIR_STATE_UNIFORM_OFFSET 2000000000 + struct vc4_compile *qir_compile_init(void); void qir_compile_destroy(struct vc4_compile *c); struct qinst *qir_inst(enum qop op, struct qreg dst, @@ -393,7 +454,6 @@ bool qir_is_multi_instruction(struct qinst *inst); bool qir_is_tex(struct qinst *inst); bool qir_depends_on_flags(struct qinst *inst); bool qir_writes_r4(struct qinst *inst); -bool qir_reads_r4(struct qinst *inst); bool qir_src_needs_a_file(struct qinst *inst); struct qreg qir_follow_movs(struct vc4_compile *c, struct qreg reg); @@ -409,6 +469,12 @@ bool qir_opt_cse(struct vc4_compile *c); bool qir_opt_dead_code(struct vc4_compile *c); bool qir_opt_small_immediates(struct vc4_compile *c); bool qir_opt_vpm_writes(struct vc4_compile *c); +void vc4_nir_lower_blend(struct vc4_compile *c); +void vc4_nir_lower_io(struct vc4_compile *c); +nir_ssa_def *vc4_nir_get_state_uniform(struct nir_builder *b, + enum quniform_contents contents); +nir_ssa_def *vc4_nir_get_swizzled_channel(struct nir_builder *b, + nir_ssa_def **srcs, int swiz); void qir_lower_uniforms(struct vc4_compile *c); void qpu_schedule_instructions(struct vc4_compile *c); @@ -523,27 +589,12 @@ QIR_ALU0(FRAG_W) QIR_ALU0(FRAG_REV_FLAG) QIR_ALU0(TEX_RESULT) QIR_ALU0(TLB_COLOR_READ) +QIR_NODST_1(TLB_COLOR_WRITE) QIR_NODST_1(TLB_Z_WRITE) QIR_NODST_1(TLB_DISCARD_SETUP) QIR_NODST_1(TLB_STENCIL_SETUP) static inline struct qreg -qir_R4_UNPACK(struct vc4_compile *c, struct qreg r4, int i) -{ - struct qreg t = qir_get_temp(c); - qir_emit(c, qir_inst(QOP_R4_UNPACK_A + i, t, r4, c->undef)); - return t; -} - -static inline struct qreg -qir_SEL_X_0_COND(struct vc4_compile *c, int i) -{ - struct qreg t = qir_get_temp(c); - qir_emit(c, qir_inst(QOP_R4_UNPACK_A + i, t, c->undef, c->undef)); - return t; -} - -static inline struct qreg qir_UNPACK_8_F(struct vc4_compile *c, struct qreg src, int i) { struct qreg t = qir_get_temp(c); diff --git a/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c b/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c index 910c89dca79..f087c3b81b5 100644 --- a/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c +++ b/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c @@ -52,7 +52,7 @@ static void add_uniform(struct hash_table *ht, struct qreg reg) { struct hash_entry *entry; - void *key = (void *)(uintptr_t)reg.index; + void *key = (void *)(uintptr_t)(reg.index + 1); entry = _mesa_hash_table_search(ht, key); if (entry) { @@ -66,7 +66,7 @@ static void remove_uniform(struct hash_table *ht, struct qreg reg) { struct 
hash_entry *entry; - void *key = (void *)(uintptr_t)reg.index; + void *key = (void *)(uintptr_t)(reg.index + 1); entry = _mesa_hash_table_search(ht, key); assert(entry); @@ -122,7 +122,7 @@ qir_lower_uniforms(struct vc4_compile *c) struct hash_entry *entry; hash_table_foreach(ht, entry) { uint32_t count = (uintptr_t)entry->data; - uint32_t index = (uintptr_t)entry->key; + uint32_t index = (uintptr_t)entry->key - 1; if (count > max_count) { max_count = count; max_index = index; diff --git a/src/gallium/drivers/vc4/vc4_qpu.h b/src/gallium/drivers/vc4/vc4_qpu.h index c9ab6344589..fbb90ba12a0 100644 --- a/src/gallium/drivers/vc4/vc4_qpu.h +++ b/src/gallium/drivers/vc4/vc4_qpu.h @@ -122,23 +122,23 @@ static inline struct qpu_reg qpu_r3(void) { return qpu_rn(3); } static inline struct qpu_reg qpu_r4(void) { return qpu_rn(4); } static inline struct qpu_reg qpu_r5(void) { return qpu_rn(5); } -uint64_t qpu_NOP(void); -uint64_t qpu_a_MOV(struct qpu_reg dst, struct qpu_reg src); -uint64_t qpu_m_MOV(struct qpu_reg dst, struct qpu_reg src); +uint64_t qpu_NOP(void) ATTRIBUTE_CONST; +uint64_t qpu_a_MOV(struct qpu_reg dst, struct qpu_reg src) ATTRIBUTE_CONST; +uint64_t qpu_m_MOV(struct qpu_reg dst, struct qpu_reg src) ATTRIBUTE_CONST; uint64_t qpu_a_alu2(enum qpu_op_add op, struct qpu_reg dst, - struct qpu_reg src0, struct qpu_reg src1); + struct qpu_reg src0, struct qpu_reg src1) ATTRIBUTE_CONST; uint64_t qpu_m_alu2(enum qpu_op_mul op, struct qpu_reg dst, - struct qpu_reg src0, struct qpu_reg src1); -uint64_t qpu_merge_inst(uint64_t a, uint64_t b); -uint64_t qpu_load_imm_ui(struct qpu_reg dst, uint32_t val); -uint64_t qpu_set_sig(uint64_t inst, uint32_t sig); -uint64_t qpu_set_cond_add(uint64_t inst, uint32_t cond); -uint64_t qpu_set_cond_mul(uint64_t inst, uint32_t cond); -uint32_t qpu_encode_small_immediate(uint32_t i); - -bool qpu_waddr_is_tlb(uint32_t waddr); -bool qpu_inst_is_tlb(uint64_t inst); -int qpu_num_sf_accesses(uint64_t inst); + struct qpu_reg src0, struct qpu_reg src1) ATTRIBUTE_CONST; +uint64_t qpu_merge_inst(uint64_t a, uint64_t b) ATTRIBUTE_CONST; +uint64_t qpu_load_imm_ui(struct qpu_reg dst, uint32_t val) ATTRIBUTE_CONST; +uint64_t qpu_set_sig(uint64_t inst, uint32_t sig) ATTRIBUTE_CONST; +uint64_t qpu_set_cond_add(uint64_t inst, uint32_t cond) ATTRIBUTE_CONST; +uint64_t qpu_set_cond_mul(uint64_t inst, uint32_t cond) ATTRIBUTE_CONST; +uint32_t qpu_encode_small_immediate(uint32_t i) ATTRIBUTE_CONST; + +bool qpu_waddr_is_tlb(uint32_t waddr) ATTRIBUTE_CONST; +bool qpu_inst_is_tlb(uint64_t inst) ATTRIBUTE_CONST; +int qpu_num_sf_accesses(uint64_t inst) ATTRIBUTE_CONST; void qpu_serialize_one_inst(struct vc4_compile *c, uint64_t inst); static inline uint64_t diff --git a/src/gallium/drivers/vc4/vc4_qpu_disasm.c b/src/gallium/drivers/vc4/vc4_qpu_disasm.c index 55e0e6139b5..00aeb300a9b 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_disasm.c +++ b/src/gallium/drivers/vc4/vc4_qpu_disasm.c @@ -225,7 +225,7 @@ static const char *qpu_condflags[] = { }; #define DESC(array, index) \ - ((index > ARRAY_SIZE(array) || !(array)[index]) ? \ + ((index >= ARRAY_SIZE(array) || !(array)[index]) ? \ "???" 
: (array)[index]) static const char * diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c index 99afe4b8798..f324056258c 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_emit.c +++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c @@ -234,6 +234,7 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) case QFILE_VPM: assert((int)qinst->src[i].index >= last_vpm_read_index); + (void)last_vpm_read_index; last_vpm_read_index = qinst->src[i].index; src[i] = qpu_ra(QPU_R_VPM); break; @@ -319,7 +320,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) abort(); } - queue(c, qpu_a_MOV(dst, qpu_r4())); + if (dst.mux != QPU_MUX_R4) + queue(c, qpu_a_MOV(dst, qpu_r4())); break; @@ -402,6 +404,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) *last_inst(c) = qpu_set_sig(*last_inst(c), QPU_SIG_COLOR_LOAD); + if (dst.mux != QPU_MUX_R4) + queue(c, qpu_a_MOV(dst, qpu_r4())); break; case QOP_TLB_COLOR_WRITE: @@ -451,21 +455,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) queue(c, qpu_NOP()); *last_inst(c) = qpu_set_sig(*last_inst(c), QPU_SIG_LOAD_TMU0); - - break; - - case QOP_R4_UNPACK_A: - case QOP_R4_UNPACK_B: - case QOP_R4_UNPACK_C: - case QOP_R4_UNPACK_D: - assert(src[0].mux == QPU_MUX_R4); - queue(c, qpu_a_MOV(dst, src[0])); - *last_inst(c) |= QPU_PM; - *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A + - (qinst->op - - QOP_R4_UNPACK_A), - QPU_UNPACK); - + if (dst.mux != QPU_MUX_R4) + queue(c, qpu_a_MOV(dst, qpu_r4())); break; case QOP_UNPACK_8A_F: @@ -474,20 +465,30 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) case QOP_UNPACK_8D_F: case QOP_UNPACK_16A_F: case QOP_UNPACK_16B_F: { - assert(src[0].mux == QPU_MUX_A); - - /* Since we're setting the pack bits, if the - * destination is in A it would get re-packed. - */ - queue(c, qpu_a_FMAX((dst.mux == QPU_MUX_A ? - qpu_rb(31) : dst), - src[0], src[0])); - *last_inst(c) |= QPU_SET_FIELD(unpack_map[qinst->op - - QOP_UNPACK_8A_F], - QPU_UNPACK); + if (src[0].mux == QPU_MUX_R4) { + queue(c, qpu_a_MOV(dst, src[0])); + *last_inst(c) |= QPU_PM; + *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A + + (qinst->op - + QOP_UNPACK_8A_F), + QPU_UNPACK); + } else { + assert(src[0].mux == QPU_MUX_A); - if (dst.mux == QPU_MUX_A) { - queue(c, qpu_a_MOV(dst, qpu_rb(31))); + /* Since we're setting the pack bits, if the + * destination is in A it would get re-packed. + */ + queue(c, qpu_a_FMAX((dst.mux == QPU_MUX_A ? + qpu_rb(31) : dst), + src[0], src[0])); + *last_inst(c) |= + QPU_SET_FIELD(unpack_map[qinst->op - + QOP_UNPACK_8A_F], + QPU_UNPACK); + + if (dst.mux == QPU_MUX_A) { + queue(c, qpu_a_MOV(dst, qpu_rb(31))); + } } } break; diff --git a/src/gallium/drivers/vc4/vc4_qpu_validate.c b/src/gallium/drivers/vc4/vc4_qpu_validate.c index 8471edbf62c..9cf6841f41c 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_validate.c +++ b/src/gallium/drivers/vc4/vc4_qpu_validate.c @@ -23,6 +23,13 @@ #include "vc4_qpu.h" +#ifdef NDEBUG +/* Since most of our code is used in assert()s, don't warn about dead code. 
*/ +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wunused-function" +#endif + static bool writes_reg(uint64_t inst, uint32_t w) { diff --git a/src/gallium/drivers/vc4/vc4_register_allocate.c b/src/gallium/drivers/vc4/vc4_register_allocate.c index 3b0b890b66a..a29db1f3abe 100644 --- a/src/gallium/drivers/vc4/vc4_register_allocate.c +++ b/src/gallium/drivers/vc4/vc4_register_allocate.c @@ -116,6 +116,8 @@ vc4_alloc_reg_set(struct vc4_context *vc4) vc4->regs = ra_alloc_reg_set(vc4, ARRAY_SIZE(vc4_regs)); vc4->reg_class_any = ra_alloc_reg_class(vc4->regs); + vc4->reg_class_r4_or_a = ra_alloc_reg_class(vc4->regs); + vc4->reg_class_a = ra_alloc_reg_class(vc4->regs); for (uint32_t i = 0; i < ARRAY_SIZE(vc4_regs); i++) { /* Reserve ra31/rb31 for spilling fixup_raddr_conflict() in * vc4_qpu_emit.c @@ -126,15 +128,18 @@ vc4_alloc_reg_set(struct vc4_context *vc4) /* R4 can't be written as a general purpose register. (it's * TMU_NOSWAP as a write address). */ - if (vc4_regs[i].mux == QPU_MUX_R4) + if (vc4_regs[i].mux == QPU_MUX_R4) { + ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a, i); continue; + } ra_class_add_reg(vc4->regs, vc4->reg_class_any, i); } - vc4->reg_class_a = ra_alloc_reg_class(vc4->regs); - for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i += 2) + for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i += 2) { ra_class_add_reg(vc4->regs, vc4->reg_class_a, i); + ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a, i); + } ra_set_finalize(vc4->regs, NULL); } @@ -153,6 +158,10 @@ node_to_temp_priority(const void *in_a, const void *in_b) return a->priority - b->priority; } +#define CLASS_BIT_A (1 << 0) +#define CLASS_BIT_B_OR_ACC (1 << 1) +#define CLASS_BIT_R4 (1 << 2) + /** * Returns a mapping from QFILE_TEMP indices to struct qpu_regs. * @@ -165,6 +174,7 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) uint32_t temp_to_node[c->num_temps]; uint32_t def[c->num_temps]; uint32_t use[c->num_temps]; + uint8_t class_bits[c->num_temps]; struct qpu_reg *temp_registers = calloc(c->num_temps, sizeof(*temp_registers)); memset(def, 0, sizeof(def)); @@ -181,10 +191,6 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) struct ra_graph *g = ra_alloc_interference_graph(vc4->regs, c->num_temps); - for (uint32_t i = 0; i < c->num_temps; i++) { - ra_set_node_class(g, i, vc4->reg_class_any); - } - /* Compute the live ranges so we can figure out interference. */ uint32_t ip = 0; @@ -223,8 +229,33 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) temp_to_node[map[i].temp] = i; } - /* Figure out our register classes and preallocated registers*/ + /* Figure out our register classes and preallocated registers. We + * start with any temp being able to be in any file, then instructions + * incrementally remove bits that the temp definitely can't be in. + */ + memset(class_bits, + CLASS_BIT_A | CLASS_BIT_B_OR_ACC | CLASS_BIT_R4, + sizeof(class_bits)); + + ip = 0; list_for_each_entry(struct qinst, inst, &c->instructions, link) { + if (qir_writes_r4(inst)) { + /* This instruction writes r4 (and optionally moves + * its result to a temp), so nothing else can be + * stored in r4 across it. + */ + for (int i = 0; i < c->num_temps; i++) { + if (def[i] < ip && use[i] > ip) + class_bits[i] &= ~CLASS_BIT_R4; + } + } else { + /* R4 can't be written as a general purpose + * register. (it's TMU_NOSWAP as a write address). 
+ */ + if (inst->dst.file == QFILE_TEMP) + class_bits[inst->dst.index] &= ~CLASS_BIT_R4; + } + switch (inst->op) { case QOP_FRAG_Z: ra_set_node_reg(g, temp_to_node[inst->dst.index], @@ -236,17 +267,9 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) AB_INDEX + QPU_R_FRAG_PAYLOAD_ZW * 2); break; - case QOP_TEX_RESULT: - case QOP_TLB_COLOR_READ: - assert(vc4_regs[ACC_INDEX + 4].mux == QPU_MUX_R4); - ra_set_node_reg(g, temp_to_node[inst->dst.index], - ACC_INDEX + 4); - break; - case QOP_PACK_SCALED: /* The pack flags require an A-file dst register. */ - ra_set_node_class(g, temp_to_node[inst->dst.index], - vc4->reg_class_a); + class_bits[inst->dst.index] &= CLASS_BIT_A; break; default: @@ -254,8 +277,30 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) } if (qir_src_needs_a_file(inst)) { - ra_set_node_class(g, temp_to_node[inst->src[0].index], - vc4->reg_class_a); + class_bits[inst->src[0].index] &= CLASS_BIT_A; + } + ip++; + } + + for (uint32_t i = 0; i < c->num_temps; i++) { + int node = temp_to_node[i]; + + switch (class_bits[i]) { + case CLASS_BIT_A | CLASS_BIT_B_OR_ACC | CLASS_BIT_R4: + case CLASS_BIT_A | CLASS_BIT_B_OR_ACC: + ra_set_node_class(g, node, vc4->reg_class_any); + break; + case CLASS_BIT_A | CLASS_BIT_R4: + ra_set_node_class(g, node, vc4->reg_class_r4_or_a); + break; + case CLASS_BIT_A: + ra_set_node_class(g, node, vc4->reg_class_a); + break; + default: + fprintf(stderr, "temp %d: bad class bits: 0x%x\n", + i, class_bits[i]); + abort(); + break; } } @@ -270,7 +315,11 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) } bool ok = ra_allocate(g); - assert(ok); + if (!ok) { + fprintf(stderr, "Failed to register allocate:\n"); + qir_dump(c); + abort(); + } for (uint32_t i = 0; i < c->num_temps; i++) { temp_registers[i] = vc4_regs[ra_get_node_reg(g, temp_to_node[i])]; diff --git a/src/gallium/drivers/vc4/vc4_resource.c b/src/gallium/drivers/vc4/vc4_resource.c index cab76406055..5d5166fd818 100644 --- a/src/gallium/drivers/vc4/vc4_resource.c +++ b/src/gallium/drivers/vc4/vc4_resource.c @@ -102,6 +102,12 @@ vc4_resource_transfer_map(struct pipe_context *pctx, if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) { vc4_resource_bo_alloc(rsc); + + /* If it might be bound as one of our vertex buffers, make + * sure we re-emit vertex buffer state. 
+ */ + if (prsc->bind & PIPE_BIND_VERTEX_BUFFER) + vc4->dirty |= VC4_DIRTY_VTXBUF; } else if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) { if (vc4_cl_references_bo(pctx, rsc->bo)) { if ((usage & PIPE_TRANSFER_DISCARD_RANGE) && @@ -110,6 +116,8 @@ vc4_resource_transfer_map(struct pipe_context *pctx, prsc->height0 == box->height && prsc->depth0 == box->depth) { vc4_resource_bo_alloc(rsc); + if (prsc->bind & PIPE_BIND_VERTEX_BUFFER) + vc4->dirty |= VC4_DIRTY_VTXBUF; } else { vc4_flush(pctx); } diff --git a/src/gallium/drivers/vc4/vc4_resource.h b/src/gallium/drivers/vc4/vc4_resource.h index ab8f5d3cd55..87571b75e8b 100644 --- a/src/gallium/drivers/vc4/vc4_resource.h +++ b/src/gallium/drivers/vc4/vc4_resource.h @@ -82,19 +82,19 @@ struct vc4_resource { struct pipe_resource *shadow_parent; }; -static INLINE struct vc4_resource * +static inline struct vc4_resource * vc4_resource(struct pipe_resource *prsc) { return (struct vc4_resource *)prsc; } -static INLINE struct vc4_surface * +static inline struct vc4_surface * vc4_surface(struct pipe_surface *psurf) { return (struct vc4_surface *)psurf; } -static INLINE struct vc4_transfer * +static inline struct vc4_transfer * vc4_transfer(struct pipe_transfer *ptrans) { return (struct vc4_transfer *)ptrans; diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c index f63bead0fbb..2dee1d40e5f 100644 --- a/src/gallium/drivers/vc4/vc4_screen.c +++ b/src/gallium/drivers/vc4/vc4_screen.c @@ -176,6 +176,10 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: + case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: + case PIPE_CAP_TEXTURE_FLOAT_LINEAR: + case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: + case PIPE_CAP_DEPTH_BOUNDS_TEST: return 0; /* Stream output. */ @@ -489,6 +493,12 @@ vc4_screen_bo_get_handle(struct pipe_screen *pscreen, { whandle->stride = stride; + /* If we're passing some reference to our BO out to some other part of + * the system, then we can't do any optimizations about only us being + * the ones seeing it (like BO caching or shadow update avoidance). + */ + bo->private = false; + switch (whandle->type) { case DRM_API_HANDLE_TYPE_SHARED: return vc4_bo_flink(bo, &whandle->handle); diff --git a/src/gallium/drivers/vc4/vc4_simulator.c b/src/gallium/drivers/vc4/vc4_simulator.c index b58013dd2ee..7cfd236349d 100644 --- a/src/gallium/drivers/vc4/vc4_simulator.c +++ b/src/gallium/drivers/vc4/vc4_simulator.c @@ -74,11 +74,12 @@ vc4_simulator_pin_bos(struct drm_device *dev, struct vc4_exec_info *exec) struct vc4_bo **bos = vc4->bo_pointers.base; exec->bo_count = args->bo_handle_count; - exec->bo = calloc(exec->bo_count, sizeof(struct vc4_bo_exec_state)); + exec->bo = calloc(exec->bo_count, sizeof(void *)); for (int i = 0; i < exec->bo_count; i++) { struct vc4_bo *bo = bos[i]; struct drm_gem_cma_object *obj = vc4_wrap_bo_with_cma(dev, bo); + struct drm_vc4_bo *drm_bo = to_vc4_bo(&obj->base); #if 0 fprintf(stderr, "bo hindex %d: %s\n", i, bo->name); #endif @@ -86,7 +87,16 @@ vc4_simulator_pin_bos(struct drm_device *dev, struct vc4_exec_info *exec) vc4_bo_map(bo); memcpy(obj->vaddr, bo->map, bo->size); - exec->bo[i].bo = obj; + exec->bo[i] = obj; + + /* The kernel does this validation at shader create ioctl + * time. 
+ */ + if (strcmp(bo->name, "code") == 0) { + drm_bo->validated_shader = vc4_validate_shader(obj); + if (!drm_bo->validated_shader) + abort(); + } } return 0; } @@ -95,7 +105,7 @@ static int vc4_simulator_unpin_bos(struct vc4_exec_info *exec) { for (int i = 0; i < exec->bo_count; i++) { - struct drm_gem_cma_object *obj = exec->bo[i].bo; + struct drm_gem_cma_object *obj = exec->bo[i]; struct vc4_bo *bo = to_vc4_bo(&obj->base)->bo; memcpy(bo->map, obj->vaddr, bo->size); diff --git a/src/gallium/drivers/vc4/vc4_simulator_validate.h b/src/gallium/drivers/vc4/vc4_simulator_validate.h index 2bb36b253bb..68ace0216aa 100644 --- a/src/gallium/drivers/vc4/vc4_simulator_validate.h +++ b/src/gallium/drivers/vc4/vc4_simulator_validate.h @@ -78,6 +78,7 @@ struct drm_gem_cma_object { struct drm_vc4_bo { struct drm_gem_cma_object base; struct vc4_bo *bo; + struct vc4_validated_shader_info *validated_shader; struct list_head unref_head; }; diff --git a/src/gallium/drivers/vc4/vc4_state.c b/src/gallium/drivers/vc4/vc4_state.c index 4a1d4c3a4d6..8a759c2ca4c 100644 --- a/src/gallium/drivers/vc4/vc4_state.c +++ b/src/gallium/drivers/vc4/vc4_state.c @@ -107,7 +107,7 @@ vc4_create_rasterizer_state(struct pipe_context *pctx, /* Workaround: HW-2726 PTB does not handle zero-size points (BCM2835, * BCM21553). */ - so->point_size = MAX2(cso->point_size, .125); + so->point_size = MAX2(cso->point_size, .125f); if (cso->front_ccw) so->config_bits[0] |= VC4_CONFIG_BITS_CW_PRIMITIVES; @@ -461,11 +461,64 @@ vc4_get_stage_tex(struct vc4_context *vc4, unsigned shader) } } +static uint32_t translate_wrap(uint32_t p_wrap, bool using_nearest) +{ + switch (p_wrap) { + case PIPE_TEX_WRAP_REPEAT: + return 0; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + return 1; + case PIPE_TEX_WRAP_MIRROR_REPEAT: + return 2; + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + return 3; + case PIPE_TEX_WRAP_CLAMP: + return (using_nearest ? 
1 : 3); + default: + fprintf(stderr, "Unknown wrap mode %d\n", p_wrap); + assert(!"not reached"); + return 0; + } +} + static void * vc4_create_sampler_state(struct pipe_context *pctx, const struct pipe_sampler_state *cso) { - return vc4_generic_cso_state_create(cso, sizeof(*cso)); + static const uint8_t minfilter_map[6] = { + VC4_TEX_P1_MINFILT_NEAR_MIP_NEAR, + VC4_TEX_P1_MINFILT_LIN_MIP_NEAR, + VC4_TEX_P1_MINFILT_NEAR_MIP_LIN, + VC4_TEX_P1_MINFILT_LIN_MIP_LIN, + VC4_TEX_P1_MINFILT_NEAREST, + VC4_TEX_P1_MINFILT_LINEAR, + }; + static const uint32_t magfilter_map[] = { + [PIPE_TEX_FILTER_NEAREST] = VC4_TEX_P1_MAGFILT_NEAREST, + [PIPE_TEX_FILTER_LINEAR] = VC4_TEX_P1_MAGFILT_LINEAR, + }; + bool either_nearest = + (cso->mag_img_filter == PIPE_TEX_MIPFILTER_NEAREST || + cso->min_img_filter == PIPE_TEX_MIPFILTER_NEAREST); + struct vc4_sampler_state *so = CALLOC_STRUCT(vc4_sampler_state); + + if (!so) + return NULL; + + memcpy(so, cso, sizeof(*cso)); + + so->texture_p1 = + (VC4_SET_FIELD(magfilter_map[cso->mag_img_filter], + VC4_TEX_P1_MAGFILT) | + VC4_SET_FIELD(minfilter_map[cso->min_mip_filter * 2 + + cso->min_img_filter], + VC4_TEX_P1_MINFILT) | + VC4_SET_FIELD(translate_wrap(cso->wrap_s, either_nearest), + VC4_TEX_P1_WRAP_S) | + VC4_SET_FIELD(translate_wrap(cso->wrap_t, either_nearest), + VC4_TEX_P1_WRAP_T)); + + return so; } static void @@ -499,13 +552,13 @@ static struct pipe_sampler_view * vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc, const struct pipe_sampler_view *cso) { - struct pipe_sampler_view *so = malloc(sizeof(*so)); + struct vc4_sampler_view *so = malloc(sizeof(*so)); struct vc4_resource *rsc = vc4_resource(prsc); if (!so) return NULL; - *so = *cso; + so->base = *cso; pipe_reference(NULL, &prsc->reference); @@ -516,18 +569,19 @@ vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc, * Also, Raspberry Pi doesn't support sampling from raster textures, * so we also have to copy to a temporary then. */ - if (so->u.tex.first_level || + if (cso->u.tex.first_level || rsc->vc4_format == VC4_TEXTURE_TYPE_RGBA32R) { struct vc4_resource *shadow_parent = vc4_resource(prsc); struct pipe_resource tmpl = shadow_parent->base.b; struct vc4_resource *clone; tmpl.bind = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET; - tmpl.width0 = u_minify(tmpl.width0, so->u.tex.first_level); - tmpl.height0 = u_minify(tmpl.height0, so->u.tex.first_level); - tmpl.last_level = so->u.tex.last_level - so->u.tex.first_level; + tmpl.width0 = u_minify(tmpl.width0, cso->u.tex.first_level); + tmpl.height0 = u_minify(tmpl.height0, cso->u.tex.first_level); + tmpl.last_level = cso->u.tex.last_level - cso->u.tex.first_level; prsc = vc4_resource_create(pctx->screen, &tmpl); + rsc = vc4_resource(prsc); clone = vc4_resource(prsc); clone->shadow_parent = &shadow_parent->base.b; /* Flag it as needing update of the contents from the parent. 
*/ @@ -535,11 +589,23 @@ vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc, assert(clone->vc4_format != VC4_TEXTURE_TYPE_RGBA32R); } - so->texture = prsc; - so->reference.count = 1; - so->context = pctx; - - return so; + so->base.texture = prsc; + so->base.reference.count = 1; + so->base.context = pctx; + + so->texture_p0 = + (VC4_SET_FIELD(rsc->slices[0].offset >> 12, VC4_TEX_P0_OFFSET) | + VC4_SET_FIELD(rsc->vc4_format & 15, VC4_TEX_P0_TYPE) | + VC4_SET_FIELD(cso->u.tex.last_level - + cso->u.tex.first_level, VC4_TEX_P0_MIPLVLS) | + VC4_SET_FIELD(cso->target == PIPE_TEXTURE_CUBE, + VC4_TEX_P0_CMMODE)); + so->texture_p1 = + (VC4_SET_FIELD(rsc->vc4_format >> 4, VC4_TEX_P1_TYPE4) | + VC4_SET_FIELD(prsc->height0 & 2047, VC4_TEX_P1_HEIGHT) | + VC4_SET_FIELD(prsc->width0 & 2047, VC4_TEX_P1_WIDTH)); + + return &so->base; } static void diff --git a/src/gallium/drivers/vc4/vc4_tiling.c b/src/gallium/drivers/vc4/vc4_tiling.c index f9801c9cefd..cf86eb0fa31 100644 --- a/src/gallium/drivers/vc4/vc4_tiling.c +++ b/src/gallium/drivers/vc4/vc4_tiling.c @@ -127,13 +127,10 @@ vc4_store_utile(void *dst, void *src, uint32_t src_stride, uint32_t cpp) static void check_box_utile_alignment(const struct pipe_box *box, int cpp) { - uint32_t utile_w = vc4_utile_width(cpp); - uint32_t utile_h = vc4_utile_height(cpp); - - assert(!(box->x & (utile_w - 1))); - assert(!(box->y & (utile_h - 1))); - assert(!(box->width & (utile_w - 1))); - assert(!(box->height & (utile_h - 1))); + assert(!(box->x & (vc4_utile_width(cpp) - 1))); + assert(!(box->y & (vc4_utile_height(cpp) - 1))); + assert(!(box->width & (vc4_utile_width(cpp) - 1))); + assert(!(box->height & (vc4_utile_height(cpp) - 1))); } static void diff --git a/src/gallium/drivers/vc4/vc4_tiling.h b/src/gallium/drivers/vc4/vc4_tiling.h index b5d10da3417..b90bba70200 100644 --- a/src/gallium/drivers/vc4/vc4_tiling.h +++ b/src/gallium/drivers/vc4/vc4_tiling.h @@ -24,9 +24,9 @@ #ifndef VC4_TILING_H #define VC4_TILING_H -uint32_t vc4_utile_width(int cpp); -uint32_t vc4_utile_height(int cpp); -bool vc4_size_is_lt(uint32_t width, uint32_t height, int cpp); +uint32_t vc4_utile_width(int cpp) ATTRIBUTE_CONST; +uint32_t vc4_utile_height(int cpp) ATTRIBUTE_CONST; +bool vc4_size_is_lt(uint32_t width, uint32_t height, int cpp) ATTRIBUTE_CONST; void vc4_load_utile(void *dst, void *src, uint32_t dst_stride, uint32_t cpp); void vc4_store_utile(void *dst, void *src, uint32_t src_stride, uint32_t cpp); void vc4_load_tiled_image(void *dst, uint32_t dst_stride, diff --git a/src/gallium/drivers/vc4/vc4_uniforms.c b/src/gallium/drivers/vc4/vc4_uniforms.c new file mode 100644 index 00000000000..85d6998205e --- /dev/null +++ b/src/gallium/drivers/vc4/vc4_uniforms.c @@ -0,0 +1,344 @@ +/* + * Copyright © 2014-2015 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "util/u_pack_color.h" +#include "util/format_srgb.h" + +#include "vc4_context.h" +#include "vc4_qir.h" + +static void +write_texture_p0(struct vc4_context *vc4, + struct vc4_cl_out **uniforms, + struct vc4_texture_stateobj *texstate, + uint32_t unit) +{ + struct vc4_sampler_view *sview = + vc4_sampler_view(texstate->textures[unit]); + struct vc4_resource *rsc = vc4_resource(sview->base.texture); + + cl_reloc(vc4, &vc4->uniforms, uniforms, rsc->bo, sview->texture_p0); +} + +static void +write_texture_p1(struct vc4_context *vc4, + struct vc4_cl_out **uniforms, + struct vc4_texture_stateobj *texstate, + uint32_t unit) +{ + struct vc4_sampler_view *sview = + vc4_sampler_view(texstate->textures[unit]); + struct vc4_sampler_state *sampler = + vc4_sampler_state(texstate->samplers[unit]); + + cl_aligned_u32(uniforms, sview->texture_p1 | sampler->texture_p1); +} + +static void +write_texture_p2(struct vc4_context *vc4, + struct vc4_cl_out **uniforms, + struct vc4_texture_stateobj *texstate, + uint32_t data) +{ + uint32_t unit = data & 0xffff; + struct pipe_sampler_view *texture = texstate->textures[unit]; + struct vc4_resource *rsc = vc4_resource(texture->texture); + + cl_aligned_u32(uniforms, + VC4_SET_FIELD(VC4_TEX_P2_PTYPE_CUBE_MAP_STRIDE, + VC4_TEX_P2_PTYPE) | + VC4_SET_FIELD(rsc->cube_map_stride >> 12, VC4_TEX_P2_CMST) | + VC4_SET_FIELD((data >> 16) & 1, VC4_TEX_P2_BSLOD)); +} + + +#define SWIZ(x,y,z,w) { \ + UTIL_FORMAT_SWIZZLE_##x, \ + UTIL_FORMAT_SWIZZLE_##y, \ + UTIL_FORMAT_SWIZZLE_##z, \ + UTIL_FORMAT_SWIZZLE_##w \ +} + +static void +write_texture_border_color(struct vc4_context *vc4, + struct vc4_cl_out **uniforms, + struct vc4_texture_stateobj *texstate, + uint32_t unit) +{ + struct pipe_sampler_state *sampler = texstate->samplers[unit]; + struct pipe_sampler_view *texture = texstate->textures[unit]; + struct vc4_resource *rsc = vc4_resource(texture->texture); + union util_color uc; + + const struct util_format_description *tex_format_desc = + util_format_description(texture->format); + + float border_color[4]; + for (int i = 0; i < 4; i++) + border_color[i] = sampler->border_color.f[i]; + if (util_format_is_srgb(texture->format)) { + for (int i = 0; i < 3; i++) + border_color[i] = + util_format_linear_to_srgb_float(border_color[i]); + } + + /* Turn the border color into the layout of channels that it would + * have when stored as texture contents. + */ + float storage_color[4]; + util_format_unswizzle_4f(storage_color, + border_color, + tex_format_desc->swizzle); + + /* Now, pack so that when the vc4_format-sampled texture contents are + * replaced with our border color, the vc4_get_format_swizzle() + * swizzling will get the right channels. 
+ */ + if (util_format_is_depth_or_stencil(texture->format)) { + uc.ui[0] = util_pack_z(PIPE_FORMAT_Z24X8_UNORM, + sampler->border_color.f[0]) << 8; + } else { + switch (rsc->vc4_format) { + default: + case VC4_TEXTURE_TYPE_RGBA8888: + util_pack_color(storage_color, + PIPE_FORMAT_R8G8B8A8_UNORM, &uc); + break; + case VC4_TEXTURE_TYPE_RGBA4444: + util_pack_color(storage_color, + PIPE_FORMAT_A8B8G8R8_UNORM, &uc); + break; + case VC4_TEXTURE_TYPE_RGB565: + util_pack_color(storage_color, + PIPE_FORMAT_B8G8R8A8_UNORM, &uc); + break; + case VC4_TEXTURE_TYPE_ALPHA: + uc.ui[0] = float_to_ubyte(storage_color[0]) << 24; + break; + case VC4_TEXTURE_TYPE_LUMALPHA: + uc.ui[0] = ((float_to_ubyte(storage_color[1]) << 24) | + (float_to_ubyte(storage_color[0]) << 0)); + break; + } + } + + cl_aligned_u32(uniforms, uc.ui[0]); +} + +static uint32_t +get_texrect_scale(struct vc4_texture_stateobj *texstate, + enum quniform_contents contents, + uint32_t data) +{ + struct pipe_sampler_view *texture = texstate->textures[data]; + uint32_t dim; + + if (contents == QUNIFORM_TEXRECT_SCALE_X) + dim = texture->texture->width0; + else + dim = texture->texture->height0; + + return fui(1.0f / dim); +} + +static struct vc4_bo * +vc4_upload_ubo(struct vc4_context *vc4, + struct vc4_compiled_shader *shader, + const uint32_t *gallium_uniforms) +{ + if (!shader->ubo_size) + return NULL; + + struct vc4_bo *ubo = vc4_bo_alloc(vc4->screen, shader->ubo_size, "ubo"); + uint32_t *data = vc4_bo_map(ubo); + for (uint32_t i = 0; i < shader->num_ubo_ranges; i++) { + memcpy(data + shader->ubo_ranges[i].dst_offset, + gallium_uniforms + shader->ubo_ranges[i].src_offset, + shader->ubo_ranges[i].size); + } + + return ubo; +} + +void +vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader, + struct vc4_constbuf_stateobj *cb, + struct vc4_texture_stateobj *texstate) +{ + struct vc4_shader_uniform_info *uinfo = &shader->uniforms; + const uint32_t *gallium_uniforms = cb->cb[0].user_buffer; + struct vc4_bo *ubo = vc4_upload_ubo(vc4, shader, gallium_uniforms); + + cl_ensure_space(&vc4->uniforms, (uinfo->count + + uinfo->num_texture_samples) * 4); + + struct vc4_cl_out *uniforms = + cl_start_shader_reloc(&vc4->uniforms, + uinfo->num_texture_samples); + + for (int i = 0; i < uinfo->count; i++) { + + switch (uinfo->contents[i]) { + case QUNIFORM_CONSTANT: + cl_aligned_u32(&uniforms, uinfo->data[i]); + break; + case QUNIFORM_UNIFORM: + cl_aligned_u32(&uniforms, + gallium_uniforms[uinfo->data[i]]); + break; + case QUNIFORM_VIEWPORT_X_SCALE: + cl_aligned_f(&uniforms, vc4->viewport.scale[0] * 16.0f); + break; + case QUNIFORM_VIEWPORT_Y_SCALE: + cl_aligned_f(&uniforms, vc4->viewport.scale[1] * 16.0f); + break; + + case QUNIFORM_VIEWPORT_Z_OFFSET: + cl_aligned_f(&uniforms, vc4->viewport.translate[2]); + break; + case QUNIFORM_VIEWPORT_Z_SCALE: + cl_aligned_f(&uniforms, vc4->viewport.scale[2]); + break; + + case QUNIFORM_USER_CLIP_PLANE: + cl_aligned_f(&uniforms, + vc4->clip.ucp[uinfo->data[i] / 4][uinfo->data[i] % 4]); + break; + + case QUNIFORM_TEXTURE_CONFIG_P0: + write_texture_p0(vc4, &uniforms, texstate, + uinfo->data[i]); + break; + + case QUNIFORM_TEXTURE_CONFIG_P1: + write_texture_p1(vc4, &uniforms, texstate, + uinfo->data[i]); + break; + + case QUNIFORM_TEXTURE_CONFIG_P2: + write_texture_p2(vc4, &uniforms, texstate, + uinfo->data[i]); + break; + + case QUNIFORM_UBO_ADDR: + cl_aligned_reloc(vc4, &vc4->uniforms, &uniforms, ubo, 0); + break; + + case QUNIFORM_TEXTURE_BORDER_COLOR: + write_texture_border_color(vc4, &uniforms, + 
texstate, uinfo->data[i]); + break; + + case QUNIFORM_TEXRECT_SCALE_X: + case QUNIFORM_TEXRECT_SCALE_Y: + cl_aligned_u32(&uniforms, + get_texrect_scale(texstate, + uinfo->contents[i], + uinfo->data[i])); + break; + + case QUNIFORM_BLEND_CONST_COLOR_X: + case QUNIFORM_BLEND_CONST_COLOR_Y: + case QUNIFORM_BLEND_CONST_COLOR_Z: + case QUNIFORM_BLEND_CONST_COLOR_W: + cl_aligned_f(&uniforms, + CLAMP(vc4->blend_color.color[uinfo->contents[i] - + QUNIFORM_BLEND_CONST_COLOR_X], + 0, 1)); + break; + + case QUNIFORM_STENCIL: + cl_aligned_u32(&uniforms, + vc4->zsa->stencil_uniforms[uinfo->data[i]] | + (uinfo->data[i] <= 1 ? + (vc4->stencil_ref.ref_value[uinfo->data[i]] << 8) : + 0)); + break; + + case QUNIFORM_ALPHA_REF: + cl_aligned_f(&uniforms, + vc4->zsa->base.alpha.ref_value); + break; + } +#if 0 + uint32_t written_val = *((uint32_t *)uniforms - 1); + fprintf(stderr, "%p: %d / 0x%08x (%f)\n", + shader, i, written_val, uif(written_val)); +#endif + } + + cl_end(&vc4->uniforms, uniforms); + + vc4_bo_unreference(&ubo); +} + +void +vc4_set_shader_uniform_dirty_flags(struct vc4_compiled_shader *shader) +{ + uint32_t dirty = 0; + + for (int i = 0; i < shader->uniforms.count; i++) { + switch (shader->uniforms.contents[i]) { + case QUNIFORM_CONSTANT: + break; + case QUNIFORM_UNIFORM: + case QUNIFORM_UBO_ADDR: + dirty |= VC4_DIRTY_CONSTBUF; + break; + + case QUNIFORM_VIEWPORT_X_SCALE: + case QUNIFORM_VIEWPORT_Y_SCALE: + case QUNIFORM_VIEWPORT_Z_OFFSET: + case QUNIFORM_VIEWPORT_Z_SCALE: + dirty |= VC4_DIRTY_VIEWPORT; + break; + + case QUNIFORM_USER_CLIP_PLANE: + dirty |= VC4_DIRTY_CLIP; + break; + + case QUNIFORM_TEXTURE_CONFIG_P0: + case QUNIFORM_TEXTURE_CONFIG_P1: + case QUNIFORM_TEXTURE_CONFIG_P2: + case QUNIFORM_TEXTURE_BORDER_COLOR: + case QUNIFORM_TEXRECT_SCALE_X: + case QUNIFORM_TEXRECT_SCALE_Y: + dirty |= VC4_DIRTY_TEXSTATE; + break; + + case QUNIFORM_BLEND_CONST_COLOR_X: + case QUNIFORM_BLEND_CONST_COLOR_Y: + case QUNIFORM_BLEND_CONST_COLOR_Z: + case QUNIFORM_BLEND_CONST_COLOR_W: + dirty |= VC4_DIRTY_BLEND_COLOR; + break; + + case QUNIFORM_STENCIL: + case QUNIFORM_ALPHA_REF: + dirty |= VC4_DIRTY_ZSA; + break; + } + } + + shader->uniform_dirty_bits = dirty; +}
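The new vc4_uniforms.c ends with vc4_set_shader_uniform_dirty_flags(), which records, per compiled shader, the union of VC4_DIRTY_* state groups that the shader's uniform stream actually reads. A minimal sketch of how a draw-time caller might consume uniform_dirty_bits follows; the wrapper name emit_stage_uniforms() is hypothetical, while vc4_write_uniforms(), vc4->dirty, and the VC4_DIRTY_* bits come from the patch itself:

/* Hypothetical draw-time helper: skip re-emitting a stage's uniform
 * stream unless some state group that stream depends on is dirty.
 * Assumes vc4->dirty accumulates VC4_DIRTY_* bits as state is bound,
 * as in vc4_context.
 */
static void
emit_stage_uniforms(struct vc4_context *vc4,
                    struct vc4_compiled_shader *shader,
                    struct vc4_constbuf_stateobj *cb,
                    struct vc4_texture_stateobj *texstate)
{
        /* uniform_dirty_bits was filled in at compile time by
         * vc4_set_shader_uniform_dirty_flags().
         */
        if (!(vc4->dirty & shader->uniform_dirty_bits))
                return;

        vc4_write_uniforms(vc4, shader, cb, texstate);
}

A real caller would also have to re-emit when the compiled shader itself changes, since a new shader implies a new uniform layout; that is what the VC4_DIRTY_COMPILED_VS and VC4_DIRTY_COMPILED_CS flags set in vc4_update_compiled_vs() above are for.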