summaryrefslogtreecommitdiffstats
path: root/src/gallium/drivers
diff options
context:
space:
mode:
Diffstat (limited to 'src/gallium/drivers')
-rw-r--r--src/gallium/drivers/freedreno/Makefile.am5
-rw-r--r--src/gallium/drivers/freedreno/Makefile.sources5
-rw-r--r--src/gallium/drivers/freedreno/a2xx/fd2_compiler.c22
-rw-r--r--src/gallium/drivers/freedreno/a3xx/fd3_context.h3
-rw-r--r--src/gallium/drivers/freedreno/a3xx/fd3_draw.c11
-rw-r--r--src/gallium/drivers/freedreno/a3xx/fd3_program.c9
-rw-r--r--src/gallium/drivers/freedreno/a3xx/fd3_screen.c5
-rw-r--r--src/gallium/drivers/freedreno/a3xx/fd3_texture.c35
-rw-r--r--src/gallium/drivers/freedreno/a4xx/fd4_context.h3
-rw-r--r--src/gallium/drivers/freedreno/a4xx/fd4_draw.c8
-rw-r--r--src/gallium/drivers/freedreno/a4xx/fd4_screen.c5
-rw-r--r--src/gallium/drivers/freedreno/a4xx/fd4_texture.c34
-rw-r--r--src/gallium/drivers/freedreno/freedreno_context.h2
-rw-r--r--src/gallium/drivers/freedreno/freedreno_fence.c4
-rw-r--r--src/gallium/drivers/freedreno/freedreno_gmem.c9
-rw-r--r--src/gallium/drivers/freedreno/freedreno_screen.c6
-rw-r--r--src/gallium/drivers/freedreno/freedreno_screen.h4
-rw-r--r--src/gallium/drivers/freedreno/freedreno_util.h7
-rw-r--r--src/gallium/drivers/freedreno/ir3/disasm-a3xx.c8
-rw-r--r--src/gallium/drivers/freedreno/ir3/instr-a3xx.h19
-rw-r--r--src/gallium/drivers/freedreno/ir3/ir3.c132
-rw-r--r--src/gallium/drivers/freedreno/ir3/ir3.h192
-rw-r--r--src/gallium/drivers/freedreno/ir3/ir3_cmdline.c67
-rw-r--r--src/gallium/drivers/freedreno/ir3/ir3_compiler.c3709
-rw-r--r--src/gallium/drivers/freedreno/ir3/ir3_compiler.h15
-rw-r--r--src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c655
-rw-r--r--src/gallium/drivers/freedreno/ir3/ir3_cp.c38
-rw-r--r--src/gallium/drivers/freedreno/ir3/ir3_depth.c91
-rw-r--r--src/gallium/drivers/freedreno/ir3/ir3_dump.c456
-rw-r--r--src/gallium/drivers/freedreno/ir3/ir3_flatten.c152
-rw-r--r--src/gallium/drivers/freedreno/ir3/ir3_group.c114
-rw-r--r--src/gallium/drivers/freedreno/ir3/ir3_legalize.c253
-rw-r--r--src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c7
-rw-r--r--src/gallium/drivers/freedreno/ir3/ir3_print.c237
-rw-r--r--src/gallium/drivers/freedreno/ir3/ir3_ra.c1164
-rw-r--r--src/gallium/drivers/freedreno/ir3/ir3_sched.c590
-rw-r--r--src/gallium/drivers/freedreno/ir3/ir3_shader.c38
-rw-r--r--src/gallium/drivers/freedreno/ir3/ir3_shader.h9
-rw-r--r--src/gallium/drivers/i915/i915_fpc_optimize.c4
-rw-r--r--src/gallium/drivers/i915/i915_fpc_translate.c2
-rw-r--r--src/gallium/drivers/i915/i915_screen.c2
-rw-r--r--src/gallium/drivers/ilo/Makefile.sources34
-rw-r--r--src/gallium/drivers/ilo/core/ilo_buffer.h34
-rw-r--r--src/gallium/drivers/ilo/core/ilo_builder.c2
-rw-r--r--src/gallium/drivers/ilo/core/ilo_builder_3d.h58
-rw-r--r--src/gallium/drivers/ilo/core/ilo_builder_3d_bottom.h1301
-rw-r--r--src/gallium/drivers/ilo/core/ilo_builder_3d_top.h1259
-rw-r--r--src/gallium/drivers/ilo/core/ilo_builder_decode.c8
-rw-r--r--src/gallium/drivers/ilo/core/ilo_builder_media.h106
-rw-r--r--src/gallium/drivers/ilo/core/ilo_core.h3
-rw-r--r--src/gallium/drivers/ilo/core/ilo_debug.h17
-rw-r--r--src/gallium/drivers/ilo/core/ilo_dev.c11
-rw-r--r--src/gallium/drivers/ilo/core/ilo_dev.h3
-rw-r--r--src/gallium/drivers/ilo/core/ilo_fence.h73
-rw-r--r--src/gallium/drivers/ilo/core/ilo_format.c755
-rw-r--r--src/gallium/drivers/ilo/core/ilo_image.c33
-rw-r--r--src/gallium/drivers/ilo/core/ilo_image.h34
-rw-r--r--src/gallium/drivers/ilo/core/ilo_state_3d.h427
-rw-r--r--src/gallium/drivers/ilo/core/ilo_state_3d_bottom.c2222
-rw-r--r--src/gallium/drivers/ilo/core/ilo_state_3d_top.c1716
-rw-r--r--src/gallium/drivers/ilo/core/ilo_state_cc.c890
-rw-r--r--src/gallium/drivers/ilo/core/ilo_state_cc.h199
-rw-r--r--src/gallium/drivers/ilo/core/ilo_state_compute.c435
-rw-r--r--src/gallium/drivers/ilo/core/ilo_state_compute.h92
-rw-r--r--src/gallium/drivers/ilo/core/ilo_state_raster.c1252
-rw-r--r--src/gallium/drivers/ilo/core/ilo_state_raster.h301
-rw-r--r--src/gallium/drivers/ilo/core/ilo_state_sampler.c742
-rw-r--r--src/gallium/drivers/ilo/core/ilo_state_sampler.h103
-rw-r--r--src/gallium/drivers/ilo/core/ilo_state_sbe.c350
-rw-r--r--src/gallium/drivers/ilo/core/ilo_state_sbe.h103
-rw-r--r--src/gallium/drivers/ilo/core/ilo_state_shader.c737
-rw-r--r--src/gallium/drivers/ilo/core/ilo_state_shader.h256
-rw-r--r--src/gallium/drivers/ilo/core/ilo_state_shader_ps.c771
-rw-r--r--src/gallium/drivers/ilo/core/ilo_state_sol.c464
-rw-r--r--src/gallium/drivers/ilo/core/ilo_state_sol.h166
-rw-r--r--src/gallium/drivers/ilo/core/ilo_state_surface.c1179
-rw-r--r--src/gallium/drivers/ilo/core/ilo_state_surface.h121
-rw-r--r--src/gallium/drivers/ilo/core/ilo_state_surface_format.c351
-rw-r--r--src/gallium/drivers/ilo/core/ilo_state_urb.c769
-rw-r--r--src/gallium/drivers/ilo/core/ilo_state_urb.h103
-rw-r--r--src/gallium/drivers/ilo/core/ilo_state_vf.c984
-rw-r--r--src/gallium/drivers/ilo/core/ilo_state_vf.h228
-rw-r--r--src/gallium/drivers/ilo/core/ilo_state_viewport.c378
-rw-r--r--src/gallium/drivers/ilo/core/ilo_state_viewport.h132
-rw-r--r--src/gallium/drivers/ilo/core/ilo_state_zs.c727
-rw-r--r--src/gallium/drivers/ilo/core/ilo_state_zs.h93
-rw-r--r--src/gallium/drivers/ilo/genhw/gen_mi.xml.h3
-rw-r--r--src/gallium/drivers/ilo/genhw/gen_regs.xml.h2
-rw-r--r--src/gallium/drivers/ilo/genhw/gen_render_3d.xml.h316
-rw-r--r--src/gallium/drivers/ilo/genhw/gen_render_dynamic.xml.h80
-rw-r--r--src/gallium/drivers/ilo/genhw/gen_render_surface.xml.h7
-rw-r--r--src/gallium/drivers/ilo/genhw/genhw.h7
-rw-r--r--src/gallium/drivers/ilo/ilo_blitter.h38
-rw-r--r--src/gallium/drivers/ilo/ilo_blitter_pipe.c2
-rw-r--r--src/gallium/drivers/ilo/ilo_blitter_rectlist.c157
-rw-r--r--src/gallium/drivers/ilo/ilo_draw.c20
-rw-r--r--src/gallium/drivers/ilo/ilo_format.c356
-rw-r--r--src/gallium/drivers/ilo/ilo_format.h (renamed from src/gallium/drivers/ilo/core/ilo_format.h)4
-rw-r--r--src/gallium/drivers/ilo/ilo_render.c169
-rw-r--r--src/gallium/drivers/ilo/ilo_render.h3
-rw-r--r--src/gallium/drivers/ilo/ilo_render_dynamic.c179
-rw-r--r--src/gallium/drivers/ilo/ilo_render_gen.h29
-rw-r--r--src/gallium/drivers/ilo/ilo_render_gen6.c282
-rw-r--r--src/gallium/drivers/ilo/ilo_render_gen7.c334
-rw-r--r--src/gallium/drivers/ilo/ilo_render_gen8.c155
-rw-r--r--src/gallium/drivers/ilo/ilo_render_media.c3
-rw-r--r--src/gallium/drivers/ilo/ilo_render_surface.c118
-rw-r--r--src/gallium/drivers/ilo/ilo_resource.c54
-rw-r--r--src/gallium/drivers/ilo/ilo_screen.c27
-rw-r--r--src/gallium/drivers/ilo/ilo_shader.c479
-rw-r--r--src/gallium/drivers/ilo/ilo_shader.h42
-rw-r--r--src/gallium/drivers/ilo/ilo_state.c1449
-rw-r--r--src/gallium/drivers/ilo/ilo_state.h217
-rw-r--r--src/gallium/drivers/ilo/shader/ilo_shader_internal.h28
-rw-r--r--src/gallium/drivers/ilo/shader/toy_tgsi.c3
-rw-r--r--src/gallium/drivers/llvmpipe/lp_bld_depth.c4
-rw-r--r--src/gallium/drivers/llvmpipe/lp_public.h8
-rw-r--r--src/gallium/drivers/llvmpipe/lp_query.c2
-rw-r--r--src/gallium/drivers/llvmpipe/lp_screen.c6
-rw-r--r--src/gallium/drivers/llvmpipe/lp_setup.c15
-rw-r--r--src/gallium/drivers/llvmpipe/lp_state_fs.c25
-rw-r--r--src/gallium/drivers/llvmpipe/lp_state_sampler.c40
-rw-r--r--src/gallium/drivers/llvmpipe/lp_surface.c66
-rw-r--r--src/gallium/drivers/nouveau/Android.mk4
-rw-r--r--src/gallium/drivers/nouveau/Makefile.am2
-rw-r--r--src/gallium/drivers/nouveau/codegen/lib/gk110.asm18
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp14
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp8
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp11
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp25
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp1
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp2
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp22
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp130
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp7
-rw-r--r--src/gallium/drivers/nouveau/nouveau_buffer.c6
-rw-r--r--src/gallium/drivers/nouveau/nouveau_heap.h20
-rw-r--r--src/gallium/drivers/nouveau/nouveau_screen.c10
-rw-r--r--src/gallium/drivers/nouveau/nouveau_screen.h4
-rw-r--r--src/gallium/drivers/nouveau/nv30/nv30_clear.c2
-rw-r--r--src/gallium/drivers/nouveau/nv30/nv30_context.h2
-rw-r--r--src/gallium/drivers/nouveau/nv30/nv30_draw.c74
-rw-r--r--src/gallium/drivers/nouveau/nv30/nv30_fragprog.c5
-rw-r--r--src/gallium/drivers/nouveau/nv30/nv30_screen.c5
-rw-r--r--src/gallium/drivers/nouveau/nv30/nv30_state_validate.c26
-rw-r--r--src/gallium/drivers/nouveau/nv30/nv30_vbo.c2
-rw-r--r--src/gallium/drivers/nouveau/nv30/nv30_vertprog.c5
-rw-r--r--src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c19
-rw-r--r--src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c9
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_context.c11
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_context.h29
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_miptree.c4
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_query.c51
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_screen.c2
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_screen.h24
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_state.c4
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_state_validate.c2
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_vbo.c1
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_compute.c2
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_context.c11
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_context.h24
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c2
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_program.c21
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_query.c90
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_screen.c19
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_screen.h25
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c9
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_state.c22
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c4
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_surface.c6
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_tex.c2
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c1
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nve4_compute.c2
-rw-r--r--src/gallium/drivers/r300/r300_screen.c3
-rw-r--r--src/gallium/drivers/r300/r300_tgsi_to_rc.c8
-rw-r--r--src/gallium/drivers/r600/Android.mk4
-rw-r--r--src/gallium/drivers/r600/r600_pipe.c2
-rw-r--r--src/gallium/drivers/r600/r600_shader.c177
-rw-r--r--src/gallium/drivers/r600/r600_state_common.c31
-rw-r--r--src/gallium/drivers/radeon/Android.mk4
-rw-r--r--src/gallium/drivers/radeon/Makefile.sources1
-rw-r--r--src/gallium/drivers/radeon/r600_pipe_common.c5
-rw-r--r--src/gallium/drivers/radeon/radeon_llvm.h4
-rw-r--r--src/gallium/drivers/radeon/radeon_llvm_emit.c8
-rw-r--r--src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c56
-rw-r--r--src/gallium/drivers/radeon/radeon_vce.c60
-rw-r--r--src/gallium/drivers/radeon/radeon_vce.h10
-rw-r--r--src/gallium/drivers/radeon/radeon_vce_40_2_2.c32
-rw-r--r--src/gallium/drivers/radeon/radeon_vce_50.c228
-rw-r--r--src/gallium/drivers/radeonsi/Makefile.sources1
-rw-r--r--src/gallium/drivers/radeonsi/cik_sdma.c364
-rw-r--r--src/gallium/drivers/radeonsi/si_dma.c20
-rw-r--r--src/gallium/drivers/radeonsi/si_pipe.c4
-rw-r--r--src/gallium/drivers/radeonsi/si_pipe.h9
-rw-r--r--src/gallium/drivers/radeonsi/si_shader.c18
-rw-r--r--src/gallium/drivers/radeonsi/si_state.c42
-rw-r--r--src/gallium/drivers/radeonsi/si_state.h1
-rw-r--r--src/gallium/drivers/radeonsi/si_state_shaders.c65
-rw-r--r--src/gallium/drivers/radeonsi/sid.h31
-rw-r--r--src/gallium/drivers/rbug/rbug_public.h8
-rw-r--r--src/gallium/drivers/softpipe/sp_public.h8
-rw-r--r--src/gallium/drivers/softpipe/sp_query.c2
-rw-r--r--src/gallium/drivers/softpipe/sp_screen.c9
-rw-r--r--src/gallium/drivers/softpipe/sp_state_sampler.c12
-rw-r--r--src/gallium/drivers/softpipe/sp_tex_sample.c1228
-rw-r--r--src/gallium/drivers/softpipe/sp_tex_sample.h28
-rw-r--r--src/gallium/drivers/softpipe/sp_tex_tile_cache.c11
-rw-r--r--src/gallium/drivers/softpipe/sp_tex_tile_cache.h4
-rw-r--r--src/gallium/drivers/svga/svga_screen.c3
-rw-r--r--src/gallium/drivers/svga/svga_tgsi_insn.c2
-rw-r--r--src/gallium/drivers/trace/tr_context.c26
-rw-r--r--src/gallium/drivers/trace/tr_dump_state.c2
-rw-r--r--src/gallium/drivers/trace/tr_public.h2
-rw-r--r--src/gallium/drivers/vc4/Android.mk (renamed from src/gallium/drivers/vc4/kernel/Makefile.am)39
-rw-r--r--src/gallium/drivers/vc4/Makefile.am4
-rw-r--r--src/gallium/drivers/vc4/Makefile.sources7
-rw-r--r--src/gallium/drivers/vc4/kernel/Makefile.sources6
-rw-r--r--src/gallium/drivers/vc4/kernel/vc4_drv.h37
-rw-r--r--src/gallium/drivers/vc4/kernel/vc4_gem.c73
-rw-r--r--src/gallium/drivers/vc4/kernel/vc4_packet.h (renamed from src/gallium/drivers/vc4/vc4_packet.h)107
-rw-r--r--src/gallium/drivers/vc4/kernel/vc4_render_cl.c447
-rw-r--r--src/gallium/drivers/vc4/kernel/vc4_validate.c468
-rw-r--r--src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c116
-rw-r--r--src/gallium/drivers/vc4/vc4_blit.c106
-rw-r--r--src/gallium/drivers/vc4/vc4_bufmgr.c177
-rw-r--r--src/gallium/drivers/vc4/vc4_bufmgr.h4
-rw-r--r--src/gallium/drivers/vc4/vc4_cl.h2
-rw-r--r--src/gallium/drivers/vc4/vc4_cl_dump.c33
-rw-r--r--src/gallium/drivers/vc4/vc4_context.c297
-rw-r--r--src/gallium/drivers/vc4/vc4_context.h20
-rw-r--r--src/gallium/drivers/vc4/vc4_draw.c54
-rw-r--r--src/gallium/drivers/vc4/vc4_drm.h40
-rw-r--r--src/gallium/drivers/vc4/vc4_job.c97
-rw-r--r--src/gallium/drivers/vc4/vc4_opt_algebraic.c5
-rw-r--r--src/gallium/drivers/vc4/vc4_opt_constant_folding.c4
-rw-r--r--src/gallium/drivers/vc4/vc4_opt_copy_propagation.c5
-rw-r--r--src/gallium/drivers/vc4/vc4_opt_cse.c5
-rw-r--r--src/gallium/drivers/vc4/vc4_opt_dead_code.c2
-rw-r--r--src/gallium/drivers/vc4/vc4_opt_small_immediates.c5
-rw-r--r--src/gallium/drivers/vc4/vc4_opt_vpm_writes.c8
-rw-r--r--src/gallium/drivers/vc4/vc4_program.c17
-rw-r--r--src/gallium/drivers/vc4/vc4_qir.c18
-rw-r--r--src/gallium/drivers/vc4/vc4_qir.h10
-rw-r--r--src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c9
-rw-r--r--src/gallium/drivers/vc4/vc4_qpu_emit.c19
-rw-r--r--src/gallium/drivers/vc4/vc4_qpu_schedule.c64
-rw-r--r--src/gallium/drivers/vc4/vc4_query.c3
-rw-r--r--src/gallium/drivers/vc4/vc4_register_allocate.c9
-rw-r--r--src/gallium/drivers/vc4/vc4_reorder_uniforms.c4
-rw-r--r--src/gallium/drivers/vc4/vc4_resource.c39
-rw-r--r--src/gallium/drivers/vc4/vc4_resource.h9
-rw-r--r--src/gallium/drivers/vc4/vc4_screen.c4
-rw-r--r--src/gallium/drivers/vc4/vc4_screen.h12
-rw-r--r--src/gallium/drivers/vc4/vc4_simulator.c17
-rw-r--r--src/gallium/drivers/vc4/vc4_simulator_validate.h22
-rw-r--r--src/gallium/drivers/vc4/vc4_state.c23
256 files changed, 22026 insertions, 16637 deletions
diff --git a/src/gallium/drivers/freedreno/Makefile.am b/src/gallium/drivers/freedreno/Makefile.am
index 4b2629f77bd..cbf62c6daae 100644
--- a/src/gallium/drivers/freedreno/Makefile.am
+++ b/src/gallium/drivers/freedreno/Makefile.am
@@ -21,15 +21,16 @@ libfreedreno_la_SOURCES = \
noinst_PROGRAMS = ir3_compiler
+# XXX: Required due to the C++ sources in libnir/libglsl_util
+nodist_EXTRA_ir3_compiler_SOURCES = dummy.cpp
ir3_compiler_SOURCES = \
ir3/ir3_cmdline.c
ir3_compiler_LDADD = \
libfreedreno.la \
- ../../auxiliary/libgallium.la \
+ $(top_builddir)/src/gallium/auxiliary/libgallium.la \
$(top_builddir)/src/glsl/libnir.la \
$(top_builddir)/src/libglsl_util.la \
- -lstdc++ \
$(top_builddir)/src/util/libmesautil.la \
$(GALLIUM_COMMON_LIB_DEPS) \
$(FREEDRENO_LIBS)
diff --git a/src/gallium/drivers/freedreno/Makefile.sources b/src/gallium/drivers/freedreno/Makefile.sources
index a565a9c4e4d..baae9144005 100644
--- a/src/gallium/drivers/freedreno/Makefile.sources
+++ b/src/gallium/drivers/freedreno/Makefile.sources
@@ -120,18 +120,17 @@ ir3_SOURCES := \
ir3/disasm-a3xx.c \
ir3/instr-a3xx.h \
ir3/ir3.c \
- ir3/ir3_compiler.c \
ir3/ir3_compiler_nir.c \
+ ir3/ir3_compiler.c \
ir3/ir3_compiler.h \
ir3/ir3_cp.c \
ir3/ir3_depth.c \
- ir3/ir3_dump.c \
- ir3/ir3_flatten.c \
ir3/ir3_group.c \
ir3/ir3.h \
ir3/ir3_legalize.c \
ir3/ir3_nir.h \
ir3/ir3_nir_lower_if_else.c \
+ ir3/ir3_print.c \
ir3/ir3_ra.c \
ir3/ir3_sched.c \
ir3/ir3_shader.c \
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c b/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c
index e4acc7e95b4..b48fb4659cd 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c
@@ -414,32 +414,16 @@ add_src_reg(struct fd2_compile_context *ctx, struct ir2_instruction *alu,
static void
add_vector_clamp(struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
{
- switch (inst->Instruction.Saturate) {
- case TGSI_SAT_NONE:
- break;
- case TGSI_SAT_ZERO_ONE:
+ if (inst->Instruction.Saturate) {
alu->alu.vector_clamp = true;
- break;
- case TGSI_SAT_MINUS_PLUS_ONE:
- DBG("unsupported saturate");
- assert(0);
- break;
}
}
static void
add_scalar_clamp(struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
{
- switch (inst->Instruction.Saturate) {
- case TGSI_SAT_NONE:
- break;
- case TGSI_SAT_ZERO_ONE:
+ if (inst->Instruction.Saturate) {
alu->alu.scalar_clamp = true;
- break;
- case TGSI_SAT_MINUS_PLUS_ONE:
- DBG("unsupported saturate");
- assert(0);
- break;
}
}
@@ -758,7 +742,7 @@ translate_tex(struct fd2_compile_context *ctx,
struct tgsi_src_register tmp_src;
const struct tgsi_src_register *coord;
bool using_temp = (inst->Dst[0].Register.File == TGSI_FILE_OUTPUT) ||
- (inst->Instruction.Saturate != TGSI_SAT_NONE);
+ inst->Instruction.Saturate;
int idx;
if (using_temp || (opc == TGSI_OPCODE_TXP))
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_context.h b/src/gallium/drivers/freedreno/a3xx/fd3_context.h
index 4e3f521716e..77e4605e550 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_context.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_context.h
@@ -105,9 +105,6 @@ struct fd3_context {
*/
unsigned fsaturate_s, fsaturate_t, fsaturate_r;
- /* bitmask of integer texture samplers */
- uint16_t vinteger_s, finteger_s;
-
/* some state changes require a different shader variant. Keep
* track of this so we know when we need to re-emit shader state
* due to variant change. See fixup_shader_state()
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
index b522cf86695..b5838b58eb2 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
@@ -104,14 +104,12 @@ fixup_shader_state(struct fd_context *ctx, struct ir3_shader_key *key)
if (last_key->has_per_samp || key->has_per_samp) {
if ((last_key->vsaturate_s != key->vsaturate_s) ||
(last_key->vsaturate_t != key->vsaturate_t) ||
- (last_key->vsaturate_r != key->vsaturate_r) ||
- (last_key->vinteger_s != key->vinteger_s))
+ (last_key->vsaturate_r != key->vsaturate_r))
ctx->prog.dirty |= FD_SHADER_DIRTY_VP;
if ((last_key->fsaturate_s != key->fsaturate_s) ||
(last_key->fsaturate_t != key->fsaturate_t) ||
- (last_key->fsaturate_r != key->fsaturate_r) ||
- (last_key->finteger_s != key->finteger_s))
+ (last_key->fsaturate_r != key->fsaturate_r))
ctx->prog.dirty |= FD_SHADER_DIRTY_FP;
}
@@ -140,16 +138,13 @@ fd3_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info)
// TODO set .half_precision based on render target format,
// ie. float16 and smaller use half, float32 use full..
.half_precision = !!(fd_mesa_debug & FD_DBG_FRAGHALF),
- .has_per_samp = (fd3_ctx->fsaturate || fd3_ctx->vsaturate ||
- fd3_ctx->vinteger_s || fd3_ctx->finteger_s),
+ .has_per_samp = (fd3_ctx->fsaturate || fd3_ctx->vsaturate),
.vsaturate_s = fd3_ctx->vsaturate_s,
.vsaturate_t = fd3_ctx->vsaturate_t,
.vsaturate_r = fd3_ctx->vsaturate_r,
.fsaturate_s = fd3_ctx->fsaturate_s,
.fsaturate_t = fd3_ctx->fsaturate_t,
.fsaturate_r = fd3_ctx->fsaturate_r,
- .vinteger_s = fd3_ctx->vinteger_s,
- .finteger_s = fd3_ctx->finteger_s,
},
.rasterflat = ctx->rasterizer && ctx->rasterizer->flatshade,
.sprite_coord_enable = ctx->rasterizer ? ctx->rasterizer->sprite_coord_enable : 0,
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.c b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
index a6824ef92e7..57fcaa9020e 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
@@ -413,12 +413,15 @@ fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit,
}
}
- /* TODO: Figure out if there's a way to make it spit out 0's and
- * 1's for the .z and .w components.
+ /* Replace the .xy coordinates with S/T from the point sprite. Set
+ * interpolation bits for .zw such that they become .01
*/
- if (emit->sprite_coord_enable & (1 << sem2idx(fp->inputs[j].semantic)))
+ if (emit->sprite_coord_enable & (1 << sem2idx(fp->inputs[j].semantic))) {
vpsrepl[inloc / 16] |= (emit->sprite_coord_mode ? 0x0d : 0x09)
<< ((inloc % 16) * 2);
+ vinterp[(inloc + 2) / 16] |= 2 << (((inloc + 2) % 16) * 2);
+ vinterp[(inloc + 3) / 16] |= 3 << (((inloc + 3) % 16) * 2);
+ }
}
OUT_PKT0(ring, REG_A3XX_VPC_ATTR, 2);
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_screen.c b/src/gallium/drivers/freedreno/a3xx/fd3_screen.c
index 3497921257c..094dcf376e5 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_screen.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_screen.c
@@ -32,6 +32,7 @@
#include "fd3_screen.h"
#include "fd3_context.h"
#include "fd3_format.h"
+#include "ir3_compiler.h"
static boolean
fd3_screen_is_format_supported(struct pipe_screen *pscreen,
@@ -103,7 +104,9 @@ fd3_screen_is_format_supported(struct pipe_screen *pscreen,
void
fd3_screen_init(struct pipe_screen *pscreen)
{
- fd_screen(pscreen)->max_rts = 4;
+ struct fd_screen *screen = fd_screen(pscreen);
+ screen->max_rts = 4;
+ screen->compiler = ir3_compiler_create(screen->gpu_id);
pscreen->context_create = fd3_context_create;
pscreen->is_format_supported = fd3_screen_is_format_supported;
}
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_texture.c b/src/gallium/drivers/freedreno/a3xx/fd3_texture.c
index 6f44ee3c08e..a278bf5c603 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_texture.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_texture.c
@@ -263,44 +263,11 @@ fd3_sampler_view_create(struct pipe_context *pctx, struct pipe_resource *prsc,
return &so->base;
}
-static void
-fd3_set_sampler_views(struct pipe_context *pctx, unsigned shader,
- unsigned start, unsigned nr,
- struct pipe_sampler_view **views)
-{
- struct fd_context *ctx = fd_context(pctx);
- struct fd3_context *fd3_ctx = fd3_context(ctx);
- struct fd_texture_stateobj *tex;
- uint16_t integer_s = 0, *ptr;
- int i;
-
- fd_set_sampler_views(pctx, shader, start, nr, views);
-
- switch (shader) {
- case PIPE_SHADER_FRAGMENT:
- tex = &ctx->fragtex;
- ptr = &fd3_ctx->finteger_s;
- break;
- case PIPE_SHADER_VERTEX:
- tex = &ctx->verttex;
- ptr = &fd3_ctx->vinteger_s;
- break;
- default:
- return;
- }
-
- for (i = 0; i < tex->num_textures; i++)
- if (util_format_is_pure_integer(tex->textures[i]->format))
- integer_s |= 1 << i;
- *ptr = integer_s;
-}
-
-
void
fd3_texture_init(struct pipe_context *pctx)
{
pctx->create_sampler_state = fd3_sampler_state_create;
pctx->bind_sampler_states = fd3_sampler_states_bind;
pctx->create_sampler_view = fd3_sampler_view_create;
- pctx->set_sampler_views = fd3_set_sampler_views;
+ pctx->set_sampler_views = fd_set_sampler_views;
}
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_context.h b/src/gallium/drivers/freedreno/a4xx/fd4_context.h
index 384602a2e4f..53e1bf6a2e6 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_context.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_context.h
@@ -83,9 +83,6 @@ struct fd4_context {
*/
uint16_t fsaturate_s, fsaturate_t, fsaturate_r;
- /* bitmask of integer texture samplers */
- uint16_t vinteger_s, finteger_s;
-
/* some state changes require a different shader variant. Keep
* track of this so we know when we need to re-emit shader state
* due to variant change. See fixup_shader_state()
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
index ae407f753fe..de5a306af60 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
@@ -82,8 +82,7 @@ fixup_shader_state(struct fd_context *ctx, struct ir3_shader_key *key)
if (last_key->has_per_samp || key->has_per_samp) {
if ((last_key->vsaturate_s != key->vsaturate_s) ||
(last_key->vsaturate_t != key->vsaturate_t) ||
- (last_key->vsaturate_r != key->vsaturate_r) ||
- (last_key->vinteger_s != key->vinteger_s))
+ (last_key->vsaturate_r != key->vsaturate_r))
ctx->prog.dirty |= FD_SHADER_DIRTY_VP;
if ((last_key->fsaturate_s != key->fsaturate_s) ||
@@ -122,16 +121,13 @@ fd4_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info)
// TODO set .half_precision based on render target format,
// ie. float16 and smaller use half, float32 use full..
.half_precision = !!(fd_mesa_debug & FD_DBG_FRAGHALF),
- .has_per_samp = (fd4_ctx->fsaturate || fd4_ctx->vsaturate ||
- fd4_ctx->vinteger_s || fd4_ctx->finteger_s),
+ .has_per_samp = (fd4_ctx->fsaturate || fd4_ctx->vsaturate),
.vsaturate_s = fd4_ctx->vsaturate_s,
.vsaturate_t = fd4_ctx->vsaturate_t,
.vsaturate_r = fd4_ctx->vsaturate_r,
.fsaturate_s = fd4_ctx->fsaturate_s,
.fsaturate_t = fd4_ctx->fsaturate_t,
.fsaturate_r = fd4_ctx->fsaturate_r,
- .vinteger_s = fd4_ctx->vinteger_s,
- .finteger_s = fd4_ctx->finteger_s,
},
.format = fd4_emit_format(pfb->cbufs[0]),
.pformat = pipe_surface_format(pfb->cbufs[0]),
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_screen.c b/src/gallium/drivers/freedreno/a4xx/fd4_screen.c
index f5b46685bdf..e8cbb2d201a 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_screen.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_screen.c
@@ -32,6 +32,7 @@
#include "fd4_screen.h"
#include "fd4_context.h"
#include "fd4_format.h"
+#include "ir3_compiler.h"
static boolean
fd4_screen_is_format_supported(struct pipe_screen *pscreen,
@@ -100,7 +101,9 @@ fd4_screen_is_format_supported(struct pipe_screen *pscreen,
void
fd4_screen_init(struct pipe_screen *pscreen)
{
- fd_screen(pscreen)->max_rts = 1;
+ struct fd_screen *screen = fd_screen(pscreen);
+ screen->max_rts = 1;
+ screen->compiler = ir3_compiler_create(screen->gpu_id);
pscreen->context_create = fd4_context_create;
pscreen->is_format_supported = fd4_screen_is_format_supported;
}
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_texture.c b/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
index ff1ff8f0d34..6ba25d0816d 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
@@ -205,43 +205,11 @@ fd4_sampler_view_create(struct pipe_context *pctx, struct pipe_resource *prsc,
return &so->base;
}
-static void
-fd4_set_sampler_views(struct pipe_context *pctx, unsigned shader,
- unsigned start, unsigned nr, struct pipe_sampler_view **views)
-{
- struct fd_context *ctx = fd_context(pctx);
- struct fd4_context *fd4_ctx = fd4_context(ctx);
- struct fd_texture_stateobj *tex;
- uint16_t integer_s = 0, *ptr;
- int i;
-
- fd_set_sampler_views(pctx, shader, start, nr, views);
-
- switch (shader) {
- case PIPE_SHADER_FRAGMENT:
- tex = &ctx->fragtex;
- ptr = &fd4_ctx->finteger_s;
- break;
- case PIPE_SHADER_VERTEX:
- tex = &ctx->verttex;
- ptr = &fd4_ctx->vinteger_s;
- break;
- default:
- return;
- }
-
- for (i = 0; i < tex->num_textures; i++)
- if (util_format_is_pure_integer(tex->textures[i]->format))
- integer_s |= 1 << i;
-
- *ptr = integer_s;
-}
-
void
fd4_texture_init(struct pipe_context *pctx)
{
pctx->create_sampler_state = fd4_sampler_state_create;
pctx->bind_sampler_states = fd_sampler_states_bind;
pctx->create_sampler_view = fd4_sampler_view_create;
- pctx->set_sampler_views = fd4_set_sampler_views;
+ pctx->set_sampler_views = fd_set_sampler_views;
}
diff --git a/src/gallium/drivers/freedreno/freedreno_context.h b/src/gallium/drivers/freedreno/freedreno_context.h
index 2c816b4b1f6..e420f1e5bd9 100644
--- a/src/gallium/drivers/freedreno/freedreno_context.h
+++ b/src/gallium/drivers/freedreno/freedreno_context.h
@@ -297,7 +297,7 @@ struct fd_context {
*/
struct fd_gmem_stateobj gmem;
struct fd_vsc_pipe pipe[8];
- struct fd_tile tile[64];
+ struct fd_tile tile[256];
/* which state objects need to be re-emit'd: */
enum {
diff --git a/src/gallium/drivers/freedreno/freedreno_fence.c b/src/gallium/drivers/freedreno/freedreno_fence.c
index 46b057d9062..375e58f7022 100644
--- a/src/gallium/drivers/freedreno/freedreno_fence.c
+++ b/src/gallium/drivers/freedreno/freedreno_fence.c
@@ -35,6 +35,7 @@
struct pipe_fence_handle {
struct pipe_reference reference;
struct fd_context *ctx;
+ struct fd_screen *screen;
uint32_t timestamp;
};
@@ -68,7 +69,7 @@ boolean fd_screen_fence_finish(struct pipe_screen *screen,
struct pipe_fence_handle *fence,
uint64_t timeout)
{
- if (fd_pipe_wait(fence->ctx->screen->pipe, fence->timestamp))
+ if (fd_pipe_wait(fence->screen->pipe, fence->timestamp))
return false;
return true;
@@ -86,6 +87,7 @@ struct pipe_fence_handle * fd_fence_create(struct pipe_context *pctx)
pipe_reference_init(&fence->reference, 1);
fence->ctx = ctx;
+ fence->screen = ctx->screen;
fence->timestamp = fd_ringbuffer_timestamp(ctx->ring);
return fence;
diff --git a/src/gallium/drivers/freedreno/freedreno_gmem.c b/src/gallium/drivers/freedreno/freedreno_gmem.c
index 11a1b62b26b..c105378ec4e 100644
--- a/src/gallium/drivers/freedreno/freedreno_gmem.c
+++ b/src/gallium/drivers/freedreno/freedreno_gmem.c
@@ -117,6 +117,7 @@ calculate_tiles(struct fd_context *ctx)
uint32_t i, j, t, xoff, yoff;
uint32_t tpp_x, tpp_y;
bool has_zs = !!(ctx->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL));
+ int tile_n[ARRAY_SIZE(ctx->pipe)];
if (has_zs) {
struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture);
@@ -247,6 +248,7 @@ calculate_tiles(struct fd_context *ctx)
/* configure tiles: */
t = 0;
yoff = miny;
+ memset(tile_n, 0, sizeof(tile_n));
for (i = 0; i < nbins_y; i++) {
uint32_t bw, bh;
@@ -257,20 +259,17 @@ calculate_tiles(struct fd_context *ctx)
for (j = 0; j < nbins_x; j++) {
struct fd_tile *tile = &ctx->tile[t];
- uint32_t n, p;
+ uint32_t p;
assert(t < ARRAY_SIZE(ctx->tile));
/* pipe number: */
p = ((i / tpp_y) * div_round_up(nbins_x, tpp_x)) + (j / tpp_x);
- /* slot number: */
- n = ((i % tpp_y) * tpp_x) + (j % tpp_x);
-
/* clip bin width: */
bw = MIN2(bin_w, minx + width - xoff);
- tile->n = n;
+ tile->n = tile_n[p]++;
tile->p = p;
tile->bin_w = bw;
tile->bin_h = bh;
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index 556c8ab18d4..b3b5462b437 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -68,10 +68,7 @@ static const struct debug_named_value debug_options[] = {
{"fraghalf", FD_DBG_FRAGHALF, "Use half-precision in fragment shader"},
{"nobin", FD_DBG_NOBIN, "Disable hw binning"},
{"optmsgs", FD_DBG_OPTMSGS,"Enable optimizer debug messages"},
- {"optdump", FD_DBG_OPTDUMP,"Dump shader DAG to .dot files"},
{"glsl120", FD_DBG_GLSL120,"Temporary flag to force GLSL 120 (rather than 130) on a3xx+"},
- {"nocp", FD_DBG_NOCP, "Disable copy-propagation"},
- {"nir", FD_DBG_NIR, "Enable experimental NIR compiler"},
DEBUG_NAMED_VALUE_END
};
@@ -220,6 +217,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_POLYGON_OFFSET_CLAMP:
case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
+ case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
return 0;
case PIPE_CAP_MAX_VIEWPORTS:
@@ -374,6 +372,7 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+ case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
return 0;
case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED:
return 1;
@@ -519,6 +518,7 @@ fd_screen_create(struct fd_device *dev)
case 220:
fd2_screen_init(pscreen);
break;
+ case 307:
case 320:
case 330:
fd3_screen_init(pscreen);
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.h b/src/gallium/drivers/freedreno/freedreno_screen.h
index 3b470d1d8a6..dbc2808262a 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.h
+++ b/src/gallium/drivers/freedreno/freedreno_screen.h
@@ -46,7 +46,9 @@ struct fd_screen {
uint32_t device_id;
uint32_t gpu_id; /* 220, 305, etc */
uint32_t chip_id; /* coreid:8 majorrev:8 minorrev:8 patch:8 */
- uint32_t max_rts;
+ uint32_t max_rts; /* max # of render targets */
+
+ void *compiler; /* currently unused for a2xx */
struct fd_device *dev;
struct fd_pipe *pipe;
diff --git a/src/gallium/drivers/freedreno/freedreno_util.h b/src/gallium/drivers/freedreno/freedreno_util.h
index 2735ae41315..deb0e602ce2 100644
--- a/src/gallium/drivers/freedreno/freedreno_util.h
+++ b/src/gallium/drivers/freedreno/freedreno_util.h
@@ -62,11 +62,8 @@ enum adreno_stencil_op fd_stencil_op(unsigned op);
#define FD_DBG_NOBYPASS 0x0040
#define FD_DBG_FRAGHALF 0x0080
#define FD_DBG_NOBIN 0x0100
-#define FD_DBG_OPTMSGS 0x0400
-#define FD_DBG_OPTDUMP 0x0800
-#define FD_DBG_GLSL120 0x1000
-#define FD_DBG_NOCP 0x2000
-#define FD_DBG_NIR 0x4000
+#define FD_DBG_OPTMSGS 0x0200
+#define FD_DBG_GLSL120 0x0400
extern int fd_mesa_debug;
extern bool fd_binning_enabled;
diff --git a/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
index a5136c6bd3d..48ae7c71b9f 100644
--- a/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
+++ b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
@@ -133,16 +133,16 @@ static void print_instr_cat0(instr_t *instr)
break;
case OPC_BR:
printf(" %sp0.%c, #%d", cat0->inv ? "!" : "",
- component[cat0->comp], cat0->immed);
+ component[cat0->comp], cat0->a3xx.immed);
break;
case OPC_JUMP:
case OPC_CALL:
- printf(" #%d", cat0->immed);
+ printf(" #%d", cat0->a3xx.immed);
break;
}
- if ((debug & PRINT_VERBOSE) && (cat0->dummy1|cat0->dummy2|cat0->dummy3|cat0->dummy4))
- printf("\t{0: %x,%x,%x,%x}", cat0->dummy1, cat0->dummy2, cat0->dummy3, cat0->dummy4);
+ if ((debug & PRINT_VERBOSE) && (cat0->a3xx.dummy1|cat0->dummy2|cat0->dummy3|cat0->dummy4))
+ printf("\t{0: %x,%x,%x,%x}", cat0->a3xx.dummy1, cat0->dummy2, cat0->dummy3, cat0->dummy4);
}
static void print_instr_cat1(instr_t *instr)
diff --git a/src/gallium/drivers/freedreno/ir3/instr-a3xx.h b/src/gallium/drivers/freedreno/ir3/instr-a3xx.h
index cffa62b6f34..efb07ea479e 100644
--- a/src/gallium/drivers/freedreno/ir3/instr-a3xx.h
+++ b/src/gallium/drivers/freedreno/ir3/instr-a3xx.h
@@ -191,9 +191,9 @@ typedef enum {
OPC_LDLV = 31,
/* meta instructions (category -1): */
- /* placeholder instr to mark inputs/outputs: */
+ /* placeholder instr to mark shader inputs: */
OPC_META_INPUT = 0,
- OPC_META_OUTPUT = 1,
+ OPC_META_PHI = 1,
/* The "fan-in" and "fan-out" instructions are used for keeping
* track of instructions that write to multiple dst registers
* (fan-out) like texture sample instructions, or read multiple
@@ -201,9 +201,6 @@ typedef enum {
*/
OPC_META_FO = 2,
OPC_META_FI = 3,
- /* branches/flow control */
- OPC_META_FLOW = 4,
- OPC_META_PHI = 5,
} opc_t;
@@ -281,8 +278,16 @@ static inline int reg_special(reg_t reg)
typedef struct PACKED {
/* dword0: */
- int16_t immed : 16;
- uint32_t dummy1 : 16;
+ union PACKED {
+ struct PACKED {
+ int16_t immed : 16;
+ uint32_t dummy1 : 16;
+ } a3xx;
+ struct PACKED {
+ int32_t immed : 20;
+ uint32_t dummy1 : 12;
+ } a4xx;
+ };
/* dword1: */
uint32_t dummy2 : 8;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c
index e015de91c33..a166b67d7cf 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3.c
@@ -66,11 +66,22 @@ void * ir3_alloc(struct ir3 *shader, int sz)
return ptr;
}
-struct ir3 * ir3_create(void)
+struct ir3 * ir3_create(struct ir3_compiler *compiler,
+ unsigned nin, unsigned nout)
{
- struct ir3 *shader =
- calloc(1, sizeof(struct ir3));
+ struct ir3 *shader = calloc(1, sizeof(struct ir3));
+
grow_heap(shader);
+
+ shader->compiler = compiler;
+ shader->ninputs = nin;
+ shader->inputs = ir3_alloc(shader, sizeof(shader->inputs[0]) * nin);
+
+ shader->noutputs = nout;
+ shader->outputs = ir3_alloc(shader, sizeof(shader->outputs[0]) * nout);
+
+ list_inithead(&shader->block_list);
+
return shader;
}
@@ -81,7 +92,8 @@ void ir3_destroy(struct ir3 *shader)
shader->chunk = chunk->next;
free(chunk);
}
- free(shader->instrs);
+ free(shader->indirects);
+ free(shader->predicates);
free(shader->baryfs);
free(shader);
}
@@ -142,7 +154,11 @@ static int emit_cat0(struct ir3_instruction *instr, void *ptr,
{
instr_cat0_t *cat0 = ptr;
- cat0->immed = instr->cat0.immed;
+ if (info->gpu_id >= 400) {
+ cat0->a4xx.immed = instr->cat0.immed;
+ } else {
+ cat0->a3xx.immed = instr->cat0.immed;
+ }
cat0->repeat = instr->repeat;
cat0->ss = !!(instr->flags & IR3_INSTR_SS);
cat0->inv = instr->cat0.inv;
@@ -535,32 +551,40 @@ void * ir3_assemble(struct ir3 *shader, struct ir3_info *info,
uint32_t gpu_id)
{
uint32_t *ptr, *dwords;
- uint32_t i;
+ info->gpu_id = gpu_id;
info->max_reg = -1;
info->max_half_reg = -1;
info->max_const = -1;
info->instrs_count = 0;
+ info->sizedwords = 0;
+
+ list_for_each_entry (struct ir3_block, block, &shader->block_list, node) {
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+ info->sizedwords += 2;
+ }
+ }
/* need a integer number of instruction "groups" (sets of 16
* instructions on a4xx or sets of 4 instructions on a3xx),
* so pad out w/ NOPs if needed: (NOTE each instruction is 64bits)
*/
if (gpu_id >= 400) {
- info->sizedwords = 2 * align(shader->instrs_count, 16);
+ info->sizedwords = align(info->sizedwords, 16 * 2);
} else {
- info->sizedwords = 2 * align(shader->instrs_count, 4);
+ info->sizedwords = align(info->sizedwords, 4 * 2);
}
ptr = dwords = calloc(4, info->sizedwords);
- for (i = 0; i < shader->instrs_count; i++) {
- struct ir3_instruction *instr = shader->instrs[i];
- int ret = emit[instr->category](instr, dwords, info);
- if (ret)
- goto fail;
- info->instrs_count += 1 + instr->repeat;
- dwords += 2;
+ list_for_each_entry (struct ir3_block, block, &shader->block_list, node) {
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+ int ret = emit[instr->category](instr, dwords, info);
+ if (ret)
+ goto fail;
+ info->instrs_count += 1 + instr->repeat;
+ dwords += 2;
+ }
}
return ptr;
@@ -581,50 +605,30 @@ static struct ir3_register * reg_create(struct ir3 *shader,
return reg;
}
-static void insert_instr(struct ir3 *shader,
+static void insert_instr(struct ir3_block *block,
struct ir3_instruction *instr)
{
+ struct ir3 *shader = block->shader;
#ifdef DEBUG
static uint32_t serialno = 0;
instr->serialno = ++serialno;
#endif
- array_insert(shader->instrs, instr);
+ list_addtail(&instr->node, &block->instr_list);
if (is_input(instr))
array_insert(shader->baryfs, instr);
}
-struct ir3_block * ir3_block_create(struct ir3 *shader,
- unsigned ntmp, unsigned nin, unsigned nout)
+struct ir3_block * ir3_block_create(struct ir3 *shader)
{
- struct ir3_block *block;
- unsigned size;
- char *ptr;
-
- size = sizeof(*block);
- size += sizeof(block->temporaries[0]) * ntmp;
- size += sizeof(block->inputs[0]) * nin;
- size += sizeof(block->outputs[0]) * nout;
-
- ptr = ir3_alloc(shader, size);
-
- block = (void *)ptr;
- ptr += sizeof(*block);
-
- block->temporaries = (void *)ptr;
- block->ntemporaries = ntmp;
- ptr += sizeof(block->temporaries[0]) * ntmp;
-
- block->inputs = (void *)ptr;
- block->ninputs = nin;
- ptr += sizeof(block->inputs[0]) * nin;
-
- block->outputs = (void *)ptr;
- block->noutputs = nout;
- ptr += sizeof(block->outputs[0]) * nout;
-
+ struct ir3_block *block = ir3_alloc(shader, sizeof(*block));
+#ifdef DEBUG
+ static uint32_t serialno = 0;
+ block->serialno = ++serialno;
+#endif
block->shader = shader;
-
+ list_inithead(&block->node);
+ list_inithead(&block->instr_list);
return block;
}
@@ -652,7 +656,7 @@ struct ir3_instruction * ir3_instr_create2(struct ir3_block *block,
instr->block = block;
instr->category = category;
instr->opc = opc;
- insert_instr(block->shader, instr);
+ insert_instr(block, instr);
return instr;
}
@@ -677,7 +681,7 @@ struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr)
*new_instr = *instr;
new_instr->regs = regs;
- insert_instr(instr->block->shader, new_instr);
+ insert_instr(instr->block, new_instr);
/* clone registers: */
new_instr->regs_count = 0;
@@ -694,10 +698,40 @@ struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr)
struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
int num, int flags)
{
- struct ir3_register *reg = reg_create(instr->block->shader, num, flags);
+ struct ir3 *shader = instr->block->shader;
+ struct ir3_register *reg = reg_create(shader, num, flags);
#ifdef DEBUG
debug_assert(instr->regs_count < instr->regs_max);
#endif
instr->regs[instr->regs_count++] = reg;
return reg;
}
+
+void
+ir3_block_clear_mark(struct ir3_block *block)
+{
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node)
+ instr->flags &= ~IR3_INSTR_MARK;
+}
+
+void
+ir3_clear_mark(struct ir3 *ir)
+{
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ ir3_block_clear_mark(block);
+ }
+}
+
+/* note: this will destroy instr->depth, don't do it until after sched! */
+void
+ir3_count_instructions(struct ir3 *ir)
+{
+ unsigned ip = 0;
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+ instr->ip = ip++;
+ }
+ block->start_ip = list_first_entry(&block->instr_list, struct ir3_instruction, node)->ip;
+ block->end_ip = list_last_entry(&block->instr_list, struct ir3_instruction, node)->ip;
+ }
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
index c0a14a07d48..9c35a763d58 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -28,17 +28,20 @@
#include <stdbool.h>
#include "util/u_debug.h"
+#include "util/list.h"
#include "instr-a3xx.h"
#include "disasm.h" /* TODO move 'enum shader_t' somewhere else.. */
/* low level intermediate representation of an adreno shader program */
+struct ir3_compiler;
struct ir3;
struct ir3_instruction;
struct ir3_block;
struct ir3_info {
+ uint32_t gpu_id;
uint16_t sizedwords;
uint16_t instrs_count; /* expanded to account for rpt's */
/* NOTE: max_reg, etc, does not include registers not touched
@@ -80,8 +83,8 @@ struct ir3_register {
* before register assignment is done:
*/
IR3_REG_SSA = 0x2000, /* 'instr' is ptr to assigning instr */
- IR3_REG_IA = 0x4000, /* meta-input dst is "assigned" */
- IR3_REG_ADDR = 0x8000, /* register is a0.x */
+ IR3_REG_PHI_SRC= 0x4000, /* phi src, regs[0]->instr points to phi */
+
} flags;
union {
/* normal registers:
@@ -185,6 +188,7 @@ struct ir3_instruction {
char inv;
char comp;
int immed;
+ struct ir3_block *target;
} cat0;
struct {
type_t src_type, dst_type;
@@ -218,14 +222,14 @@ struct ir3_instruction {
int aid;
} fi;
struct {
- struct ir3_block *if_block, *else_block;
- } flow;
+ /* used to temporarily hold reference to nir_phi_instr
+ * until we resolve the phi srcs
+ */
+ void *nphi;
+ } phi;
struct {
struct ir3_block *block;
} inout;
-
- /* XXX keep this as big as all other union members! */
- uint32_t info[3];
};
/* transient values used during various algorithms: */
@@ -243,6 +247,13 @@ struct ir3_instruction {
*/
#define DEPTH_UNUSED ~0
unsigned depth;
+ /* When we get to the RA stage, we no longer need depth, but
+ * we do need instruction's position/name:
+ */
+ struct {
+ uint16_t ip;
+ uint16_t name;
+ };
};
/* Used during CP and RA stages. For fanin and shader inputs/
@@ -290,7 +301,9 @@ struct ir3_instruction {
*/
struct ir3_instruction *fanin;
- struct ir3_instruction *next;
+ /* Entry in ir3_block's instruction list: */
+ struct list_head node;
+
#ifdef DEBUG
uint32_t serialno;
#endif
@@ -321,8 +334,11 @@ static inline int ir3_neighbor_count(struct ir3_instruction *instr)
struct ir3_heap_chunk;
struct ir3 {
- unsigned instrs_count, instrs_sz;
- struct ir3_instruction **instrs;
+ struct ir3_compiler *compiler;
+
+ unsigned ninputs, noutputs;
+ struct ir3_instruction **inputs;
+ struct ir3_instruction **outputs;
/* Track bary.f (and ldlv) instructions.. this is needed in
* scheduling to ensure that all varying fetches happen before
@@ -345,33 +361,54 @@ struct ir3 {
*/
unsigned indirects_count, indirects_sz;
struct ir3_instruction **indirects;
+ /* and same for instructions that consume predicate register: */
+ unsigned predicates_count, predicates_sz;
+ struct ir3_instruction **predicates;
+
+ /* List of blocks: */
+ struct list_head block_list;
- struct ir3_block *block;
unsigned heap_idx;
struct ir3_heap_chunk *chunk;
};
+typedef struct nir_block nir_block;
+
struct ir3_block {
+ struct list_head node;
struct ir3 *shader;
- unsigned ntemporaries, ninputs, noutputs;
- /* maps TGSI_FILE_TEMPORARY index back to the assigning instruction: */
- struct ir3_instruction **temporaries;
- struct ir3_instruction **inputs;
- struct ir3_instruction **outputs;
- /* only a single address register: */
- struct ir3_instruction *address;
- struct ir3_block *parent;
- struct ir3_instruction *head;
+
+ nir_block *nblock;
+
+ struct list_head instr_list; /* list of ir3_instruction */
+
+ /* each block has either one or two successors.. in case of
+ * two successors, 'condition' decides which one to follow.
+ * A block preceding an if/else has two successors.
+ */
+ struct ir3_instruction *condition;
+ struct ir3_block *successors[2];
+
+ uint16_t start_ip, end_ip;
+
+ /* used for per-pass extra block data. Mainly used right
+ * now in RA step to track livein/liveout.
+ */
+ void *bd;
+
+#ifdef DEBUG
+ uint32_t serialno;
+#endif
};
-struct ir3 * ir3_create(void);
+struct ir3 * ir3_create(struct ir3_compiler *compiler,
+ unsigned nin, unsigned nout);
void ir3_destroy(struct ir3 *shader);
void * ir3_assemble(struct ir3 *shader,
struct ir3_info *info, uint32_t gpu_id);
void * ir3_alloc(struct ir3 *shader, int sz);
-struct ir3_block * ir3_block_create(struct ir3 *shader,
- unsigned ntmp, unsigned nin, unsigned nout);
+struct ir3_block * ir3_block_create(struct ir3 *shader);
struct ir3_instruction * ir3_instr_create(struct ir3_block *block,
int category, opc_t opc);
@@ -383,7 +420,6 @@ const char *ir3_instr_name(struct ir3_instruction *instr);
struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
int num, int flags);
-
static inline bool ir3_instr_check_mark(struct ir3_instruction *instr)
{
if (instr->flags & IR3_INSTR_MARK)
@@ -392,22 +428,10 @@ static inline bool ir3_instr_check_mark(struct ir3_instruction *instr)
return false;
}
-static inline void ir3_clear_mark(struct ir3 *shader)
-{
- /* TODO would be nice to drop the instruction array.. for
- * new compiler, _clear_mark() is all we use it for, and
- * we could probably manage a linked list instead..
- *
- * Also, we'll probably want to mark instructions within
- * a block, so tracking the list of instrs globally is
- * unlikely to be what we want.
- */
- unsigned i;
- for (i = 0; i < shader->instrs_count; i++) {
- struct ir3_instruction *instr = shader->instrs[i];
- instr->flags &= ~IR3_INSTR_MARK;
- }
-}
+void ir3_block_clear_mark(struct ir3_block *block);
+void ir3_clear_mark(struct ir3 *shader);
+
+void ir3_count_instructions(struct ir3 *ir);
static inline int ir3_instr_regno(struct ir3_instruction *instr,
struct ir3_register *reg)
@@ -501,6 +525,28 @@ static inline bool is_mem(struct ir3_instruction *instr)
return (instr->category == 6);
}
+static inline bool
+is_store(struct ir3_instruction *instr)
+{
+ if (is_mem(instr)) {
+ /* these instructions, the "destination" register is
+ * actually a source, the address to store to.
+ */
+ switch (instr->opc) {
+ case OPC_STG:
+ case OPC_STP:
+ case OPC_STL:
+ case OPC_STLW:
+ case OPC_L2G:
+ case OPC_G2L:
+ return true;
+ default:
+ break;
+ }
+ }
+ return false;
+}
+
static inline bool is_input(struct ir3_instruction *instr)
{
/* in some cases, ldlv is used to fetch varying without
@@ -525,7 +571,7 @@ static inline bool writes_addr(struct ir3_instruction *instr)
{
if (instr->regs_count > 0) {
struct ir3_register *dst = instr->regs[0];
- return !!(dst->flags & IR3_REG_ADDR);
+ return reg_num(dst) == REG_A0;
}
return false;
}
@@ -556,13 +602,29 @@ static inline bool conflicts(struct ir3_instruction *a,
static inline bool reg_gpr(struct ir3_register *r)
{
- if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_ADDR))
+ if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED))
return false;
if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0))
return false;
return true;
}
+static inline type_t half_type(type_t type)
+{
+ switch (type) {
+ case TYPE_F32: return TYPE_F16;
+ case TYPE_U32: return TYPE_U16;
+ case TYPE_S32: return TYPE_S16;
+ case TYPE_F16:
+ case TYPE_U16:
+ case TYPE_S16:
+ return type;
+ default:
+ assert(0);
+ return ~0;
+ }
+}
+
/* some cat2 instructions (ie. those which are not float) can embed an
* immediate:
*/
@@ -747,37 +809,31 @@ static inline struct ir3_instruction * __ssa_src_n(struct ir3_instruction *instr
/* dump: */
-#include <stdio.h>
-void ir3_dump(struct ir3 *shader, const char *name,
- struct ir3_block *block /* XXX maybe 'block' ptr should move to ir3? */,
- FILE *f);
-void ir3_dump_instr_single(struct ir3_instruction *instr);
-void ir3_dump_instr_list(struct ir3_instruction *instr);
-
-/* flatten if/else: */
-int ir3_block_flatten(struct ir3_block *block);
+void ir3_print(struct ir3 *ir);
+void ir3_print_instr(struct ir3_instruction *instr);
/* depth calculation: */
int ir3_delayslots(struct ir3_instruction *assigner,
struct ir3_instruction *consumer, unsigned n);
-void ir3_block_depth(struct ir3_block *block);
+void ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list);
+void ir3_depth(struct ir3 *ir);
/* copy-propagate: */
-void ir3_block_cp(struct ir3_block *block);
+void ir3_cp(struct ir3 *ir);
-/* group neightbors and insert mov's to resolve conflicts: */
-void ir3_block_group(struct ir3_block *block);
+/* group neighbors and insert mov's to resolve conflicts: */
+void ir3_group(struct ir3 *ir);
/* scheduling: */
-int ir3_block_sched(struct ir3_block *block);
+int ir3_sched(struct ir3 *ir);
/* register assignment: */
-int ir3_block_ra(struct ir3_block *block, enum shader_t type,
+struct ir3_ra_reg_set * ir3_ra_alloc_reg_set(void *memctx);
+int ir3_ra(struct ir3 *ir3, enum shader_t type,
bool frag_coord, bool frag_face);
/* legalize: */
-void ir3_block_legalize(struct ir3_block *block,
- bool *has_samp, int *max_bary);
+void ir3_legalize(struct ir3 *ir, bool *has_samp, int *max_bary);
/* ************************************************************************* */
/* instruction helpers */
@@ -807,6 +863,21 @@ ir3_COV(struct ir3_block *block, struct ir3_instruction *src,
return instr;
}
+static inline struct ir3_instruction *
+ir3_NOP(struct ir3_block *block)
+{
+ return ir3_instr_create(block, 0, OPC_NOP);
+}
+
+#define INSTR0(CAT, name) \
+static inline struct ir3_instruction * \
+ir3_##name(struct ir3_block *block) \
+{ \
+ struct ir3_instruction *instr = \
+ ir3_instr_create(block, CAT, OPC_##name); \
+ return instr; \
+}
+
#define INSTR1(CAT, name) \
static inline struct ir3_instruction * \
ir3_##name(struct ir3_block *block, \
@@ -850,7 +921,10 @@ ir3_##name(struct ir3_block *block, \
}
/* cat0 instructions: */
+INSTR0(0, BR);
+INSTR0(0, JUMP);
INSTR1(0, KILL);
+INSTR0(0, END);
/* cat2 instructions, most 2 src but some 1 src: */
INSTR2(2, ADD_F)
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
index d0517aab8ce..ad9d2719d59 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
@@ -30,6 +30,7 @@
#include <fcntl.h>
#include <stdint.h>
#include <stdlib.h>
+#include <stdio.h>
#include <err.h>
#include "tgsi/tgsi_parse.h"
@@ -65,34 +66,34 @@ static void dump_info(struct ir3_shader_variant *so, const char *str)
// TODO make gpu_id configurable on cmdline
bin = ir3_shader_assemble(so, 320);
if (fd_mesa_debug & FD_DBG_DISASM) {
- struct ir3_block *block = so->ir->block;
+ struct ir3 *ir = so->ir;
struct ir3_register *reg;
uint8_t regid;
unsigned i;
debug_printf("; %s: %s\n", type, str);
- for (i = 0; i < block->ninputs; i++) {
- if (!block->inputs[i]) {
+ for (i = 0; i < ir->ninputs; i++) {
+ if (!ir->inputs[i]) {
debug_printf("; in%d unused\n", i);
continue;
}
- reg = block->inputs[i]->regs[0];
+ reg = ir->inputs[i]->regs[0];
regid = reg->num;
debug_printf("@in(%sr%d.%c)\tin%d\n",
(reg->flags & IR3_REG_HALF) ? "h" : "",
(regid >> 2), "xyzw"[regid & 0x3], i);
}
- for (i = 0; i < block->noutputs; i++) {
- if (!block->outputs[i]) {
+ for (i = 0; i < ir->noutputs; i++) {
+ if (!ir->outputs[i]) {
debug_printf("; out%d unused\n", i);
continue;
}
/* kill shows up as a virtual output.. skip it! */
- if (is_kill(block->outputs[i]))
+ if (is_kill(ir->outputs[i]))
continue;
- reg = block->outputs[i]->regs[0];
+ reg = ir->outputs[i]->regs[0];
regid = reg->num;
debug_printf("@out(%sr%d.%c)\tout%d\n",
(reg->flags & IR3_REG_HALF) ? "h" : "",
@@ -194,16 +195,6 @@ read_file(const char *filename, void **ptr, size_t *size)
return 0;
}
-static void reset_variant(struct ir3_shader_variant *v, const char *msg)
-{
- printf("; %s\n", msg);
- v->inputs_count = 0;
- v->outputs_count = 0;
- v->total_in = 0;
- v->has_samp = false;
- v->immediates_count = 0;
-}
-
static void print_usage(void)
{
printf("Usage: ir3_compiler [OPTIONS]... FILE\n");
@@ -225,12 +216,12 @@ int main(int argc, char **argv)
const char *filename;
struct tgsi_token toks[65536];
struct tgsi_parse_context parse;
+ struct ir3_compiler *compiler;
struct ir3_shader_variant v;
struct ir3_shader_key key = {};
const char *info;
void *ptr;
size_t size;
- int use_nir = 0;
fd_mesa_debug |= FD_DBG_DISASM;
@@ -243,7 +234,7 @@ int main(int argc, char **argv)
while (n < argc) {
if (!strcmp(argv[n], "--verbose")) {
- fd_mesa_debug |= FD_DBG_OPTDUMP | FD_DBG_MSGS | FD_DBG_OPTMSGS;
+ fd_mesa_debug |= FD_DBG_MSGS | FD_DBG_OPTMSGS;
n++;
continue;
}
@@ -290,17 +281,6 @@ int main(int argc, char **argv)
continue;
}
- if (!strcmp(argv[n], "--nocp")) {
- fd_mesa_debug |= FD_DBG_NOCP;
- n++;
- continue;
- }
- if (!strcmp(argv[n], "--nir")) {
- use_nir = true;
- n++;
- continue;
- }
-
if (!strcmp(argv[n], "--help")) {
print_usage();
return 0;
@@ -340,31 +320,14 @@ int main(int argc, char **argv)
break;
}
- if (use_nir) {
- info = "NIR compiler";
- ret = ir3_compile_shader_nir(&v, toks, key);
- } else {
- info = "TGSI compiler";
- ret = ir3_compile_shader(&v, toks, key, true);
- }
-
- if (ret) {
- reset_variant(&v, "compiler failed, trying without copy propagation!");
- info = "compiler (no copy propagation)";
- ret = ir3_compile_shader(&v, toks, key, false);
- }
+ /* TODO cmdline option to target different gpus: */
+ compiler = ir3_compiler_create(320);
+ info = "NIR compiler";
+ ret = ir3_compile_shader_nir(compiler, &v, toks, key);
if (ret) {
fprintf(stderr, "compiler failed!\n");
return ret;
}
dump_info(&v, info);
}
-
-void _mesa_error_no_memory(const char *caller);
-
-void
-_mesa_error_no_memory(const char *caller)
-{
- fprintf(stderr, "Mesa error: out of memory in %s", caller);
-}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c
index 43f4c955ac0..7c8eccb54e1 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c
@@ -1,7 +1,7 @@
/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
/*
- * Copyright (C) 2013 Rob Clark <[email protected]>
+ * Copyright (C) 2015 Rob Clark <[email protected]>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -26,3710 +26,19 @@
* Rob Clark <[email protected]>
*/
-#include <stdarg.h>
-
-#include "pipe/p_state.h"
-#include "util/u_string.h"
-#include "util/u_memory.h"
-#include "util/u_inlines.h"
-#include "tgsi/tgsi_lowering.h"
-#include "tgsi/tgsi_parse.h"
-#include "tgsi/tgsi_ureg.h"
-#include "tgsi/tgsi_info.h"
-#include "tgsi/tgsi_strings.h"
-#include "tgsi/tgsi_dump.h"
-#include "tgsi/tgsi_scan.h"
-
-#include "freedreno_util.h"
+#include "util/ralloc.h"
#include "ir3_compiler.h"
-#include "ir3_shader.h"
-
-#include "instr-a3xx.h"
-#include "ir3.h"
-
-struct ir3_compile_context {
- const struct tgsi_token *tokens;
- bool free_tokens;
- struct ir3 *ir;
- struct ir3_shader_variant *so;
- uint16_t integer_s;
-
- struct ir3_block *block;
- struct ir3_instruction *current_instr;
-
- /* we need to defer updates to block->outputs[] until the end
- * of an instruction (so we don't see new value until *after*
- * the src registers are processed)
- */
- struct {
- struct ir3_instruction *instr, **instrp;
- } output_updates[64];
- unsigned num_output_updates;
-
- /* are we in a sequence of "atomic" instructions?
- */
- bool atomic;
-
- /* For fragment shaders, from the hw perspective the only
- * actual input is r0.xy position register passed to bary.f.
- * But TGSI doesn't know that, it still declares things as
- * IN[] registers. So we do all the input tracking normally
- * and fix things up after compile_instructions()
- *
- * NOTE that frag_pos is the hardware position (possibly it
- * is actually an index or tag or some such.. it is *not*
- * values that can be directly used for gl_FragCoord..)
- */
- struct ir3_instruction *frag_pos, *frag_face, *frag_coord[4];
-
- /* For vertex shaders, keep track of the system values sources */
- struct ir3_instruction *vertex_id, *basevertex, *instance_id;
-
- struct tgsi_parse_context parser;
- unsigned type;
-
- struct tgsi_shader_info info;
-
- /* hmm, would be nice if tgsi_scan_shader figured this out
- * for us:
- */
- struct {
- unsigned first, last;
- struct ir3_instruction *fanin;
- } array[MAX_ARRAYS];
- uint32_t array_dirty;
- /* offset into array[], per file, of first array info */
- uint8_t array_offsets[TGSI_FILE_COUNT];
-
- /* for calculating input/output positions/linkages: */
- unsigned next_inloc;
-
- /* a4xx (at least patchlevel 0) cannot seem to flat-interpolate
- * so we need to use ldlv.u32 to load the varying directly:
- */
- bool flat_bypass;
-
- unsigned num_internal_temps;
- struct tgsi_src_register internal_temps[8];
-
- /* for looking up which system value is which */
- unsigned sysval_semantics[8];
-
- /* idx/slot for last compiler generated immediate */
- unsigned immediate_idx;
-
- /* stack of branch instructions that mark (potentially nested)
- * branch if/else/loop/etc
- */
- struct {
- struct ir3_instruction *instr, *cond;
- bool inv; /* true iff in else leg of branch */
- } branch[16];
- unsigned int branch_count;
-
- /* list of kill instructions: */
- struct ir3_instruction *kill[16];
- unsigned int kill_count;
-
- /* used when dst is same as one of the src, to avoid overwriting a
- * src element before the remaining scalar instructions that make
- * up the vector operation
- */
- struct tgsi_dst_register tmp_dst;
- struct tgsi_src_register *tmp_src;
-
- /* just for catching incorrect use of get_dst()/put_dst():
- */
- bool using_tmp_dst;
-};
-
-
-static void vectorize(struct ir3_compile_context *ctx,
- struct ir3_instruction *instr, struct tgsi_dst_register *dst,
- int nsrcs, ...);
-static void create_mov(struct ir3_compile_context *ctx,
- struct tgsi_dst_register *dst, struct tgsi_src_register *src);
-static type_t get_ftype(struct ir3_compile_context *ctx);
-static type_t get_utype(struct ir3_compile_context *ctx);
-
-static unsigned setup_arrays(struct ir3_compile_context *ctx, unsigned file, unsigned i)
-{
- /* ArrayID 0 for a given file is the legacy array spanning the entire file: */
- ctx->array[i].first = 0;
- ctx->array[i].last = ctx->info.file_max[file];
- ctx->array_offsets[file] = i;
- i += ctx->info.array_max[file] + 1;
- return i;
-}
-
-static unsigned
-compile_init(struct ir3_compile_context *ctx, struct ir3_shader_variant *so,
- const struct tgsi_token *tokens)
-{
- unsigned ret, i;
- struct tgsi_shader_info *info = &ctx->info;
- struct tgsi_lowering_config lconfig = {
- .color_two_side = so->key.color_two_side,
- .lower_DST = true,
- .lower_XPD = true,
- .lower_SCS = true,
- .lower_LRP = true,
- .lower_FRC = true,
- .lower_POW = true,
- .lower_LIT = true,
- .lower_EXP = true,
- .lower_LOG = true,
- .lower_DP4 = true,
- .lower_DP3 = true,
- .lower_DPH = true,
- .lower_DP2 = true,
- .lower_DP2A = true,
- };
-
- switch (so->type) {
- case SHADER_FRAGMENT:
- case SHADER_COMPUTE:
- lconfig.saturate_s = so->key.fsaturate_s;
- lconfig.saturate_t = so->key.fsaturate_t;
- lconfig.saturate_r = so->key.fsaturate_r;
- ctx->integer_s = so->key.finteger_s;
- break;
- case SHADER_VERTEX:
- lconfig.saturate_s = so->key.vsaturate_s;
- lconfig.saturate_t = so->key.vsaturate_t;
- lconfig.saturate_r = so->key.vsaturate_r;
- ctx->integer_s = so->key.vinteger_s;
- break;
- }
-
- if (!so->shader) {
- /* hack for standalone compiler which does not have
- * screen/context:
- */
- } else if (ir3_shader_gpuid(so->shader) >= 400) {
- /* a4xx seems to have *no* sam.p */
- lconfig.lower_TXP = ~0; /* lower all txp */
- /* need special handling for "flat" */
- ctx->flat_bypass = true;
- } else {
- /* a3xx just needs to avoid sam.p for 3d tex */
- lconfig.lower_TXP = (1 << TGSI_TEXTURE_3D);
- /* no special handling for "flat" */
- ctx->flat_bypass = false;
- }
-
- ctx->tokens = tgsi_transform_lowering(&lconfig, tokens, &ctx->info);
- ctx->free_tokens = !!ctx->tokens;
- if (!ctx->tokens) {
- /* no lowering */
- ctx->tokens = tokens;
- }
- ctx->ir = so->ir;
- ctx->so = so;
- ctx->array_dirty = 0;
- ctx->next_inloc = 8;
- ctx->num_internal_temps = 0;
- ctx->branch_count = 0;
- ctx->kill_count = 0;
- ctx->block = NULL;
- ctx->current_instr = NULL;
- ctx->num_output_updates = 0;
- ctx->atomic = false;
- ctx->frag_pos = NULL;
- ctx->frag_face = NULL;
- ctx->vertex_id = NULL;
- ctx->instance_id = NULL;
- ctx->tmp_src = NULL;
- ctx->using_tmp_dst = false;
-
- memset(ctx->frag_coord, 0, sizeof(ctx->frag_coord));
- memset(ctx->array, 0, sizeof(ctx->array));
- memset(ctx->array_offsets, 0, sizeof(ctx->array_offsets));
-
-#define FM(x) (1 << TGSI_FILE_##x)
- /* NOTE: if relative addressing is used, we set constlen in
- * the compiler (to worst-case value) since we don't know in
- * the assembler what the max addr reg value can be:
- */
- if (info->indirect_files & FM(CONSTANT))
- so->constlen = MIN2(255, ctx->info.const_file_max[0] + 1);
-
- i = 0;
- i += setup_arrays(ctx, TGSI_FILE_INPUT, i);
- i += setup_arrays(ctx, TGSI_FILE_TEMPORARY, i);
- i += setup_arrays(ctx, TGSI_FILE_OUTPUT, i);
- /* any others? we don't track arrays for const..*/
-
- /* Immediates go after constants: */
- so->first_immediate = so->first_driver_param =
- info->const_file_max[0] + 1;
- /* 1 unit for the vertex id base */
- if (so->type == SHADER_VERTEX)
- so->first_immediate++;
- /* 4 (vec4) units for ubo base addresses */
- so->first_immediate += 4;
- ctx->immediate_idx = 4 * (ctx->info.file_max[TGSI_FILE_IMMEDIATE] + 1);
-
- ret = tgsi_parse_init(&ctx->parser, ctx->tokens);
- if (ret != TGSI_PARSE_OK)
- return ret;
-
- ctx->type = ctx->parser.FullHeader.Processor.Processor;
-
- return ret;
-}
-
-static void
-compile_error(struct ir3_compile_context *ctx, const char *format, ...)
-{
- va_list ap;
- va_start(ap, format);
- _debug_vprintf(format, ap);
- va_end(ap);
- tgsi_dump(ctx->tokens, 0);
- debug_assert(0);
-}
-
-#define compile_assert(ctx, cond) do { \
- if (!(cond)) compile_error((ctx), "failed assert: "#cond"\n"); \
- } while (0)
-
-static void
-compile_free(struct ir3_compile_context *ctx)
-{
- if (ctx->free_tokens)
- free((void *)ctx->tokens);
- tgsi_parse_free(&ctx->parser);
-}
-
-struct instr_translater {
- void (*fxn)(const struct instr_translater *t,
- struct ir3_compile_context *ctx,
- struct tgsi_full_instruction *inst);
- unsigned tgsi_opc;
- opc_t opc;
- opc_t hopc; /* opc to use for half_precision mode, if different */
- unsigned arg;
-};
-
-static void
-instr_finish(struct ir3_compile_context *ctx)
-{
- unsigned i;
-
- if (ctx->atomic)
- return;
-
- for (i = 0; i < ctx->num_output_updates; i++)
- *(ctx->output_updates[i].instrp) = ctx->output_updates[i].instr;
-
- ctx->num_output_updates = 0;
-
- while (ctx->array_dirty) {
- unsigned aid = ffs(ctx->array_dirty) - 1;
- ctx->array[aid].fanin = NULL;
- ctx->array_dirty &= ~(1 << aid);
- }
-}
-
-/* For "atomic" groups of instructions, for example the four scalar
- * instructions to perform a vec4 operation. Basically this just
- * blocks out handling of output_updates so the next scalar instruction
- * still sees the result from before the start of the atomic group.
- *
- * NOTE: when used properly, this could probably replace get/put_dst()
- * stuff.
- */
-static void
-instr_atomic_start(struct ir3_compile_context *ctx)
-{
- ctx->atomic = true;
-}
-
-static void
-instr_atomic_end(struct ir3_compile_context *ctx)
-{
- ctx->atomic = false;
- instr_finish(ctx);
-}
-
-static struct ir3_instruction *
-instr_create(struct ir3_compile_context *ctx, int category, opc_t opc)
-{
- instr_finish(ctx);
- return (ctx->current_instr = ir3_instr_create(ctx->block, category, opc));
-}
-
-static struct ir3_block *
-push_block(struct ir3_compile_context *ctx)
-{
- struct ir3_block *block;
- unsigned ntmp, nin, nout;
-
-#define SCALAR_REGS(file) (4 * (ctx->info.file_max[TGSI_FILE_ ## file] + 1))
-
- /* hmm, give ourselves room to create 8 extra temporaries (vec4):
- */
- ntmp = SCALAR_REGS(TEMPORARY);
- ntmp += 8 * 4;
-
- nout = SCALAR_REGS(OUTPUT);
- nin = SCALAR_REGS(INPUT) + SCALAR_REGS(SYSTEM_VALUE);
-
- /* for outermost block, 'inputs' are the actual shader INPUT
- * register file. Reads from INPUT registers always go back to
- * top block. For nested blocks, 'inputs' is used to track any
- * TEMPORARY file register from one of the enclosing blocks that
- * is ready in this block.
- */
- if (!ctx->block) {
- /* NOTE: fragment shaders actually have two inputs (r0.xy, the
- * position)
- */
- if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
- int n = 2;
- if (ctx->info.reads_position)
- n += 4;
- if (ctx->info.uses_frontface)
- n += 4;
- nin = MAX2(n, nin);
- nout += ARRAY_SIZE(ctx->kill);
- }
- } else {
- nin = ntmp;
- }
-
- block = ir3_block_create(ctx->ir, ntmp, nin, nout);
-
- if ((ctx->type == TGSI_PROCESSOR_FRAGMENT) && !ctx->block)
- block->noutputs -= ARRAY_SIZE(ctx->kill);
-
- block->parent = ctx->block;
- ctx->block = block;
-
- return block;
-}
-
-static void
-pop_block(struct ir3_compile_context *ctx)
-{
- ctx->block = ctx->block->parent;
- compile_assert(ctx, ctx->block);
-}
-
-static struct ir3_instruction *
-create_output(struct ir3_block *block, struct ir3_instruction *instr,
- unsigned n)
-{
- struct ir3_instruction *out;
-
- out = ir3_instr_create(block, -1, OPC_META_OUTPUT);
- out->inout.block = block;
- ir3_reg_create(out, n, 0);
- if (instr)
- ir3_reg_create(out, 0, IR3_REG_SSA)->instr = instr;
-
- return out;
-}
-
-static struct ir3_instruction *
-create_input(struct ir3_block *block, struct ir3_instruction *instr,
- unsigned n)
-{
- struct ir3_instruction *in;
-
- in = ir3_instr_create(block, -1, OPC_META_INPUT);
- in->inout.block = block;
- ir3_reg_create(in, n, 0);
- if (instr)
- ir3_reg_create(in, 0, IR3_REG_SSA)->instr = instr;
-
- return in;
-}
-
-static struct ir3_instruction *
-block_input(struct ir3_block *block, unsigned n)
-{
- /* references to INPUT register file always go back up to
- * top level:
- */
- if (block->parent)
- return block_input(block->parent, n);
- return block->inputs[n];
-}
-
-/* return temporary in scope, creating if needed meta-input node
- * to track block inputs
- */
-static struct ir3_instruction *
-block_temporary(struct ir3_block *block, unsigned n)
-{
- /* references to TEMPORARY register file, find the nearest
- * enclosing block which has already assigned this temporary,
- * creating meta-input instructions along the way to keep
- * track of block inputs
- */
- if (block->parent && !block->temporaries[n]) {
- /* if already have input for this block, reuse: */
- if (!block->inputs[n])
- block->inputs[n] = block_temporary(block->parent, n);
-
- /* and create new input to return: */
- return create_input(block, block->inputs[n], n);
- }
- return block->temporaries[n];
-}
-
-static struct ir3_instruction *
-create_immed(struct ir3_compile_context *ctx, float val)
-{
- /* NOTE: *don't* use instr_create() here!
- */
- struct ir3_instruction *instr;
- instr = ir3_instr_create(ctx->block, 1, 0);
- instr->cat1.src_type = get_ftype(ctx);
- instr->cat1.dst_type = get_ftype(ctx);
- ir3_reg_create(instr, 0, 0);
- ir3_reg_create(instr, 0, IR3_REG_IMMED)->fim_val = val;
- return instr;
-}
-
-static void
-ssa_instr_set(struct ir3_compile_context *ctx, unsigned file, unsigned n,
- struct ir3_instruction *instr)
-{
- struct ir3_block *block = ctx->block;
- unsigned idx = ctx->num_output_updates;
-
- compile_assert(ctx, idx < ARRAY_SIZE(ctx->output_updates));
-
- /* NOTE: defer update of temporaries[idx] or output[idx]
- * until instr_finish(), so that if the current instruction
- * reads the same TEMP/OUT[] it gets the old value:
- *
- * bleh.. this might be a bit easier to just figure out
- * in instr_finish(). But at that point we've already
- * lost information about OUTPUT vs TEMPORARY register
- * file..
- */
-
- switch (file) {
- case TGSI_FILE_OUTPUT:
- compile_assert(ctx, n < block->noutputs);
- ctx->output_updates[idx].instrp = &block->outputs[n];
- ctx->output_updates[idx].instr = instr;
- ctx->num_output_updates++;
- break;
- case TGSI_FILE_TEMPORARY:
- compile_assert(ctx, n < block->ntemporaries);
- ctx->output_updates[idx].instrp = &block->temporaries[n];
- ctx->output_updates[idx].instr = instr;
- ctx->num_output_updates++;
- break;
- case TGSI_FILE_ADDRESS:
- compile_assert(ctx, n < 1);
- ctx->output_updates[idx].instrp = &block->address;
- ctx->output_updates[idx].instr = instr;
- ctx->num_output_updates++;
- break;
- }
-}
-
-static struct ir3_instruction *
-ssa_instr_get(struct ir3_compile_context *ctx, unsigned file, unsigned n)
-{
- struct ir3_block *block = ctx->block;
- struct ir3_instruction *instr = NULL;
-
- switch (file) {
- case TGSI_FILE_INPUT:
- instr = block_input(ctx->block, n);
- break;
- case TGSI_FILE_OUTPUT:
- /* really this should just happen in case of 'MOV_SAT OUT[n], ..',
- * for the following clamp instructions:
- */
- instr = block->outputs[n];
- /* we don't have to worry about read from an OUTPUT that was
- * assigned outside of the current block, because the _SAT
- * clamp instructions will always be in the same block as
- * the original instruction which wrote the OUTPUT
- */
- compile_assert(ctx, instr);
- break;
- case TGSI_FILE_TEMPORARY:
- instr = block_temporary(ctx->block, n);
- if (!instr) {
- /* this can happen when registers (or components of a TGSI
- * register) are used as src before they have been assigned
- * (undefined contents). To avoid confusing the rest of the
- * compiler, and to generally keep things peachy, substitute
- * an instruction that sets the src to 0.0. Or to keep
- * things undefined, I could plug in a random number? :-P
- *
- * NOTE: *don't* use instr_create() here!
- */
- instr = create_immed(ctx, 0.0);
- /* no need to recreate the immed for every access: */
- block->temporaries[n] = instr;
- }
- break;
- case TGSI_FILE_SYSTEM_VALUE:
- switch (ctx->sysval_semantics[n >> 2]) {
- case TGSI_SEMANTIC_VERTEXID_NOBASE:
- instr = ctx->vertex_id;
- break;
- case TGSI_SEMANTIC_BASEVERTEX:
- instr = ctx->basevertex;
- break;
- case TGSI_SEMANTIC_INSTANCEID:
- instr = ctx->instance_id;
- break;
- }
- break;
- }
-
- return instr;
-}
-
-static int dst_array_id(struct ir3_compile_context *ctx,
- const struct tgsi_dst_register *dst)
-{
- // XXX complete hack to recover tgsi_full_dst_register...
- // nothing that isn't wrapped in a tgsi_full_dst_register
- // should be indirect
- const struct tgsi_full_dst_register *fdst = (const void *)dst;
- return fdst->Indirect.ArrayID + ctx->array_offsets[dst->File];
-}
-
-static int src_array_id(struct ir3_compile_context *ctx,
- const struct tgsi_src_register *src)
-{
- // XXX complete hack to recover tgsi_full_src_register...
- // nothing that isn't wrapped in a tgsi_full_src_register
- // should be indirect
- const struct tgsi_full_src_register *fsrc = (const void *)src;
- debug_assert(src->File != TGSI_FILE_CONSTANT);
- return fsrc->Indirect.ArrayID + ctx->array_offsets[src->File];
-}
-
-static struct ir3_instruction *
-array_fanin(struct ir3_compile_context *ctx, unsigned aid, unsigned file)
-{
- struct ir3_instruction *instr;
-
- if (ctx->array[aid].fanin) {
- instr = ctx->array[aid].fanin;
- } else {
- unsigned first = ctx->array[aid].first;
- unsigned last = ctx->array[aid].last;
- unsigned i, j;
-
- instr = ir3_instr_create2(ctx->block, -1, OPC_META_FI,
- 1 + (4 * (last + 1 - first)));
- ir3_reg_create(instr, 0, 0);
- for (i = first; i <= last; i++) {
- for (j = 0; j < 4; j++) {
- unsigned n = regid(i, j);
- ir3_reg_create(instr, 0, IR3_REG_SSA)->instr =
- ssa_instr_get(ctx, file, n);
- }
- }
- ctx->array[aid].fanin = instr;
- ctx->array_dirty |= (1 << aid);
- }
-
- return instr;
-}
-
-static void
-ssa_dst(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
- const struct tgsi_dst_register *dst, unsigned chan)
-{
- if (dst->Indirect) {
- struct ir3_register *reg = instr->regs[0];
- unsigned i, aid = dst_array_id(ctx, dst);
- unsigned first = ctx->array[aid].first;
- unsigned last = ctx->array[aid].last;
- unsigned off = dst->Index - first; /* vec4 offset */
-
- reg->size = 4 * (1 + last - first);
- reg->offset = regid(off, chan);
-
- instr->fanin = array_fanin(ctx, aid, dst->File);
-
- /* annotate with the array-id, to help out the register-
- * assignment stage. At least for the case of indirect
- * writes, we should capture enough dependencies to
- * preserve the order of reads/writes of the array, so
- * the multiple "names" for the array should end up all
- * assigned to the same registers.
- */
- instr->fanin->fi.aid = aid;
-
- /* Since we are scalarizing vec4 tgsi instructions/regs, we
- * run into a slight complication here. To do the naive thing
- * and setup a fanout for each scalar array element would end
- * up with the result that the instructions generated for each
- * component of the vec4 would end up clobbering each other.
- * So we take advantage here of knowing that the array index
- * (after the shl.b) will be a multiple of four, and only set
- * every fourth scalar component in the array. See also
- * fixup_ssa_dst_array()
- */
- for (i = first; i <= last; i++) {
- struct ir3_instruction *split;
- unsigned n = regid(i, chan);
- int off = (4 * (i - first)) + chan;
-
- if (is_meta(instr) && (instr->opc == OPC_META_FO))
- off -= instr->fo.off;
-
- split = ir3_instr_create(ctx->block, -1, OPC_META_FO);
- split->fo.off = off;
- ir3_reg_create(split, 0, 0);
- ir3_reg_create(split, 0, IR3_REG_SSA)->instr = instr;
-
- ssa_instr_set(ctx, dst->File, n, split);
- }
- } else {
- /* normal case (not relative addressed GPR) */
- ssa_instr_set(ctx, dst->File, regid(dst->Index, chan), instr);
- }
-}
-
-static void
-ssa_src(struct ir3_compile_context *ctx, struct ir3_register *reg,
- const struct tgsi_src_register *src, unsigned chan)
-{
- struct ir3_instruction *instr;
-
- if (src->Indirect && (src->File != TGSI_FILE_CONSTANT)) {
- /* for relative addressing of gpr's (due to register assignment)
- * we must generate a fanin instruction to collect all possible
- * array elements that the instruction could address together:
- */
- unsigned aid = src_array_id(ctx, src);
- unsigned first = ctx->array[aid].first;
- unsigned last = ctx->array[aid].last;
- unsigned off = src->Index - first; /* vec4 offset */
-
- reg->size = 4 * (1 + last - first);
- reg->offset = regid(off, chan);
-
- instr = array_fanin(ctx, aid, src->File);
- } else if (src->File == TGSI_FILE_CONSTANT && src->Dimension) {
- const struct tgsi_full_src_register *fsrc = (const void *)src;
- struct ir3_instruction *temp = NULL;
- int ubo_regid = regid(ctx->so->first_driver_param, 0) +
- fsrc->Dimension.Index - 1;
- int offset = 0;
-
- /* We don't handle indirect UBO array accesses... yet. */
- compile_assert(ctx, !fsrc->Dimension.Indirect);
- /* UBOs start at index 1. */
- compile_assert(ctx, fsrc->Dimension.Index > 0);
-
- if (src->Indirect) {
- /* In case of an indirect index, it will have been loaded into an
- * address register. There will be a sequence of
- *
- * shl.b x, val, 2
- * mova a0, x
- *
- * We rely on this sequence to get the original val out and shift
- * it by 4, since we're dealing in vec4 units.
- */
- compile_assert(ctx, ctx->block->address);
- compile_assert(ctx, ctx->block->address->regs[1]->instr->opc ==
- OPC_SHL_B);
-
- temp = instr = instr_create(ctx, 2, OPC_SHL_B);
- ir3_reg_create(instr, 0, 0);
- ir3_reg_create(instr, 0, IR3_REG_HALF | IR3_REG_SSA)->instr =
- ctx->block->address->regs[1]->instr->regs[1]->instr;
- ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 4;
- } else if (src->Index >= 64) {
- /* Otherwise it's a plain index (in vec4 units). Move it into a
- * register.
- */
- temp = instr = instr_create(ctx, 1, 0);
- instr->cat1.src_type = get_utype(ctx);
- instr->cat1.dst_type = get_utype(ctx);
- ir3_reg_create(instr, 0, 0);
- ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = src->Index * 16;
- } else {
- /* The offset is small enough to fit into the ldg instruction
- * directly.
- */
- offset = src->Index * 16;
- }
-
- if (temp) {
- /* If there was an offset (most common), add it to the buffer
- * address.
- */
- instr = instr_create(ctx, 2, OPC_ADD_S);
- ir3_reg_create(instr, 0, 0);
- ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = temp;
- ir3_reg_create(instr, ubo_regid, IR3_REG_CONST);
- } else {
- /* Otherwise just load the buffer address directly */
- instr = instr_create(ctx, 1, 0);
- instr->cat1.src_type = get_utype(ctx);
- instr->cat1.dst_type = get_utype(ctx);
- ir3_reg_create(instr, 0, 0);
- ir3_reg_create(instr, ubo_regid, IR3_REG_CONST);
- }
-
- temp = instr;
-
- instr = instr_create(ctx, 6, OPC_LDG);
- instr->cat6.type = TYPE_U32;
- instr->cat6.offset = offset + chan * 4;
- ir3_reg_create(instr, 0, 0);
- ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = temp;
- ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 1;
-
- reg->flags &= ~(IR3_REG_RELATIV | IR3_REG_CONST);
- } else {
- /* normal case (not relative addressed GPR) */
- instr = ssa_instr_get(ctx, src->File, regid(src->Index, chan));
- }
-
- if (instr) {
- reg->flags |= IR3_REG_SSA;
- reg->instr = instr;
- } else if (reg->flags & IR3_REG_SSA) {
- /* special hack for trans_samp() which calls ssa_src() directly
- * to build up the collect (fanin) for const src.. (so SSA flag
- * set but no src instr... it basically gets lucky because we
- * default to 0.0 for "undefined" src instructions, which is
- * what it wants. We probably need to give it a better way to
- * do this, but for now this hack:
- */
- reg->instr = create_immed(ctx, 0.0);
- }
-}
-
-static struct ir3_register *
-add_dst_reg_wrmask(struct ir3_compile_context *ctx,
- struct ir3_instruction *instr, const struct tgsi_dst_register *dst,
- unsigned chan, unsigned wrmask)
-{
- unsigned flags = 0, num = 0;
- struct ir3_register *reg;
-
- switch (dst->File) {
- case TGSI_FILE_OUTPUT:
- case TGSI_FILE_TEMPORARY:
- /* uses SSA */
- break;
- case TGSI_FILE_ADDRESS:
- flags |= IR3_REG_ADDR;
- /* uses SSA */
- break;
- default:
- compile_error(ctx, "unsupported dst register file: %s\n",
- tgsi_file_name(dst->File));
- break;
- }
-
- if (dst->Indirect) {
- flags |= IR3_REG_RELATIV;
-
- /* shouldn't happen, and we can't cope with it below: */
- compile_assert(ctx, wrmask == 0x1);
-
- compile_assert(ctx, ctx->block->address);
- if (instr->address)
- compile_assert(ctx, ctx->block->address == instr->address);
-
- instr->address = ctx->block->address;
- array_insert(ctx->ir->indirects, instr);
- }
-
- reg = ir3_reg_create(instr, regid(num, chan), flags);
- reg->wrmask = wrmask;
-
- if (wrmask == 0x1) {
- /* normal case */
- ssa_dst(ctx, instr, dst, chan);
- } else if ((dst->File == TGSI_FILE_TEMPORARY) ||
- (dst->File == TGSI_FILE_OUTPUT) ||
- (dst->File == TGSI_FILE_ADDRESS)) {
- struct ir3_instruction *prev = NULL;
- unsigned i;
-
- compile_assert(ctx, !dst->Indirect);
-
- /* if instruction writes multiple, we need to create
- * some place-holder collect the registers:
- */
- for (i = 0; i < 4; i++) {
- /* NOTE: slightly ugly that we setup neighbor ptrs
- * for FO here, but handle FI in CP pass.. we should
- * probably just always setup neighbor ptrs in the
- * frontend?
- */
- struct ir3_instruction *split =
- ir3_instr_create(ctx->block, -1, OPC_META_FO);
- split->fo.off = i;
- /* unused dst reg: */
- /* NOTE: set SSA flag on dst here, because unused FO's
- * which don't get scheduled will end up not in the
- * instruction list when RA sets SSA flag on each dst.
- * Slight hack. We really should set SSA flag on
- * every dst register in the frontend.
- */
- ir3_reg_create(split, 0, IR3_REG_SSA);
- /* and src reg used to hold original instr */
- ir3_reg_create(split, 0, IR3_REG_SSA)->instr = instr;
- if (prev) {
- split->cp.left = prev;
- split->cp.left_cnt++;
- prev->cp.right = split;
- prev->cp.right_cnt++;
- }
- if ((wrmask & (1 << i)) && !ctx->atomic)
- ssa_dst(ctx, split, dst, chan+i);
- prev = split;
- }
- }
-
- return reg;
-}
-
-static struct ir3_register *
-add_dst_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
- const struct tgsi_dst_register *dst, unsigned chan)
-{
- return add_dst_reg_wrmask(ctx, instr, dst, chan, 0x1);
-}
-
-static struct ir3_register *
-add_src_reg_wrmask(struct ir3_compile_context *ctx,
- struct ir3_instruction *instr, const struct tgsi_src_register *src,
- unsigned chan, unsigned wrmask)
-{
- unsigned flags = 0, num = 0;
- struct ir3_register *reg;
-
- switch (src->File) {
- case TGSI_FILE_IMMEDIATE:
- /* TODO if possible, use actual immediate instead of const.. but
- * TGSI has vec4 immediates, we can only embed scalar (of limited
- * size, depending on instruction..)
- */
- flags |= IR3_REG_CONST;
- num = src->Index + ctx->so->first_immediate;
- break;
- case TGSI_FILE_CONSTANT:
- flags |= IR3_REG_CONST;
- num = src->Index;
- break;
- case TGSI_FILE_OUTPUT:
- /* NOTE: we should only end up w/ OUTPUT file for things like
- * clamp()'ing saturated dst instructions
- */
- case TGSI_FILE_INPUT:
- case TGSI_FILE_TEMPORARY:
- case TGSI_FILE_SYSTEM_VALUE:
- /* uses SSA */
- break;
- default:
- compile_error(ctx, "unsupported src register file: %s\n",
- tgsi_file_name(src->File));
- break;
- }
-
- /* We seem to have 8 bits (6.2) for dst register always, so I think
- * it is safe to assume GPR cannot be >=64
- *
- * cat3 instructions only have 8 bits for src2, but cannot take a
- * const for src2
- *
- * cat5 and cat6 in some cases only has 8 bits, but cannot take a
- * const for any src.
- *
- * Other than that we seem to have 12 bits to encode const src,
- * except for cat1 which may only have 11 bits (but that seems like
- * a bug)
- */
- if (flags & IR3_REG_CONST)
- compile_assert(ctx, src->Index < (1 << 9));
- else
- compile_assert(ctx, src->Index < (1 << 6));
-
- /* NOTE: abs/neg modifiers in tgsi only apply to float */
- if (src->Absolute)
- flags |= IR3_REG_FABS;
- if (src->Negate)
- flags |= IR3_REG_FNEG;
-
- if (src->Indirect) {
- flags |= IR3_REG_RELATIV;
-
- /* shouldn't happen, and we can't cope with it below: */
- compile_assert(ctx, wrmask == 0x1);
-
- compile_assert(ctx, ctx->block->address);
- if (instr->address)
- compile_assert(ctx, ctx->block->address == instr->address);
-
- instr->address = ctx->block->address;
- array_insert(ctx->ir->indirects, instr);
- }
-
- reg = ir3_reg_create(instr, regid(num, chan), flags);
- reg->wrmask = wrmask;
-
- if (wrmask == 0x1) {
- /* normal case */
- ssa_src(ctx, reg, src, chan);
- } else if ((src->File == TGSI_FILE_TEMPORARY) ||
- (src->File == TGSI_FILE_OUTPUT) ||
- (src->File == TGSI_FILE_INPUT)) {
- struct ir3_instruction *collect;
- unsigned i;
-
- compile_assert(ctx, !src->Indirect);
-
- /* if instruction reads multiple, we need to create
- * some place-holder collect the registers:
- */
- collect = ir3_instr_create(ctx->block, -1, OPC_META_FI);
- ir3_reg_create(collect, 0, 0); /* unused dst reg */
-
- for (i = 0; i < 4; i++) {
- if (wrmask & (1 << i)) {
- /* and src reg used point to the original instr */
- ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
- src, chan + i);
- } else if (wrmask & ~((i << i) - 1)) {
- /* if any remaining components, then dummy
- * placeholder src reg to fill in the blanks:
- */
- ir3_reg_create(collect, 0, 0);
- }
- }
-
- reg->flags |= IR3_REG_SSA;
- reg->instr = collect;
- }
-
- return reg;
-}
-
-static struct ir3_register *
-add_src_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
- const struct tgsi_src_register *src, unsigned chan)
-{
- return add_src_reg_wrmask(ctx, instr, src, chan, 0x1);
-}
-
-static void
-src_from_dst(struct tgsi_src_register *src, struct tgsi_dst_register *dst)
-{
- src->File = dst->File;
- src->Indirect = dst->Indirect;
- src->Dimension = dst->Dimension;
- src->Index = dst->Index;
- src->Absolute = 0;
- src->Negate = 0;
- src->SwizzleX = TGSI_SWIZZLE_X;
- src->SwizzleY = TGSI_SWIZZLE_Y;
- src->SwizzleZ = TGSI_SWIZZLE_Z;
- src->SwizzleW = TGSI_SWIZZLE_W;
-}
-
-/* Get internal-temp src/dst to use for a sequence of instructions
- * generated by a single TGSI op.
- */
-static struct tgsi_src_register *
-get_internal_temp(struct ir3_compile_context *ctx,
- struct tgsi_dst_register *tmp_dst)
-{
- struct tgsi_src_register *tmp_src;
- int n;
-
- tmp_dst->File = TGSI_FILE_TEMPORARY;
- tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW;
- tmp_dst->Indirect = 0;
- tmp_dst->Dimension = 0;
-
- /* assign next temporary: */
- n = ctx->num_internal_temps++;
- compile_assert(ctx, n < ARRAY_SIZE(ctx->internal_temps));
- tmp_src = &ctx->internal_temps[n];
-
- tmp_dst->Index = ctx->info.file_max[TGSI_FILE_TEMPORARY] + n + 1;
-
- src_from_dst(tmp_src, tmp_dst);
-
- return tmp_src;
-}
-
-static inline bool
-is_const(struct tgsi_src_register *src)
-{
- return (src->File == TGSI_FILE_CONSTANT) ||
- (src->File == TGSI_FILE_IMMEDIATE);
-}
-
-static inline bool
-is_relative(struct tgsi_src_register *src)
-{
- return src->Indirect;
-}
-
-static inline bool
-is_rel_or_const(struct tgsi_src_register *src)
-{
- return is_relative(src) || is_const(src);
-}
-
-static type_t
-get_ftype(struct ir3_compile_context *ctx)
-{
- return TYPE_F32;
-}
-
-static type_t
-get_utype(struct ir3_compile_context *ctx)
-{
- return TYPE_U32;
-}
-
-static type_t
-get_stype(struct ir3_compile_context *ctx)
-{
- return TYPE_S32;
-}
-
-static unsigned
-src_swiz(struct tgsi_src_register *src, int chan)
-{
- switch (chan) {
- case 0: return src->SwizzleX;
- case 1: return src->SwizzleY;
- case 2: return src->SwizzleZ;
- case 3: return src->SwizzleW;
- }
- assert(0);
- return 0;
-}
-
-/* for instructions that cannot take a const register as src, if needed
- * generate a move to temporary gpr:
- */
-static struct tgsi_src_register *
-get_unconst(struct ir3_compile_context *ctx, struct tgsi_src_register *src)
-{
- struct tgsi_dst_register tmp_dst;
- struct tgsi_src_register *tmp_src;
-
- compile_assert(ctx, is_rel_or_const(src));
-
- tmp_src = get_internal_temp(ctx, &tmp_dst);
-
- create_mov(ctx, &tmp_dst, src);
-
- return tmp_src;
-}
-
-static void
-get_immediate(struct ir3_compile_context *ctx,
- struct tgsi_src_register *reg, uint32_t val)
-{
- unsigned neg, swiz, idx, i;
- /* actually maps 1:1 currently.. not sure if that is safe to rely on: */
- static const unsigned swiz2tgsi[] = {
- TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_W,
- };
-
- for (i = 0; i < ctx->immediate_idx; i++) {
- swiz = i % 4;
- idx = i / 4;
-
- if (ctx->so->immediates[idx].val[swiz] == val) {
- neg = 0;
- break;
- }
-
- if (ctx->so->immediates[idx].val[swiz] == -val) {
- neg = 1;
- break;
- }
- }
-
- if (i == ctx->immediate_idx) {
- /* need to generate a new immediate: */
- swiz = i % 4;
- idx = i / 4;
- neg = 0;
- ctx->so->immediates[idx].val[swiz] = val;
- ctx->so->immediates_count = idx + 1;
- ctx->immediate_idx++;
- }
-
- reg->File = TGSI_FILE_IMMEDIATE;
- reg->Indirect = 0;
- reg->Dimension = 0;
- reg->Index = idx;
- reg->Absolute = 0;
- reg->Negate = neg;
- reg->SwizzleX = swiz2tgsi[swiz];
- reg->SwizzleY = swiz2tgsi[swiz];
- reg->SwizzleZ = swiz2tgsi[swiz];
- reg->SwizzleW = swiz2tgsi[swiz];
-}
-
-static void
-create_mov(struct ir3_compile_context *ctx, struct tgsi_dst_register *dst,
- struct tgsi_src_register *src)
-{
- type_t type_mov = get_ftype(ctx);
- unsigned i;
-
- for (i = 0; i < 4; i++) {
- /* move to destination: */
- if (dst->WriteMask & (1 << i)) {
- struct ir3_instruction *instr;
-
- if (src->Absolute || src->Negate) {
- /* can't have abs or neg on a mov instr, so use
- * absneg.f instead to handle these cases:
- */
- instr = instr_create(ctx, 2, OPC_ABSNEG_F);
- } else {
- instr = instr_create(ctx, 1, 0);
- instr->cat1.src_type = type_mov;
- instr->cat1.dst_type = type_mov;
- }
-
- add_dst_reg(ctx, instr, dst, i);
- add_src_reg(ctx, instr, src, src_swiz(src, i));
- }
- }
-}
-
-static void
-create_clamp(struct ir3_compile_context *ctx,
- struct tgsi_dst_register *dst, struct tgsi_src_register *val,
- struct tgsi_src_register *minval, struct tgsi_src_register *maxval)
-{
- struct ir3_instruction *instr;
-
- instr = instr_create(ctx, 2, OPC_MAX_F);
- vectorize(ctx, instr, dst, 2, val, 0, minval, 0);
-
- instr = instr_create(ctx, 2, OPC_MIN_F);
- vectorize(ctx, instr, dst, 2, val, 0, maxval, 0);
-}
-
-static void
-create_clamp_imm(struct ir3_compile_context *ctx,
- struct tgsi_dst_register *dst,
- uint32_t minval, uint32_t maxval)
-{
- struct tgsi_src_register minconst, maxconst;
- struct tgsi_src_register src;
-
- src_from_dst(&src, dst);
-
- get_immediate(ctx, &minconst, minval);
- get_immediate(ctx, &maxconst, maxval);
-
- create_clamp(ctx, dst, &src, &minconst, &maxconst);
-}
-
-static struct tgsi_dst_register *
-get_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst)
-{
- struct tgsi_dst_register *dst = &inst->Dst[0].Register;
- unsigned i;
-
- compile_assert(ctx, !ctx->using_tmp_dst);
- ctx->using_tmp_dst = true;
-
- for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
- struct tgsi_src_register *src = &inst->Src[i].Register;
- if ((src->File == dst->File) && (src->Index == dst->Index)) {
- if ((dst->WriteMask == TGSI_WRITEMASK_XYZW) &&
- (src->SwizzleX == TGSI_SWIZZLE_X) &&
- (src->SwizzleY == TGSI_SWIZZLE_Y) &&
- (src->SwizzleZ == TGSI_SWIZZLE_Z) &&
- (src->SwizzleW == TGSI_SWIZZLE_W))
- continue;
- ctx->tmp_src = get_internal_temp(ctx, &ctx->tmp_dst);
- ctx->tmp_dst.WriteMask = dst->WriteMask;
- dst = &ctx->tmp_dst;
- break;
- }
- }
- return dst;
-}
-
-static void
-put_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst,
- struct tgsi_dst_register *dst)
-{
- compile_assert(ctx, ctx->using_tmp_dst);
- ctx->using_tmp_dst = false;
-
- /* if necessary, add mov back into original dst: */
- if (dst != &inst->Dst[0].Register) {
- create_mov(ctx, &inst->Dst[0].Register, ctx->tmp_src);
- }
-}
-
-/* helper to generate the necessary repeat and/or additional instructions
- * to turn a scalar instruction into a vector operation:
- */
-static void
-vectorize(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
- struct tgsi_dst_register *dst, int nsrcs, ...)
-{
- va_list ap;
- int i, j, n = 0;
-
- instr_atomic_start(ctx);
-
- for (i = 0; i < 4; i++) {
- if (dst->WriteMask & (1 << i)) {
- struct ir3_instruction *cur;
-
- if (n++ == 0) {
- cur = instr;
- } else {
- cur = instr_create(ctx, instr->category, instr->opc);
- memcpy(cur->info, instr->info, sizeof(cur->info));
- }
-
- add_dst_reg(ctx, cur, dst, i);
-
- va_start(ap, nsrcs);
- for (j = 0; j < nsrcs; j++) {
- struct tgsi_src_register *src =
- va_arg(ap, struct tgsi_src_register *);
- unsigned flags = va_arg(ap, unsigned);
- struct ir3_register *reg;
- if (flags & IR3_REG_IMMED) {
- reg = ir3_reg_create(cur, 0, IR3_REG_IMMED);
- /* this is an ugly cast.. should have put flags first! */
- reg->iim_val = *(int *)&src;
- } else {
- reg = add_src_reg(ctx, cur, src, src_swiz(src, i));
- }
- reg->flags |= flags & ~(IR3_REG_FNEG | IR3_REG_SNEG);
- if (flags & IR3_REG_FNEG)
- reg->flags ^= IR3_REG_FNEG;
- if (flags & IR3_REG_SNEG)
- reg->flags ^= IR3_REG_SNEG;
- }
- va_end(ap);
- }
- }
-
- instr_atomic_end(ctx);
-}
-
-/*
- * Handlers for TGSI instructions which do not have a 1:1 mapping to
- * native instructions:
- */
-
-static void
-trans_clamp(const struct instr_translater *t,
- struct ir3_compile_context *ctx,
- struct tgsi_full_instruction *inst)
-{
- struct tgsi_dst_register *dst = get_dst(ctx, inst);
- struct tgsi_src_register *src0 = &inst->Src[0].Register;
- struct tgsi_src_register *src1 = &inst->Src[1].Register;
- struct tgsi_src_register *src2 = &inst->Src[2].Register;
-
- create_clamp(ctx, dst, src0, src1, src2);
-
- put_dst(ctx, inst, dst);
-}
-
-/* ARL(x) = x, but mova from hrN.x to a0.. */
-static void
-trans_arl(const struct instr_translater *t,
- struct ir3_compile_context *ctx,
- struct tgsi_full_instruction *inst)
-{
- struct ir3_instruction *instr;
- struct tgsi_dst_register tmp_dst;
- struct tgsi_src_register *tmp_src;
- struct tgsi_dst_register *dst = &inst->Dst[0].Register;
- struct tgsi_src_register *src = &inst->Src[0].Register;
- unsigned chan = src->SwizzleX;
-
- compile_assert(ctx, dst->File == TGSI_FILE_ADDRESS);
-
- /* NOTE: we allocate a temporary from a flat register
- * namespace (ignoring half vs full). It turns out
- * not to really matter since registers get reassigned
- * later in ir3_ra which (hopefully!) can deal a bit
- * better with mixed half and full precision.
- */
- tmp_src = get_internal_temp(ctx, &tmp_dst);
-
- /* cov.{u,f}{32,16}s16 Rtmp, Rsrc */
- instr = instr_create(ctx, 1, 0);
- instr->cat1.src_type = (t->tgsi_opc == TGSI_OPCODE_ARL) ?
- get_ftype(ctx) : get_utype(ctx);
- instr->cat1.dst_type = TYPE_S16;
- add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF;
- add_src_reg(ctx, instr, src, chan);
-
- /* shl.b Rtmp, Rtmp, 2 */
- instr = instr_create(ctx, 2, OPC_SHL_B);
- add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF;
- add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF;
- ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2;
-
- /* mova a0, Rtmp */
- instr = instr_create(ctx, 1, 0);
- instr->cat1.src_type = TYPE_S16;
- instr->cat1.dst_type = TYPE_S16;
- add_dst_reg(ctx, instr, dst, 0)->flags |= IR3_REG_HALF;
- add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF;
-}
-
-/*
- * texture fetch/sample instructions:
- */
-
-struct tex_info {
- int8_t order[4];
- int8_t args;
- unsigned src_wrmask, flags;
-};
-
-struct target_info {
- uint8_t dims;
- uint8_t cube;
- uint8_t array;
- uint8_t shadow;
-};
-
-static const struct target_info tex_targets[] = {
- [TGSI_TEXTURE_1D] = { 1, 0, 0, 0 },
- [TGSI_TEXTURE_2D] = { 2, 0, 0, 0 },
- [TGSI_TEXTURE_3D] = { 3, 0, 0, 0 },
- [TGSI_TEXTURE_CUBE] = { 3, 1, 0, 0 },
- [TGSI_TEXTURE_RECT] = { 2, 0, 0, 0 },
- [TGSI_TEXTURE_SHADOW1D] = { 1, 0, 0, 1 },
- [TGSI_TEXTURE_SHADOW2D] = { 2, 0, 0, 1 },
- [TGSI_TEXTURE_SHADOWRECT] = { 2, 0, 0, 1 },
- [TGSI_TEXTURE_1D_ARRAY] = { 1, 0, 1, 0 },
- [TGSI_TEXTURE_2D_ARRAY] = { 2, 0, 1, 0 },
- [TGSI_TEXTURE_SHADOW1D_ARRAY] = { 1, 0, 1, 1 },
- [TGSI_TEXTURE_SHADOW2D_ARRAY] = { 2, 0, 1, 1 },
- [TGSI_TEXTURE_SHADOWCUBE] = { 3, 1, 0, 1 },
- [TGSI_TEXTURE_2D_MSAA] = { 2, 0, 0, 0 },
- [TGSI_TEXTURE_2D_ARRAY_MSAA] = { 2, 0, 1, 0 },
- [TGSI_TEXTURE_CUBE_ARRAY] = { 3, 1, 1, 0 },
- [TGSI_TEXTURE_SHADOWCUBE_ARRAY] = { 3, 1, 1, 1 },
-};
-
-static void
-fill_tex_info(struct ir3_compile_context *ctx,
- struct tgsi_full_instruction *inst,
- struct tex_info *info)
-{
- const struct target_info *tgt = &tex_targets[inst->Texture.Texture];
-
- if (tgt->dims == 3)
- info->flags |= IR3_INSTR_3D;
- if (tgt->array)
- info->flags |= IR3_INSTR_A;
- if (tgt->shadow)
- info->flags |= IR3_INSTR_S;
-
- switch (inst->Instruction.Opcode) {
- case TGSI_OPCODE_TXB:
- case TGSI_OPCODE_TXB2:
- case TGSI_OPCODE_TXL:
- case TGSI_OPCODE_TXF:
- info->args = 2;
- break;
- case TGSI_OPCODE_TXP:
- info->flags |= IR3_INSTR_P;
- /* fallthrough */
- case TGSI_OPCODE_TEX:
- case TGSI_OPCODE_TXD:
- info->args = 1;
- break;
- }
-
- /*
- * lay out the first argument in the proper order:
- * - actual coordinates first
- * - shadow reference
- * - array index
- * - projection w
- *
- * bias/lod go into the second arg
- */
- int arg, pos = 0;
- for (arg = 0; arg < tgt->dims; arg++)
- info->order[arg] = pos++;
- if (tgt->dims == 1)
- info->order[pos++] = -1;
- if (tgt->shadow)
- info->order[pos++] = MAX2(arg + tgt->array, 2);
- if (tgt->array)
- info->order[pos++] = arg++;
- if (info->flags & IR3_INSTR_P)
- info->order[pos++] = 3;
-
- info->src_wrmask = (1 << pos) - 1;
-
- for (; pos < 4; pos++)
- info->order[pos] = -1;
-
- assert(pos <= 4);
-}
-
-static bool check_swiz(struct tgsi_src_register *src, const int8_t order[4])
-{
- unsigned i;
- for (i = 1; (i < 4) && order[i] >= 0; i++)
- if (src_swiz(src, i) != (src_swiz(src, 0) + order[i]))
- return false;
- return true;
-}
-
-static bool is_1d(unsigned tex)
-{
- return tex_targets[tex].dims == 1;
-}
-
-static struct tgsi_src_register *
-get_tex_coord(struct ir3_compile_context *ctx,
- struct tgsi_full_instruction *inst,
- const struct tex_info *tinf)
-{
- struct tgsi_src_register *coord = &inst->Src[0].Register;
- struct ir3_instruction *instr;
- unsigned tex = inst->Texture.Texture;
- struct tgsi_dst_register tmp_dst;
- struct tgsi_src_register *tmp_src;
- type_t type_mov = get_ftype(ctx);
- unsigned j;
-
- /* need to move things around: */
- tmp_src = get_internal_temp(ctx, &tmp_dst);
-
- for (j = 0; j < 4; j++) {
- if (tinf->order[j] < 0)
- continue;
- instr = instr_create(ctx, 1, 0); /* mov */
- instr->cat1.src_type = type_mov;
- instr->cat1.dst_type = type_mov;
- add_dst_reg(ctx, instr, &tmp_dst, j);
- add_src_reg(ctx, instr, coord,
- src_swiz(coord, tinf->order[j]));
- }
-
- /* fix up .y coord: */
- if (is_1d(tex)) {
- struct ir3_register *imm;
- instr = instr_create(ctx, 1, 0); /* mov */
- instr->cat1.src_type = type_mov;
- instr->cat1.dst_type = type_mov;
- add_dst_reg(ctx, instr, &tmp_dst, 1); /* .y */
- imm = ir3_reg_create(instr, 0, IR3_REG_IMMED);
- if (inst->Instruction.Opcode == TGSI_OPCODE_TXF)
- imm->iim_val = 0;
- else
- imm->fim_val = 0.5;
- }
-
- return tmp_src;
-}
-
-static void
-trans_samp(const struct instr_translater *t,
- struct ir3_compile_context *ctx,
- struct tgsi_full_instruction *inst)
-{
- struct ir3_instruction *instr, *collect;
- struct ir3_register *reg;
- struct tgsi_dst_register *dst = &inst->Dst[0].Register;
- struct tgsi_src_register *orig, *coord, *samp, *offset, *dpdx, *dpdy;
- struct tgsi_src_register zero;
- const struct target_info *tgt = &tex_targets[inst->Texture.Texture];
- struct tex_info tinf;
- int i;
-
- memset(&tinf, 0, sizeof(tinf));
- fill_tex_info(ctx, inst, &tinf);
- coord = get_tex_coord(ctx, inst, &tinf);
- get_immediate(ctx, &zero, 0);
-
- switch (inst->Instruction.Opcode) {
- case TGSI_OPCODE_TXB2:
- orig = &inst->Src[1].Register;
- samp = &inst->Src[2].Register;
- break;
- case TGSI_OPCODE_TXD:
- orig = &inst->Src[0].Register;
- dpdx = &inst->Src[1].Register;
- dpdy = &inst->Src[2].Register;
- samp = &inst->Src[3].Register;
- if (is_rel_or_const(dpdx))
- dpdx = get_unconst(ctx, dpdx);
- if (is_rel_or_const(dpdy))
- dpdy = get_unconst(ctx, dpdy);
- break;
- default:
- orig = &inst->Src[0].Register;
- samp = &inst->Src[1].Register;
- break;
- }
- if (tinf.args > 1 && is_rel_or_const(orig))
- orig = get_unconst(ctx, orig);
-
- /* scale up integer coords for TXF based on the LOD */
- if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
- struct tgsi_dst_register tmp_dst;
- struct tgsi_src_register *tmp_src;
- type_t type_mov = get_utype(ctx);
-
- tmp_src = get_internal_temp(ctx, &tmp_dst);
- for (i = 0; i < tgt->dims; i++) {
- instr = instr_create(ctx, 2, OPC_SHL_B);
- add_dst_reg(ctx, instr, &tmp_dst, i);
- add_src_reg(ctx, instr, coord, src_swiz(coord, i));
- add_src_reg(ctx, instr, orig, orig->SwizzleW);
- }
- if (tgt->dims < 2) {
- instr = instr_create(ctx, 1, 0);
- instr->cat1.src_type = type_mov;
- instr->cat1.dst_type = type_mov;
- add_dst_reg(ctx, instr, &tmp_dst, i);
- add_src_reg(ctx, instr, &zero, 0);
- i++;
- }
- if (tgt->array) {
- instr = instr_create(ctx, 1, 0);
- instr->cat1.src_type = type_mov;
- instr->cat1.dst_type = type_mov;
- add_dst_reg(ctx, instr, &tmp_dst, i);
- add_src_reg(ctx, instr, coord, src_swiz(coord, i));
- }
- coord = tmp_src;
- }
-
- if (inst->Texture.NumOffsets) {
- struct tgsi_texture_offset *tex_offset = &inst->TexOffsets[0];
- struct tgsi_src_register offset_src = {0};
-
- offset_src.File = tex_offset->File;
- offset_src.Index = tex_offset->Index;
- offset_src.SwizzleX = tex_offset->SwizzleX;
- offset_src.SwizzleY = tex_offset->SwizzleY;
- offset_src.SwizzleZ = tex_offset->SwizzleZ;
- offset = get_unconst(ctx, &offset_src);
- tinf.flags |= IR3_INSTR_O;
- }
-
- instr = instr_create(ctx, 5, t->opc);
- if (ctx->integer_s & (1 << samp->Index))
- instr->cat5.type = get_utype(ctx);
- else
- instr->cat5.type = get_ftype(ctx);
- instr->cat5.samp = samp->Index;
- instr->cat5.tex = samp->Index;
- instr->flags |= tinf.flags;
-
- add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask);
-
- reg = ir3_reg_create(instr, 0, IR3_REG_SSA);
-
- collect = ir3_instr_create2(ctx->block, -1, OPC_META_FI, 12);
- ir3_reg_create(collect, 0, 0);
- for (i = 0; i < 4; i++) {
- if (tinf.src_wrmask & (1 << i))
- ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
- coord, src_swiz(coord, i));
- else if (tinf.src_wrmask & ~((1 << i) - 1))
- ir3_reg_create(collect, 0, 0);
- }
-
- /* Attach derivatives onto the end of the fan-in. Derivatives start after
- * the 4th argument, so make sure that fi is padded up to 4 first.
- */
- if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
- while (collect->regs_count < 5)
- ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), &zero, 0);
- for (i = 0; i < tgt->dims; i++)
- ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), dpdx, i);
- if (tgt->dims < 2)
- ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), &zero, 0);
- for (i = 0; i < tgt->dims; i++)
- ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), dpdy, i);
- if (tgt->dims < 2)
- ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), &zero, 0);
- tinf.src_wrmask |= ((1 << (2 * MAX2(tgt->dims, 2))) - 1) << 4;
- }
-
- reg->instr = collect;
- reg->wrmask = tinf.src_wrmask;
-
- /* The second argument contains the offsets, followed by the lod/bias
- * argument. This is constructed more manually due to the dynamic nature.
- */
- if (inst->Texture.NumOffsets == 0 && tinf.args == 1)
- return;
-
- reg = ir3_reg_create(instr, 0, IR3_REG_SSA);
-
- collect = ir3_instr_create2(ctx->block, -1, OPC_META_FI, 5);
- ir3_reg_create(collect, 0, 0);
-
- if (inst->Texture.NumOffsets) {
- for (i = 0; i < tgt->dims; i++)
- ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
- offset, i);
- if (tgt->dims < 2)
- ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), &zero, 0);
- }
- if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2)
- ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
- orig, orig->SwizzleX);
- else if (tinf.args > 1)
- ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
- orig, orig->SwizzleW);
-
- reg->instr = collect;
- reg->wrmask = (1 << (collect->regs_count - 1)) - 1;
-}
-
-static void
-trans_txq(const struct instr_translater *t,
- struct ir3_compile_context *ctx,
- struct tgsi_full_instruction *inst)
-{
- struct ir3_instruction *instr;
- struct tgsi_dst_register *dst = &inst->Dst[0].Register;
- struct tgsi_src_register *level = &inst->Src[0].Register;
- struct tgsi_src_register *samp = &inst->Src[1].Register;
- const struct target_info *tgt = &tex_targets[inst->Texture.Texture];
- struct tex_info tinf;
-
- memset(&tinf, 0, sizeof(tinf));
- fill_tex_info(ctx, inst, &tinf);
- if (is_rel_or_const(level))
- level = get_unconst(ctx, level);
-
- instr = instr_create(ctx, 5, OPC_GETSIZE);
- instr->cat5.type = get_utype(ctx);
- instr->cat5.samp = samp->Index;
- instr->cat5.tex = samp->Index;
- instr->flags |= tinf.flags;
-
- if (tgt->array && (dst->WriteMask & (1 << tgt->dims))) {
- /* Array size actually ends up in .w rather than .z. This doesn't
- * matter for miplevel 0, but for higher mips the value in z is
- * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
- * returned, which means that we have to add 1 to it for arrays.
- */
- struct tgsi_dst_register tmp_dst;
- struct tgsi_src_register *tmp_src;
- type_t type_mov = get_utype(ctx);
-
- tmp_src = get_internal_temp(ctx, &tmp_dst);
- add_dst_reg_wrmask(ctx, instr, &tmp_dst, 0,
- dst->WriteMask | TGSI_WRITEMASK_W);
- add_src_reg_wrmask(ctx, instr, level, level->SwizzleX, 0x1);
-
- if (dst->WriteMask & TGSI_WRITEMASK_X) {
- instr = instr_create(ctx, 1, 0);
- instr->cat1.src_type = type_mov;
- instr->cat1.dst_type = type_mov;
- add_dst_reg(ctx, instr, dst, 0);
- add_src_reg(ctx, instr, tmp_src, src_swiz(tmp_src, 0));
- }
-
- if (tgt->dims == 2) {
- if (dst->WriteMask & TGSI_WRITEMASK_Y) {
- instr = instr_create(ctx, 1, 0);
- instr->cat1.src_type = type_mov;
- instr->cat1.dst_type = type_mov;
- add_dst_reg(ctx, instr, dst, 1);
- add_src_reg(ctx, instr, tmp_src, src_swiz(tmp_src, 1));
- }
- }
-
- instr = instr_create(ctx, 2, OPC_ADD_U);
- add_dst_reg(ctx, instr, dst, tgt->dims);
- add_src_reg(ctx, instr, tmp_src, src_swiz(tmp_src, 3));
- ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 1;
- } else {
- add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask);
- add_src_reg_wrmask(ctx, instr, level, level->SwizzleX, 0x1);
- }
-
- if (dst->WriteMask & TGSI_WRITEMASK_W) {
- /* The # of levels comes from getinfo.z. We need to add 1 to it, since
- * the value in TEX_CONST_0 is zero-based.
- */
- struct tgsi_dst_register tmp_dst;
- struct tgsi_src_register *tmp_src;
-
- tmp_src = get_internal_temp(ctx, &tmp_dst);
- instr = instr_create(ctx, 5, OPC_GETINFO);
- instr->cat5.type = get_utype(ctx);
- instr->cat5.samp = samp->Index;
- instr->cat5.tex = samp->Index;
- add_dst_reg_wrmask(ctx, instr, &tmp_dst, 0, TGSI_WRITEMASK_Z);
-
- instr = instr_create(ctx, 2, OPC_ADD_U);
- add_dst_reg(ctx, instr, dst, 3);
- add_src_reg(ctx, instr, tmp_src, src_swiz(tmp_src, 2));
- ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 1;
- }
-}
-
-/* DDX/DDY */
-static void
-trans_deriv(const struct instr_translater *t,
- struct ir3_compile_context *ctx,
- struct tgsi_full_instruction *inst)
-{
- struct ir3_instruction *instr;
- struct tgsi_dst_register *dst = &inst->Dst[0].Register;
- struct tgsi_src_register *src = &inst->Src[0].Register;
- static const int8_t order[4] = {0, 1, 2, 3};
-
- if (!check_swiz(src, order)) {
- struct tgsi_dst_register tmp_dst;
- struct tgsi_src_register *tmp_src;
-
- tmp_src = get_internal_temp(ctx, &tmp_dst);
- create_mov(ctx, &tmp_dst, src);
-
- src = tmp_src;
- }
-
- /* This might be a workaround for hw bug? Blob compiler always
- * seems to work two components at a time for dsy/dsx. It does
- * actually seem to work in some cases (or at least some piglit
- * tests) for four components at a time. But seems more reliable
- * to split this into two instructions like the blob compiler
- * does:
- */
-
- instr = instr_create(ctx, 5, t->opc);
- instr->cat5.type = get_ftype(ctx);
- add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask & 0x3);
- add_src_reg_wrmask(ctx, instr, src, 0, dst->WriteMask & 0x3);
-
- instr = instr_create(ctx, 5, t->opc);
- instr->cat5.type = get_ftype(ctx);
- add_dst_reg_wrmask(ctx, instr, dst, 2, (dst->WriteMask >> 2) & 0x3);
- add_src_reg_wrmask(ctx, instr, src, 2, (dst->WriteMask >> 2) & 0x3);
-}
-
-/*
- * SEQ(a,b) = (a == b) ? 1.0 : 0.0
- * cmps.f.eq tmp0, a, b
- * cov.u16f16 dst, tmp0
- *
- * SNE(a,b) = (a != b) ? 1.0 : 0.0
- * cmps.f.ne tmp0, a, b
- * cov.u16f16 dst, tmp0
- *
- * SGE(a,b) = (a >= b) ? 1.0 : 0.0
- * cmps.f.ge tmp0, a, b
- * cov.u16f16 dst, tmp0
- *
- * SLE(a,b) = (a <= b) ? 1.0 : 0.0
- * cmps.f.le tmp0, a, b
- * cov.u16f16 dst, tmp0
- *
- * SGT(a,b) = (a > b) ? 1.0 : 0.0
- * cmps.f.gt tmp0, a, b
- * cov.u16f16 dst, tmp0
- *
- * SLT(a,b) = (a < b) ? 1.0 : 0.0
- * cmps.f.lt tmp0, a, b
- * cov.u16f16 dst, tmp0
- *
- * CMP(a,b,c) = (a < 0.0) ? b : c
- * cmps.f.lt tmp0, a, {0.0}
- * sel.b16 dst, b, tmp0, c
- */
-static void
-trans_cmp(const struct instr_translater *t,
- struct ir3_compile_context *ctx,
- struct tgsi_full_instruction *inst)
-{
- struct ir3_instruction *instr;
- struct tgsi_dst_register tmp_dst;
- struct tgsi_src_register *tmp_src;
- struct tgsi_src_register constval0;
- /* final instruction for CMP() uses orig src1 and src2: */
- struct tgsi_dst_register *dst = get_dst(ctx, inst);
- struct tgsi_src_register *a0, *a1, *a2;
- unsigned condition;
-
- tmp_src = get_internal_temp(ctx, &tmp_dst);
-
- a0 = &inst->Src[0].Register; /* a */
- a1 = &inst->Src[1].Register; /* b */
-
- switch (t->tgsi_opc) {
- case TGSI_OPCODE_SEQ:
- case TGSI_OPCODE_FSEQ:
- condition = IR3_COND_EQ;
- break;
- case TGSI_OPCODE_SNE:
- case TGSI_OPCODE_FSNE:
- condition = IR3_COND_NE;
- break;
- case TGSI_OPCODE_SGE:
- case TGSI_OPCODE_FSGE:
- condition = IR3_COND_GE;
- break;
- case TGSI_OPCODE_SLT:
- case TGSI_OPCODE_FSLT:
- condition = IR3_COND_LT;
- break;
- case TGSI_OPCODE_SLE:
- condition = IR3_COND_LE;
- break;
- case TGSI_OPCODE_SGT:
- condition = IR3_COND_GT;
- break;
- case TGSI_OPCODE_CMP:
- get_immediate(ctx, &constval0, fui(0.0));
- a0 = &inst->Src[0].Register; /* a */
- a1 = &constval0; /* {0.0} */
- condition = IR3_COND_LT;
- break;
- default:
- compile_assert(ctx, 0);
- return;
- }
-
- if (is_const(a0) && is_const(a1))
- a0 = get_unconst(ctx, a0);
-
- /* cmps.f.<cond> tmp, a0, a1 */
- instr = instr_create(ctx, 2, OPC_CMPS_F);
- instr->cat2.condition = condition;
- vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0);
-
- switch (t->tgsi_opc) {
- case TGSI_OPCODE_SEQ:
- case TGSI_OPCODE_SGE:
- case TGSI_OPCODE_SLE:
- case TGSI_OPCODE_SNE:
- case TGSI_OPCODE_SGT:
- case TGSI_OPCODE_SLT:
- /* cov.u16f16 dst, tmp0 */
- instr = instr_create(ctx, 1, 0);
- instr->cat1.src_type = get_utype(ctx);
- instr->cat1.dst_type = get_ftype(ctx);
- vectorize(ctx, instr, dst, 1, tmp_src, 0);
- break;
- case TGSI_OPCODE_FSEQ:
- case TGSI_OPCODE_FSGE:
- case TGSI_OPCODE_FSNE:
- case TGSI_OPCODE_FSLT:
- /* absneg.s dst, (neg)tmp0 */
- instr = instr_create(ctx, 2, OPC_ABSNEG_S);
- vectorize(ctx, instr, dst, 1, tmp_src, IR3_REG_SNEG);
- break;
- case TGSI_OPCODE_CMP:
- a1 = &inst->Src[1].Register;
- a2 = &inst->Src[2].Register;
- /* sel.{b32,b16} dst, src2, tmp, src1 */
- instr = instr_create(ctx, 3, OPC_SEL_B32);
- vectorize(ctx, instr, dst, 3, a1, 0, tmp_src, 0, a2, 0);
- break;
- }
-
- put_dst(ctx, inst, dst);
-}
-
-/*
- * USNE(a,b) = (a != b) ? ~0 : 0
- * cmps.u32.ne dst, a, b
- *
- * USEQ(a,b) = (a == b) ? ~0 : 0
- * cmps.u32.eq dst, a, b
- *
- * ISGE(a,b) = (a > b) ? ~0 : 0
- * cmps.s32.ge dst, a, b
- *
- * USGE(a,b) = (a > b) ? ~0 : 0
- * cmps.u32.ge dst, a, b
- *
- * ISLT(a,b) = (a < b) ? ~0 : 0
- * cmps.s32.lt dst, a, b
- *
- * USLT(a,b) = (a < b) ? ~0 : 0
- * cmps.u32.lt dst, a, b
- *
- */
-static void
-trans_icmp(const struct instr_translater *t,
- struct ir3_compile_context *ctx,
- struct tgsi_full_instruction *inst)
-{
- struct ir3_instruction *instr;
- struct tgsi_dst_register *dst = get_dst(ctx, inst);
- struct tgsi_dst_register tmp_dst;
- struct tgsi_src_register *tmp_src;
- struct tgsi_src_register *a0, *a1;
- unsigned condition;
-
- a0 = &inst->Src[0].Register; /* a */
- a1 = &inst->Src[1].Register; /* b */
-
- switch (t->tgsi_opc) {
- case TGSI_OPCODE_USNE:
- condition = IR3_COND_NE;
- break;
- case TGSI_OPCODE_USEQ:
- condition = IR3_COND_EQ;
- break;
- case TGSI_OPCODE_ISGE:
- case TGSI_OPCODE_USGE:
- condition = IR3_COND_GE;
- break;
- case TGSI_OPCODE_ISLT:
- case TGSI_OPCODE_USLT:
- condition = IR3_COND_LT;
- break;
-
- default:
- compile_assert(ctx, 0);
- return;
- }
-
- if (is_const(a0) && is_const(a1))
- a0 = get_unconst(ctx, a0);
-
- tmp_src = get_internal_temp(ctx, &tmp_dst);
- /* cmps.{u32,s32}.<cond> tmp, a0, a1 */
- instr = instr_create(ctx, 2, t->opc);
- instr->cat2.condition = condition;
- vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0);
-
- /* absneg.s dst, (neg)tmp */
- instr = instr_create(ctx, 2, OPC_ABSNEG_S);
- vectorize(ctx, instr, dst, 1, tmp_src, IR3_REG_SNEG);
-
- put_dst(ctx, inst, dst);
-}
-
-/*
- * UCMP(a,b,c) = a ? b : c
- * sel.b16 dst, b, a, c
- */
-static void
-trans_ucmp(const struct instr_translater *t,
- struct ir3_compile_context *ctx,
- struct tgsi_full_instruction *inst)
-{
- struct ir3_instruction *instr;
- struct tgsi_dst_register *dst = get_dst(ctx, inst);
- struct tgsi_src_register *a0, *a1, *a2;
-
- a0 = &inst->Src[0].Register; /* a */
- a1 = &inst->Src[1].Register; /* b */
- a2 = &inst->Src[2].Register; /* c */
-
- if (is_rel_or_const(a0))
- a0 = get_unconst(ctx, a0);
-
- /* sel.{b32,b16} dst, b, a, c */
- instr = instr_create(ctx, 3, OPC_SEL_B32);
- vectorize(ctx, instr, dst, 3, a1, 0, a0, 0, a2, 0);
- put_dst(ctx, inst, dst);
-}
-
-/*
- * ISSG(a) = a < 0 ? -1 : a > 0 ? 1 : 0
- * cmps.s.lt tmp_neg, a, 0 # 1 if a is negative
- * cmps.s.gt tmp_pos, a, 0 # 1 if a is positive
- * sub.u dst, tmp_pos, tmp_neg
- */
-static void
-trans_issg(const struct instr_translater *t,
- struct ir3_compile_context *ctx,
- struct tgsi_full_instruction *inst)
-{
- struct ir3_instruction *instr;
- struct tgsi_dst_register *dst = get_dst(ctx, inst);
- struct tgsi_src_register *a = &inst->Src[0].Register;
- struct tgsi_dst_register neg_dst, pos_dst;
- struct tgsi_src_register *neg_src, *pos_src;
-
- neg_src = get_internal_temp(ctx, &neg_dst);
- pos_src = get_internal_temp(ctx, &pos_dst);
-
- /* cmps.s.lt neg, a, 0 */
- instr = instr_create(ctx, 2, OPC_CMPS_S);
- instr->cat2.condition = IR3_COND_LT;
- vectorize(ctx, instr, &neg_dst, 2, a, 0, 0, IR3_REG_IMMED);
-
- /* cmps.s.gt pos, a, 0 */
- instr = instr_create(ctx, 2, OPC_CMPS_S);
- instr->cat2.condition = IR3_COND_GT;
- vectorize(ctx, instr, &pos_dst, 2, a, 0, 0, IR3_REG_IMMED);
-
- /* sub.u dst, pos, neg */
- instr = instr_create(ctx, 2, OPC_SUB_U);
- vectorize(ctx, instr, dst, 2, pos_src, 0, neg_src, 0);
-
- put_dst(ctx, inst, dst);
-}
-
-
-
-/*
- * Conditional / Flow control
- */
-
-static void
-push_branch(struct ir3_compile_context *ctx, bool inv,
- struct ir3_instruction *instr, struct ir3_instruction *cond)
-{
- unsigned int idx = ctx->branch_count++;
- compile_assert(ctx, idx < ARRAY_SIZE(ctx->branch));
- ctx->branch[idx].instr = instr;
- ctx->branch[idx].inv = inv;
- /* else side of branch has same condition: */
- if (!inv)
- ctx->branch[idx].cond = cond;
-}
-
-static struct ir3_instruction *
-pop_branch(struct ir3_compile_context *ctx)
-{
- unsigned int idx = --ctx->branch_count;
- return ctx->branch[idx].instr;
-}
-
-static void
-trans_if(const struct instr_translater *t,
- struct ir3_compile_context *ctx,
- struct tgsi_full_instruction *inst)
-{
- struct ir3_instruction *instr, *cond;
- struct tgsi_src_register *src = &inst->Src[0].Register;
- struct tgsi_dst_register tmp_dst;
- struct tgsi_src_register *tmp_src;
- struct tgsi_src_register constval;
-
- get_immediate(ctx, &constval, fui(0.0));
- tmp_src = get_internal_temp(ctx, &tmp_dst);
-
- if (is_const(src))
- src = get_unconst(ctx, src);
-
- /* cmps.{f,u}.ne tmp0, b, {0.0} */
- instr = instr_create(ctx, 2, t->opc);
- add_dst_reg(ctx, instr, &tmp_dst, 0);
- add_src_reg(ctx, instr, src, src->SwizzleX);
- add_src_reg(ctx, instr, &constval, constval.SwizzleX);
- instr->cat2.condition = IR3_COND_NE;
-
- compile_assert(ctx, instr->regs[1]->flags & IR3_REG_SSA); /* because get_unconst() */
- cond = instr->regs[1]->instr;
-
- /* meta:flow tmp0 */
- instr = instr_create(ctx, -1, OPC_META_FLOW);
- ir3_reg_create(instr, 0, 0); /* dummy dst */
- add_src_reg(ctx, instr, tmp_src, TGSI_SWIZZLE_X);
-
- push_branch(ctx, false, instr, cond);
- instr->flow.if_block = push_block(ctx);
-}
-
-static void
-trans_else(const struct instr_translater *t,
- struct ir3_compile_context *ctx,
- struct tgsi_full_instruction *inst)
-{
- struct ir3_instruction *instr;
-
- pop_block(ctx);
-
- instr = pop_branch(ctx);
-
- compile_assert(ctx, (instr->category == -1) &&
- (instr->opc == OPC_META_FLOW));
-
- push_branch(ctx, true, instr, NULL);
- instr->flow.else_block = push_block(ctx);
-}
-
-static struct ir3_instruction *
-find_temporary(struct ir3_block *block, unsigned n)
-{
- if (block->parent && !block->temporaries[n])
- return find_temporary(block->parent, n);
- return block->temporaries[n];
-}
-
-static struct ir3_instruction *
-find_output(struct ir3_block *block, unsigned n)
-{
- if (block->parent && !block->outputs[n])
- return find_output(block->parent, n);
- return block->outputs[n];
-}
-
-static struct ir3_instruction *
-create_phi(struct ir3_compile_context *ctx, struct ir3_instruction *cond,
- struct ir3_instruction *a, struct ir3_instruction *b)
-{
- struct ir3_instruction *phi;
-
- compile_assert(ctx, cond);
-
- /* Either side of the condition could be null.. which
- * indicates a variable written on only one side of the
- * branch. Normally this should only be variables not
- * used outside of that side of the branch. So we could
- * just 'return a ? a : b;' in that case. But for better
- * defined undefined behavior we just stick in imm{0.0}.
- * In the common case of a value only used within the
- * one side of the branch, the PHI instruction will not
- * get scheduled
- */
- if (!a)
- a = create_immed(ctx, 0.0);
- if (!b)
- b = create_immed(ctx, 0.0);
-
- phi = instr_create(ctx, -1, OPC_META_PHI);
- ir3_reg_create(phi, 0, 0); /* dummy dst */
- ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = cond;
- ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = a;
- ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = b;
-
- return phi;
-}
-
-static void
-trans_endif(const struct instr_translater *t,
- struct ir3_compile_context *ctx,
- struct tgsi_full_instruction *inst)
-{
- struct ir3_instruction *instr;
- struct ir3_block *ifb, *elseb;
- struct ir3_instruction **ifout, **elseout;
- unsigned i, ifnout = 0, elsenout = 0;
-
- pop_block(ctx);
-
- instr = pop_branch(ctx);
-
- compile_assert(ctx, (instr->category == -1) &&
- (instr->opc == OPC_META_FLOW));
-
- ifb = instr->flow.if_block;
- elseb = instr->flow.else_block;
- /* if there is no else block, the parent block is used for the
- * branch-not-taken src of the PHI instructions:
- */
- if (!elseb)
- elseb = ifb->parent;
-
- /* worst case sizes: */
- ifnout = ifb->ntemporaries + ifb->noutputs;
- elsenout = elseb->ntemporaries + elseb->noutputs;
-
- ifout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * ifnout);
- if (elseb != ifb->parent)
- elseout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * elsenout);
-
- ifnout = 0;
- elsenout = 0;
-
- /* generate PHI instructions for any temporaries written: */
- for (i = 0; i < ifb->ntemporaries; i++) {
- struct ir3_instruction *a = ifb->temporaries[i];
- struct ir3_instruction *b = elseb->temporaries[i];
-
- /* if temporary written in if-block, or if else block
- * is present and temporary written in else-block:
- */
- if (a || ((elseb != ifb->parent) && b)) {
- struct ir3_instruction *phi;
-
- /* if only written on one side, find the closest
- * enclosing update on other side:
- */
- if (!a)
- a = find_temporary(ifb, i);
- if (!b)
- b = find_temporary(elseb, i);
-
- ifout[ifnout] = a;
- a = create_output(ifb, a, ifnout++);
-
- if (elseb != ifb->parent) {
- elseout[elsenout] = b;
- b = create_output(elseb, b, elsenout++);
- }
-
- phi = create_phi(ctx, instr, a, b);
- ctx->block->temporaries[i] = phi;
- }
- }
-
- compile_assert(ctx, ifb->noutputs == elseb->noutputs);
-
- /* .. and any outputs written: */
- for (i = 0; i < ifb->noutputs; i++) {
- struct ir3_instruction *a = ifb->outputs[i];
- struct ir3_instruction *b = elseb->outputs[i];
-
- /* if output written in if-block, or if else block
- * is present and output written in else-block:
- */
- if (a || ((elseb != ifb->parent) && b)) {
- struct ir3_instruction *phi;
-
- /* if only written on one side, find the closest
- * enclosing update on other side:
- */
- if (!a)
- a = find_output(ifb, i);
- if (!b)
- b = find_output(elseb, i);
-
- ifout[ifnout] = a;
- a = create_output(ifb, a, ifnout++);
-
- if (elseb != ifb->parent) {
- elseout[elsenout] = b;
- b = create_output(elseb, b, elsenout++);
- }
-
- phi = create_phi(ctx, instr, a, b);
- ctx->block->outputs[i] = phi;
- }
- }
-
- ifb->noutputs = ifnout;
- ifb->outputs = ifout;
-
- if (elseb != ifb->parent) {
- elseb->noutputs = elsenout;
- elseb->outputs = elseout;
- }
-
- // TODO maybe we want to compact block->inputs?
-}
-
-/*
- * Kill
- */
-
-static void
-trans_kill(const struct instr_translater *t,
- struct ir3_compile_context *ctx,
- struct tgsi_full_instruction *inst)
-{
- struct ir3_instruction *instr, *immed, *cond = NULL;
- bool inv = false;
-
- /* unconditional kill, use enclosing if condition: */
- if (ctx->branch_count > 0) {
- unsigned int idx = ctx->branch_count - 1;
- cond = ctx->branch[idx].cond;
- inv = ctx->branch[idx].inv;
- } else {
- cond = create_immed(ctx, 1.0);
- }
-
- compile_assert(ctx, cond);
-
- immed = create_immed(ctx, 0.0);
-
- /* cmps.f.ne p0.x, cond, {0.0} */
- instr = instr_create(ctx, 2, OPC_CMPS_F);
- instr->cat2.condition = IR3_COND_NE;
- ir3_reg_create(instr, regid(REG_P0, 0), 0);
- ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;
- ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = immed;
- cond = instr;
-
- /* kill p0.x */
- instr = instr_create(ctx, 0, OPC_KILL);
- instr->cat0.inv = inv;
- ir3_reg_create(instr, 0, 0); /* dummy dst */
- ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;
-
- ctx->kill[ctx->kill_count++] = instr;
-
- ctx->so->has_kill = true;
-}
-
-/*
- * Kill-If
- */
-
-static void
-trans_killif(const struct instr_translater *t,
- struct ir3_compile_context *ctx,
- struct tgsi_full_instruction *inst)
-{
- struct tgsi_src_register *src = &inst->Src[0].Register;
- struct ir3_instruction *instr, *immed, *cond = NULL;
- bool inv = false;
-
- immed = create_immed(ctx, 0.0);
-
- /* cmps.f.ne p0.x, cond, {0.0} */
- instr = instr_create(ctx, 2, OPC_CMPS_F);
- instr->cat2.condition = IR3_COND_NE;
- ir3_reg_create(instr, regid(REG_P0, 0), 0);
- ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = immed;
- add_src_reg(ctx, instr, src, src->SwizzleX);
-
- cond = instr;
-
- /* kill p0.x */
- instr = instr_create(ctx, 0, OPC_KILL);
- instr->cat0.inv = inv;
- ir3_reg_create(instr, 0, 0); /* dummy dst */
- ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;
-
- ctx->kill[ctx->kill_count++] = instr;
-
- ctx->so->has_kill = true;
-
-}
-/*
- * I2F / U2F / F2I / F2U
- */
-
-static void
-trans_cov(const struct instr_translater *t,
- struct ir3_compile_context *ctx,
- struct tgsi_full_instruction *inst)
-{
- struct ir3_instruction *instr;
- struct tgsi_dst_register *dst = get_dst(ctx, inst);
- struct tgsi_src_register *src = &inst->Src[0].Register;
-
- // cov.f32s32 dst, tmp0 /
- instr = instr_create(ctx, 1, 0);
- switch (t->tgsi_opc) {
- case TGSI_OPCODE_U2F:
- instr->cat1.src_type = TYPE_U32;
- instr->cat1.dst_type = TYPE_F32;
- break;
- case TGSI_OPCODE_I2F:
- instr->cat1.src_type = TYPE_S32;
- instr->cat1.dst_type = TYPE_F32;
- break;
- case TGSI_OPCODE_F2U:
- instr->cat1.src_type = TYPE_F32;
- instr->cat1.dst_type = TYPE_U32;
- break;
- case TGSI_OPCODE_F2I:
- instr->cat1.src_type = TYPE_F32;
- instr->cat1.dst_type = TYPE_S32;
- break;
-
- }
- vectorize(ctx, instr, dst, 1, src, 0);
- put_dst(ctx, inst, dst);
-}
-
-/*
- * UMUL / UMAD
- *
- * There is no 32-bit multiply instruction, so splitting a and b into high and
- * low components, we get that
- *
- * dst = al * bl + ah * bl << 16 + al * bh << 16
- *
- * mull.u tmp0, a, b (mul low, i.e. al * bl)
- * madsh.m16 tmp1, a, b, tmp0 (mul-add shift high mix, i.e. ah * bl << 16)
- * madsh.m16 dst, b, a, tmp1 (i.e. al * bh << 16)
- *
- * For UMAD, add in the extra argument after mull.u.
- */
-static void
-trans_umul(const struct instr_translater *t,
- struct ir3_compile_context *ctx,
- struct tgsi_full_instruction *inst)
-{
- struct ir3_instruction *instr;
- struct tgsi_dst_register *dst = get_dst(ctx, inst);
- struct tgsi_src_register *a = &inst->Src[0].Register;
- struct tgsi_src_register *b = &inst->Src[1].Register;
-
- struct tgsi_dst_register tmp0_dst, tmp1_dst;
- struct tgsi_src_register *tmp0_src, *tmp1_src;
-
- tmp0_src = get_internal_temp(ctx, &tmp0_dst);
- tmp1_src = get_internal_temp(ctx, &tmp1_dst);
-
- if (is_rel_or_const(a))
- a = get_unconst(ctx, a);
- if (is_rel_or_const(b))
- b = get_unconst(ctx, b);
-
- /* mull.u tmp0, a, b */
- instr = instr_create(ctx, 2, OPC_MULL_U);
- vectorize(ctx, instr, &tmp0_dst, 2, a, 0, b, 0);
-
- if (t->tgsi_opc == TGSI_OPCODE_UMAD) {
- struct tgsi_src_register *c = &inst->Src[2].Register;
-
- /* add.u tmp0, tmp0, c */
- instr = instr_create(ctx, 2, OPC_ADD_U);
- vectorize(ctx, instr, &tmp0_dst, 2, tmp0_src, 0, c, 0);
- }
-
- /* madsh.m16 tmp1, a, b, tmp0 */
- instr = instr_create(ctx, 3, OPC_MADSH_M16);
- vectorize(ctx, instr, &tmp1_dst, 3, a, 0, b, 0, tmp0_src, 0);
-
- /* madsh.m16 dst, b, a, tmp1 */
- instr = instr_create(ctx, 3, OPC_MADSH_M16);
- vectorize(ctx, instr, dst, 3, b, 0, a, 0, tmp1_src, 0);
- put_dst(ctx, inst, dst);
-}
-
-/*
- * IDIV / UDIV / MOD / UMOD
- *
- * See NV50LegalizeSSA::handleDIV for the origin of this implementation. For
- * MOD/UMOD, it becomes a - [IU]DIV(a, modulus) * modulus.
- */
-static void
-trans_idiv(const struct instr_translater *t,
- struct ir3_compile_context *ctx,
- struct tgsi_full_instruction *inst)
-{
- struct ir3_instruction *instr;
- struct tgsi_dst_register *dst = get_dst(ctx, inst), *premod_dst = dst;
- struct tgsi_src_register *a = &inst->Src[0].Register;
- struct tgsi_src_register *b = &inst->Src[1].Register;
-
- struct tgsi_dst_register af_dst, bf_dst, q_dst, r_dst, a_dst, b_dst;
- struct tgsi_src_register *af_src, *bf_src, *q_src, *r_src, *a_src, *b_src;
-
- struct tgsi_src_register negative_2, thirty_one;
- type_t src_type;
-
- if (t->tgsi_opc == TGSI_OPCODE_IDIV || t->tgsi_opc == TGSI_OPCODE_MOD)
- src_type = get_stype(ctx);
- else
- src_type = get_utype(ctx);
-
- af_src = get_internal_temp(ctx, &af_dst);
- bf_src = get_internal_temp(ctx, &bf_dst);
- q_src = get_internal_temp(ctx, &q_dst);
- r_src = get_internal_temp(ctx, &r_dst);
- a_src = get_internal_temp(ctx, &a_dst);
- b_src = get_internal_temp(ctx, &b_dst);
-
- get_immediate(ctx, &negative_2, -2);
- get_immediate(ctx, &thirty_one, 31);
-
- if (t->tgsi_opc == TGSI_OPCODE_MOD || t->tgsi_opc == TGSI_OPCODE_UMOD)
- premod_dst = &q_dst;
-
- /* cov.[us]32f32 af, numerator */
- instr = instr_create(ctx, 1, 0);
- instr->cat1.src_type = src_type;
- instr->cat1.dst_type = get_ftype(ctx);
- vectorize(ctx, instr, &af_dst, 1, a, 0);
-
- /* cov.[us]32f32 bf, denominator */
- instr = instr_create(ctx, 1, 0);
- instr->cat1.src_type = src_type;
- instr->cat1.dst_type = get_ftype(ctx);
- vectorize(ctx, instr, &bf_dst, 1, b, 0);
-
- /* Get the absolute values for IDIV */
- if (type_sint(src_type)) {
- /* absneg.f af, (abs)af */
- instr = instr_create(ctx, 2, OPC_ABSNEG_F);
- vectorize(ctx, instr, &af_dst, 1, af_src, IR3_REG_FABS);
-
- /* absneg.f bf, (abs)bf */
- instr = instr_create(ctx, 2, OPC_ABSNEG_F);
- vectorize(ctx, instr, &bf_dst, 1, bf_src, IR3_REG_FABS);
-
- /* absneg.s a, (abs)numerator */
- instr = instr_create(ctx, 2, OPC_ABSNEG_S);
- vectorize(ctx, instr, &a_dst, 1, a, IR3_REG_SABS);
-
- /* absneg.s b, (abs)denominator */
- instr = instr_create(ctx, 2, OPC_ABSNEG_S);
- vectorize(ctx, instr, &b_dst, 1, b, IR3_REG_SABS);
- } else {
- /* mov.u32u32 a, numerator */
- instr = instr_create(ctx, 1, 0);
- instr->cat1.src_type = src_type;
- instr->cat1.dst_type = src_type;
- vectorize(ctx, instr, &a_dst, 1, a, 0);
-
- /* mov.u32u32 b, denominator */
- instr = instr_create(ctx, 1, 0);
- instr->cat1.src_type = src_type;
- instr->cat1.dst_type = src_type;
- vectorize(ctx, instr, &b_dst, 1, b, 0);
- }
-
- /* rcp.f bf, bf */
- instr = instr_create(ctx, 4, OPC_RCP);
- vectorize(ctx, instr, &bf_dst, 1, bf_src, 0);
-
- /* That's right, subtract 2 as an integer from the float */
- /* add.u bf, bf, -2 */
- instr = instr_create(ctx, 2, OPC_ADD_U);
- vectorize(ctx, instr, &bf_dst, 2, bf_src, 0, &negative_2, 0);
-
- /* mul.f q, af, bf */
- instr = instr_create(ctx, 2, OPC_MUL_F);
- vectorize(ctx, instr, &q_dst, 2, af_src, 0, bf_src, 0);
-
- /* cov.f32[us]32 q, q */
- instr = instr_create(ctx, 1, 0);
- instr->cat1.src_type = get_ftype(ctx);
- instr->cat1.dst_type = src_type;
- vectorize(ctx, instr, &q_dst, 1, q_src, 0);
-
- /* integer multiply q by b */
- /* mull.u r, q, b */
- instr = instr_create(ctx, 2, OPC_MULL_U);
- vectorize(ctx, instr, &r_dst, 2, q_src, 0, b_src, 0);
-
- /* madsh.m16 r, q, b, r */
- instr = instr_create(ctx, 3, OPC_MADSH_M16);
- vectorize(ctx, instr, &r_dst, 3, q_src, 0, b_src, 0, r_src, 0);
-
- /* madsh.m16, r, b, q, r */
- instr = instr_create(ctx, 3, OPC_MADSH_M16);
- vectorize(ctx, instr, &r_dst, 3, b_src, 0, q_src, 0, r_src, 0);
-
- /* sub.u r, a, r */
- instr = instr_create(ctx, 2, OPC_SUB_U);
- vectorize(ctx, instr, &r_dst, 2, a_src, 0, r_src, 0);
-
- /* cov.u32f32, r, r */
- instr = instr_create(ctx, 1, 0);
- instr->cat1.src_type = get_utype(ctx);
- instr->cat1.dst_type = get_ftype(ctx);
- vectorize(ctx, instr, &r_dst, 1, r_src, 0);
-
- /* mul.f r, r, bf */
- instr = instr_create(ctx, 2, OPC_MUL_F);
- vectorize(ctx, instr, &r_dst, 2, r_src, 0, bf_src, 0);
-
- /* cov.f32u32 r, r */
- instr = instr_create(ctx, 1, 0);
- instr->cat1.src_type = get_ftype(ctx);
- instr->cat1.dst_type = get_utype(ctx);
- vectorize(ctx, instr, &r_dst, 1, r_src, 0);
-
- /* add.u q, q, r */
- instr = instr_create(ctx, 2, OPC_ADD_U);
- vectorize(ctx, instr, &q_dst, 2, q_src, 0, r_src, 0);
-
- /* mull.u r, q, b */
- instr = instr_create(ctx, 2, OPC_MULL_U);
- vectorize(ctx, instr, &r_dst, 2, q_src, 0, b_src, 0);
-
- /* madsh.m16 r, q, b, r */
- instr = instr_create(ctx, 3, OPC_MADSH_M16);
- vectorize(ctx, instr, &r_dst, 3, q_src, 0, b_src, 0, r_src, 0);
-
- /* madsh.m16 r, b, q, r */
- instr = instr_create(ctx, 3, OPC_MADSH_M16);
- vectorize(ctx, instr, &r_dst, 3, b_src, 0, q_src, 0, r_src, 0);
-
- /* sub.u r, a, r */
- instr = instr_create(ctx, 2, OPC_SUB_U);
- vectorize(ctx, instr, &r_dst, 2, a_src, 0, r_src, 0);
-
- /* cmps.u.ge r, r, b */
- instr = instr_create(ctx, 2, OPC_CMPS_U);
- instr->cat2.condition = IR3_COND_GE;
- vectorize(ctx, instr, &r_dst, 2, r_src, 0, b_src, 0);
-
- if (type_uint(src_type)) {
- /* add.u dst, q, r */
- instr = instr_create(ctx, 2, OPC_ADD_U);
- vectorize(ctx, instr, premod_dst, 2, q_src, 0, r_src, 0);
- } else {
- /* add.u q, q, r */
- instr = instr_create(ctx, 2, OPC_ADD_U);
- vectorize(ctx, instr, &q_dst, 2, q_src, 0, r_src, 0);
-
- /* negate result based on the original arguments */
- if (is_const(a) && is_const(b))
- a = get_unconst(ctx, a);
-
- /* xor.b r, numerator, denominator */
- instr = instr_create(ctx, 2, OPC_XOR_B);
- vectorize(ctx, instr, &r_dst, 2, a, 0, b, 0);
-
- /* shr.b r, r, 31 */
- instr = instr_create(ctx, 2, OPC_SHR_B);
- vectorize(ctx, instr, &r_dst, 2, r_src, 0, &thirty_one, 0);
-
- /* absneg.s b, (neg)q */
- instr = instr_create(ctx, 2, OPC_ABSNEG_S);
- vectorize(ctx, instr, &b_dst, 1, q_src, IR3_REG_SNEG);
-
- /* sel.b dst, b, r, q */
- instr = instr_create(ctx, 3, OPC_SEL_B32);
- vectorize(ctx, instr, premod_dst, 3, b_src, 0, r_src, 0, q_src, 0);
- }
-
- if (t->tgsi_opc == TGSI_OPCODE_MOD || t->tgsi_opc == TGSI_OPCODE_UMOD) {
- /* The division result will have ended up in q. */
-
- if (is_rel_or_const(b))
- b = get_unconst(ctx, b);
-
- /* mull.u r, q, b */
- instr = instr_create(ctx, 2, OPC_MULL_U);
- vectorize(ctx, instr, &r_dst, 2, q_src, 0, b, 0);
-
- /* madsh.m16 r, q, b, r */
- instr = instr_create(ctx, 3, OPC_MADSH_M16);
- vectorize(ctx, instr, &r_dst, 3, q_src, 0, b, 0, r_src, 0);
-
- /* madsh.m16 r, b, q, r */
- instr = instr_create(ctx, 3, OPC_MADSH_M16);
- vectorize(ctx, instr, &r_dst, 3, b, 0, q_src, 0, r_src, 0);
-
- /* sub.u dst, a, r */
- instr = instr_create(ctx, 2, OPC_SUB_U);
- vectorize(ctx, instr, dst, 2, a, 0, r_src, 0);
- }
-
- put_dst(ctx, inst, dst);
-}
-
-/*
- * Handlers for TGSI instructions which do have 1:1 mapping to native
- * instructions:
- */
-
-static void
-instr_cat0(const struct instr_translater *t,
- struct ir3_compile_context *ctx,
- struct tgsi_full_instruction *inst)
+struct ir3_compiler * ir3_compiler_create(uint32_t gpu_id)
{
- instr_create(ctx, 0, t->opc);
+ struct ir3_compiler *compiler = rzalloc(NULL, struct ir3_compiler);
+ compiler->gpu_id = gpu_id;
+ compiler->set = ir3_ra_alloc_reg_set(compiler);
+ return compiler;
}
-static void
-instr_cat1(const struct instr_translater *t,
- struct ir3_compile_context *ctx,
- struct tgsi_full_instruction *inst)
+void ir3_compiler_destroy(struct ir3_compiler *compiler)
{
- struct tgsi_dst_register *dst = &inst->Dst[0].Register;
- struct tgsi_src_register *src = &inst->Src[0].Register;
-
- /* NOTE: atomic start/end, rather than in create_mov() since
- * create_mov() is used already w/in atomic sequences (and
- * we aren't clever enough to deal with the nesting)
- */
- instr_atomic_start(ctx);
- create_mov(ctx, dst, src);
- instr_atomic_end(ctx);
-}
-
-static void
-instr_cat2(const struct instr_translater *t,
- struct ir3_compile_context *ctx,
- struct tgsi_full_instruction *inst)
-{
- struct tgsi_dst_register *dst = get_dst(ctx, inst);
- struct tgsi_src_register *src0 = &inst->Src[0].Register;
- struct tgsi_src_register *src1 = &inst->Src[1].Register;
- struct ir3_instruction *instr;
- unsigned src0_flags = 0, src1_flags = 0;
-
- switch (t->tgsi_opc) {
- case TGSI_OPCODE_ABS:
- src0_flags = IR3_REG_FABS;
- break;
- case TGSI_OPCODE_IABS:
- src0_flags = IR3_REG_SABS;
- break;
- case TGSI_OPCODE_INEG:
- src0_flags = IR3_REG_SNEG;
- break;
- case TGSI_OPCODE_SUB:
- src1_flags = IR3_REG_FNEG;
- break;
- }
-
- switch (t->opc) {
- case OPC_ABSNEG_F:
- case OPC_ABSNEG_S:
- case OPC_CLZ_B:
- case OPC_CLZ_S:
- case OPC_SIGN_F:
- case OPC_FLOOR_F:
- case OPC_CEIL_F:
- case OPC_RNDNE_F:
- case OPC_RNDAZ_F:
- case OPC_TRUNC_F:
- case OPC_NOT_B:
- case OPC_BFREV_B:
- case OPC_SETRM:
- case OPC_CBITS_B:
- /* these only have one src reg */
- instr = instr_create(ctx, 2, t->opc);
- vectorize(ctx, instr, dst, 1, src0, src0_flags);
- break;
- default:
- if (is_const(src0) && is_const(src1))
- src0 = get_unconst(ctx, src0);
-
- instr = instr_create(ctx, 2, t->opc);
- vectorize(ctx, instr, dst, 2, src0, src0_flags,
- src1, src1_flags);
- break;
- }
-
- put_dst(ctx, inst, dst);
-}
-
-static void
-instr_cat3(const struct instr_translater *t,
- struct ir3_compile_context *ctx,
- struct tgsi_full_instruction *inst)
-{
- struct tgsi_dst_register *dst = get_dst(ctx, inst);
- struct tgsi_src_register *src0 = &inst->Src[0].Register;
- struct tgsi_src_register *src1 = &inst->Src[1].Register;
- struct ir3_instruction *instr;
-
- /* in particular, can't handle const for src1 for cat3..
- * for mad, we can swap first two src's if needed:
- */
- if (is_rel_or_const(src1)) {
- if (is_mad(t->opc) && !is_rel_or_const(src0)) {
- struct tgsi_src_register *tmp;
- tmp = src0;
- src0 = src1;
- src1 = tmp;
- } else {
- src1 = get_unconst(ctx, src1);
- }
- }
-
- instr = instr_create(ctx, 3, t->opc);
- vectorize(ctx, instr, dst, 3, src0, 0, src1, 0,
- &inst->Src[2].Register, 0);
- put_dst(ctx, inst, dst);
-}
-
-static void
-instr_cat4(const struct instr_translater *t,
- struct ir3_compile_context *ctx,
- struct tgsi_full_instruction *inst)
-{
- struct tgsi_dst_register *dst = get_dst(ctx, inst);
- struct tgsi_src_register *src = &inst->Src[0].Register;
- struct ir3_instruction *instr;
- unsigned i;
-
- /* seems like blob compiler avoids const as src.. */
- if (is_const(src))
- src = get_unconst(ctx, src);
-
- /* we need to replicate into each component: */
- for (i = 0; i < 4; i++) {
- if (dst->WriteMask & (1 << i)) {
- instr = instr_create(ctx, 4, t->opc);
- add_dst_reg(ctx, instr, dst, i);
- add_src_reg(ctx, instr, src, src->SwizzleX);
- }
- }
-
- put_dst(ctx, inst, dst);
-}
-
-static const struct instr_translater translaters[TGSI_OPCODE_LAST] = {
-#define INSTR(n, f, ...) \
- [TGSI_OPCODE_ ## n] = { .fxn = (f), .tgsi_opc = TGSI_OPCODE_ ## n, ##__VA_ARGS__ }
-
- INSTR(MOV, instr_cat1),
- INSTR(RCP, instr_cat4, .opc = OPC_RCP),
- INSTR(RSQ, instr_cat4, .opc = OPC_RSQ),
- INSTR(SQRT, instr_cat4, .opc = OPC_SQRT),
- INSTR(MUL, instr_cat2, .opc = OPC_MUL_F),
- INSTR(ADD, instr_cat2, .opc = OPC_ADD_F),
- INSTR(SUB, instr_cat2, .opc = OPC_ADD_F),
- INSTR(MIN, instr_cat2, .opc = OPC_MIN_F),
- INSTR(MAX, instr_cat2, .opc = OPC_MAX_F),
- INSTR(UADD, instr_cat2, .opc = OPC_ADD_U),
- INSTR(IMIN, instr_cat2, .opc = OPC_MIN_S),
- INSTR(UMIN, instr_cat2, .opc = OPC_MIN_U),
- INSTR(IMAX, instr_cat2, .opc = OPC_MAX_S),
- INSTR(UMAX, instr_cat2, .opc = OPC_MAX_U),
- INSTR(AND, instr_cat2, .opc = OPC_AND_B),
- INSTR(OR, instr_cat2, .opc = OPC_OR_B),
- INSTR(NOT, instr_cat2, .opc = OPC_NOT_B),
- INSTR(XOR, instr_cat2, .opc = OPC_XOR_B),
- INSTR(UMUL, trans_umul),
- INSTR(UMAD, trans_umul),
- INSTR(UDIV, trans_idiv),
- INSTR(IDIV, trans_idiv),
- INSTR(MOD, trans_idiv),
- INSTR(UMOD, trans_idiv),
- INSTR(SHL, instr_cat2, .opc = OPC_SHL_B),
- INSTR(USHR, instr_cat2, .opc = OPC_SHR_B),
- INSTR(ISHR, instr_cat2, .opc = OPC_ASHR_B),
- INSTR(IABS, instr_cat2, .opc = OPC_ABSNEG_S),
- INSTR(INEG, instr_cat2, .opc = OPC_ABSNEG_S),
- INSTR(AND, instr_cat2, .opc = OPC_AND_B),
- INSTR(MAD, instr_cat3, .opc = OPC_MAD_F32, .hopc = OPC_MAD_F16),
- INSTR(TRUNC, instr_cat2, .opc = OPC_TRUNC_F),
- INSTR(CLAMP, trans_clamp),
- INSTR(FLR, instr_cat2, .opc = OPC_FLOOR_F),
- INSTR(ROUND, instr_cat2, .opc = OPC_RNDNE_F),
- INSTR(SSG, instr_cat2, .opc = OPC_SIGN_F),
- INSTR(CEIL, instr_cat2, .opc = OPC_CEIL_F),
- INSTR(ARL, trans_arl),
- INSTR(UARL, trans_arl),
- INSTR(EX2, instr_cat4, .opc = OPC_EXP2),
- INSTR(LG2, instr_cat4, .opc = OPC_LOG2),
- INSTR(ABS, instr_cat2, .opc = OPC_ABSNEG_F),
- INSTR(COS, instr_cat4, .opc = OPC_COS),
- INSTR(SIN, instr_cat4, .opc = OPC_SIN),
- INSTR(TEX, trans_samp, .opc = OPC_SAM),
- INSTR(TXP, trans_samp, .opc = OPC_SAM),
- INSTR(TXB, trans_samp, .opc = OPC_SAMB),
- INSTR(TXB2, trans_samp, .opc = OPC_SAMB),
- INSTR(TXL, trans_samp, .opc = OPC_SAML),
- INSTR(TXD, trans_samp, .opc = OPC_SAMGQ),
- INSTR(TXF, trans_samp, .opc = OPC_ISAML),
- INSTR(TXQ, trans_txq),
- INSTR(DDX, trans_deriv, .opc = OPC_DSX),
- INSTR(DDY, trans_deriv, .opc = OPC_DSY),
- INSTR(SGT, trans_cmp),
- INSTR(SLT, trans_cmp),
- INSTR(FSLT, trans_cmp),
- INSTR(SGE, trans_cmp),
- INSTR(FSGE, trans_cmp),
- INSTR(SLE, trans_cmp),
- INSTR(SNE, trans_cmp),
- INSTR(FSNE, trans_cmp),
- INSTR(SEQ, trans_cmp),
- INSTR(FSEQ, trans_cmp),
- INSTR(CMP, trans_cmp),
- INSTR(USNE, trans_icmp, .opc = OPC_CMPS_U),
- INSTR(USEQ, trans_icmp, .opc = OPC_CMPS_U),
- INSTR(ISGE, trans_icmp, .opc = OPC_CMPS_S),
- INSTR(USGE, trans_icmp, .opc = OPC_CMPS_U),
- INSTR(ISLT, trans_icmp, .opc = OPC_CMPS_S),
- INSTR(USLT, trans_icmp, .opc = OPC_CMPS_U),
- INSTR(UCMP, trans_ucmp),
- INSTR(ISSG, trans_issg),
- INSTR(IF, trans_if, .opc = OPC_CMPS_F),
- INSTR(UIF, trans_if, .opc = OPC_CMPS_U),
- INSTR(ELSE, trans_else),
- INSTR(ENDIF, trans_endif),
- INSTR(END, instr_cat0, .opc = OPC_END),
- INSTR(KILL, trans_kill, .opc = OPC_KILL),
- INSTR(KILL_IF, trans_killif, .opc = OPC_KILL),
- INSTR(I2F, trans_cov),
- INSTR(U2F, trans_cov),
- INSTR(F2I, trans_cov),
- INSTR(F2U, trans_cov),
-};
-
-static ir3_semantic
-decl_semantic(const struct tgsi_declaration_semantic *sem)
-{
- return ir3_semantic_name(sem->Name, sem->Index);
-}
-
-static struct ir3_instruction *
-decl_in_frag_bary(struct ir3_compile_context *ctx, unsigned regid,
- unsigned j, unsigned inloc, bool use_ldlv)
-{
- struct ir3_instruction *instr;
- struct ir3_register *src;
-
- if (use_ldlv) {
- /* ldlv.u32 dst, l[#inloc], 1 */
- instr = instr_create(ctx, 6, OPC_LDLV);
- instr->cat6.type = TYPE_U32;
- instr->cat6.iim_val = 1;
- ir3_reg_create(instr, regid, 0); /* dummy dst */
- ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = inloc;
- ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 1;
-
- return instr;
- }
-
- /* bary.f dst, #inloc, r0.x */
- instr = instr_create(ctx, 2, OPC_BARY_F);
- ir3_reg_create(instr, regid, 0); /* dummy dst */
- ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = inloc;
- src = ir3_reg_create(instr, 0, IR3_REG_SSA);
- src->wrmask = 0x3;
- src->instr = ctx->frag_pos;
-
- return instr;
-}
-
-/* TGSI_SEMANTIC_POSITION
- * """"""""""""""""""""""
- *
- * For fragment shaders, TGSI_SEMANTIC_POSITION is used to indicate that
- * fragment shader input contains the fragment's window position. The X
- * component starts at zero and always increases from left to right.
- * The Y component starts at zero and always increases but Y=0 may either
- * indicate the top of the window or the bottom depending on the fragment
- * coordinate origin convention (see TGSI_PROPERTY_FS_COORD_ORIGIN).
- * The Z coordinate ranges from 0 to 1 to represent depth from the front
- * to the back of the Z buffer. The W component contains the reciprocol
- * of the interpolated vertex position W component.
- */
-static struct ir3_instruction *
-decl_in_frag_coord(struct ir3_compile_context *ctx, unsigned regid,
- unsigned j)
-{
- struct ir3_instruction *instr, *src;
-
- compile_assert(ctx, !ctx->frag_coord[j]);
-
- ctx->frag_coord[j] = create_input(ctx->block, NULL, 0);
-
-
- switch (j) {
- case 0: /* .x */
- case 1: /* .y */
- /* for frag_coord, we get unsigned values.. we need
- * to subtract (integer) 8 and divide by 16 (right-
- * shift by 4) then convert to float:
- */
-
- /* add.s tmp, src, -8 */
- instr = instr_create(ctx, 2, OPC_ADD_S);
- ir3_reg_create(instr, regid, 0); /* dummy dst */
- ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->frag_coord[j];
- ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = -8;
- src = instr;
-
- /* shr.b tmp, tmp, 4 */
- instr = instr_create(ctx, 2, OPC_SHR_B);
- ir3_reg_create(instr, regid, 0); /* dummy dst */
- ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
- ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 4;
- src = instr;
-
- /* mov.u32f32 dst, tmp */
- instr = instr_create(ctx, 1, 0);
- instr->cat1.src_type = TYPE_U32;
- instr->cat1.dst_type = TYPE_F32;
- ir3_reg_create(instr, regid, 0); /* dummy dst */
- ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
-
- break;
- case 2: /* .z */
- case 3: /* .w */
- /* seems that we can use these as-is: */
- instr = ctx->frag_coord[j];
- break;
- default:
- compile_error(ctx, "invalid channel\n");
- instr = create_immed(ctx, 0.0);
- break;
- }
-
- return instr;
-}
-
-/* TGSI_SEMANTIC_FACE
- * """"""""""""""""""
- *
- * This label applies to fragment shader inputs only and indicates that
- * the register contains front/back-face information of the form (F, 0,
- * 0, 1). The first component will be positive when the fragment belongs
- * to a front-facing polygon, and negative when the fragment belongs to a
- * back-facing polygon.
- */
-static struct ir3_instruction *
-decl_in_frag_face(struct ir3_compile_context *ctx, unsigned regid,
- unsigned j)
-{
- struct ir3_instruction *instr, *src;
-
- switch (j) {
- case 0: /* .x */
- compile_assert(ctx, !ctx->frag_face);
-
- ctx->frag_face = create_input(ctx->block, NULL, 0);
-
- /* for faceness, we always get -1 or 0 (int).. but TGSI expects
- * positive vs negative float.. and piglit further seems to
- * expect -1.0 or 1.0:
- *
- * mul.s tmp, hr0.x, 2
- * add.s tmp, tmp, 1
- * mov.s16f32, dst, tmp
- *
- */
-
- instr = instr_create(ctx, 2, OPC_MUL_S);
- ir3_reg_create(instr, regid, 0); /* dummy dst */
- ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->frag_face;
- ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2;
- src = instr;
-
- instr = instr_create(ctx, 2, OPC_ADD_S);
- ir3_reg_create(instr, regid, 0); /* dummy dst */
- ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
- ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 1;
- src = instr;
-
- instr = instr_create(ctx, 1, 0); /* mov */
- instr->cat1.src_type = TYPE_S32;
- instr->cat1.dst_type = TYPE_F32;
- ir3_reg_create(instr, regid, 0); /* dummy dst */
- ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
-
- break;
- case 1: /* .y */
- case 2: /* .z */
- instr = create_immed(ctx, 0.0);
- break;
- case 3: /* .w */
- instr = create_immed(ctx, 1.0);
- break;
- default:
- compile_error(ctx, "invalid channel\n");
- instr = create_immed(ctx, 0.0);
- break;
- }
-
- return instr;
-}
-
-static void
-decl_in(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl)
-{
- struct ir3_shader_variant *so = ctx->so;
- unsigned name = decl->Semantic.Name;
- unsigned i;
-
- /* I don't think we should get frag shader input without
- * semantic info? Otherwise how do inputs get linked to
- * vert outputs?
- */
- compile_assert(ctx, (ctx->type == TGSI_PROCESSOR_VERTEX) ||
- decl->Declaration.Semantic);
-
- for (i = decl->Range.First; i <= decl->Range.Last; i++) {
- unsigned n = so->inputs_count++;
- unsigned r = regid(i, 0);
- unsigned ncomp, j;
-
- /* we'll figure out the actual components used after scheduling */
- ncomp = 4;
-
- DBG("decl in -> r%d", i);
-
- compile_assert(ctx, n < ARRAY_SIZE(so->inputs));
-
- so->inputs[n].semantic = decl_semantic(&decl->Semantic);
- so->inputs[n].compmask = (1 << ncomp) - 1;
- so->inputs[n].regid = r;
- so->inputs[n].inloc = ctx->next_inloc;
- so->inputs[n].interpolate = decl->Interp.Interpolate;
-
- for (j = 0; j < ncomp; j++) {
- struct ir3_instruction *instr = NULL;
-
- if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
- /* for fragment shaders, POSITION and FACE are handled
- * specially, not using normal varying / bary.f
- */
- if (name == TGSI_SEMANTIC_POSITION) {
- so->inputs[n].bary = false;
- so->frag_coord = true;
- instr = decl_in_frag_coord(ctx, r + j, j);
- } else if (name == TGSI_SEMANTIC_FACE) {
- so->inputs[n].bary = false;
- so->frag_face = true;
- instr = decl_in_frag_face(ctx, r + j, j);
- } else {
- bool use_ldlv = false;
-
- /* if no interpolation given, pick based on
- * semantic:
- */
- if (!decl->Declaration.Interpolate) {
- switch (decl->Semantic.Name) {
- case TGSI_SEMANTIC_COLOR:
- so->inputs[n].interpolate =
- TGSI_INTERPOLATE_COLOR;
- break;
- default:
- so->inputs[n].interpolate =
- TGSI_INTERPOLATE_LINEAR;
- }
- }
-
- if (ctx->flat_bypass) {
- switch (so->inputs[n].interpolate) {
- case TGSI_INTERPOLATE_COLOR:
- if (!ctx->so->key.rasterflat)
- break;
- /* fallthrough */
- case TGSI_INTERPOLATE_CONSTANT:
- use_ldlv = true;
- break;
- }
- }
-
- so->inputs[n].bary = true;
-
- instr = decl_in_frag_bary(ctx, r + j, j,
- so->inputs[n].inloc + j - 8, use_ldlv);
- }
- } else {
- instr = create_input(ctx->block, NULL, (i * 4) + j);
- }
-
- ctx->block->inputs[(i * 4) + j] = instr;
- }
-
- if (so->inputs[n].bary || (ctx->type == TGSI_PROCESSOR_VERTEX)) {
- ctx->next_inloc += ncomp;
- so->total_in += ncomp;
- }
- }
-}
-
-static void
-decl_sv(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl)
-{
- struct ir3_shader_variant *so = ctx->so;
- unsigned r = regid(so->inputs_count, 0);
- unsigned n = so->inputs_count++;
-
- DBG("decl sv -> r%d", n);
-
- compile_assert(ctx, n < ARRAY_SIZE(so->inputs));
- compile_assert(ctx, decl->Range.First < ARRAY_SIZE(ctx->sysval_semantics));
-
- ctx->sysval_semantics[decl->Range.First] = decl->Semantic.Name;
- so->inputs[n].semantic = decl_semantic(&decl->Semantic);
- so->inputs[n].compmask = 1;
- so->inputs[n].regid = r;
- so->inputs[n].inloc = ctx->next_inloc;
- so->inputs[n].interpolate = TGSI_INTERPOLATE_CONSTANT;
-
- struct ir3_instruction *instr = NULL;
-
- switch (decl->Semantic.Name) {
- case TGSI_SEMANTIC_VERTEXID_NOBASE:
- ctx->vertex_id = instr = create_input(ctx->block, NULL, r);
- break;
- case TGSI_SEMANTIC_BASEVERTEX:
- ctx->basevertex = instr = instr_create(ctx, 1, 0);
- instr->cat1.src_type = get_stype(ctx);
- instr->cat1.dst_type = get_stype(ctx);
- ir3_reg_create(instr, 0, 0);
- ir3_reg_create(instr, regid(so->first_driver_param + 4, 0),
- IR3_REG_CONST);
- break;
- case TGSI_SEMANTIC_INSTANCEID:
- ctx->instance_id = instr = create_input(ctx->block, NULL, r);
- break;
- default:
- compile_error(ctx, "Unknown semantic: %s\n",
- tgsi_semantic_names[decl->Semantic.Name]);
- }
-
- ctx->block->inputs[r] = instr;
- ctx->next_inloc++;
- so->total_in++;
-}
-
-static void
-decl_out(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl)
-{
- struct ir3_shader_variant *so = ctx->so;
- unsigned comp = 0;
- unsigned name = decl->Semantic.Name;
- unsigned i;
-
- compile_assert(ctx, decl->Declaration.Semantic);
-
- DBG("decl out[%d] -> r%d", name, decl->Range.First);
-
- if (ctx->type == TGSI_PROCESSOR_VERTEX) {
- switch (name) {
- case TGSI_SEMANTIC_POSITION:
- so->writes_pos = true;
- break;
- case TGSI_SEMANTIC_PSIZE:
- so->writes_psize = true;
- break;
- case TGSI_SEMANTIC_COLOR:
- case TGSI_SEMANTIC_BCOLOR:
- case TGSI_SEMANTIC_GENERIC:
- case TGSI_SEMANTIC_FOG:
- case TGSI_SEMANTIC_TEXCOORD:
- break;
- default:
- compile_error(ctx, "unknown VS semantic name: %s\n",
- tgsi_semantic_names[name]);
- }
- } else {
- switch (name) {
- case TGSI_SEMANTIC_POSITION:
- comp = 2; /* tgsi will write to .z component */
- so->writes_pos = true;
- break;
- case TGSI_SEMANTIC_COLOR:
- break;
- default:
- compile_error(ctx, "unknown FS semantic name: %s\n",
- tgsi_semantic_names[name]);
- }
- }
-
- for (i = decl->Range.First; i <= decl->Range.Last; i++) {
- unsigned n = so->outputs_count++;
- unsigned ncomp, j;
-
- ncomp = 4;
-
- compile_assert(ctx, n < ARRAY_SIZE(so->outputs));
-
- so->outputs[n].semantic = decl_semantic(&decl->Semantic);
- so->outputs[n].regid = regid(i, comp);
-
- /* avoid undefined outputs, stick a dummy mov from imm{0.0},
- * which if the output is actually assigned will be over-
- * written
- */
- for (j = 0; j < ncomp; j++)
- ctx->block->outputs[(i * 4) + j] = create_immed(ctx, 0.0);
- }
-}
-
-/* from TGSI perspective, we actually have inputs. But most of the "inputs"
- * for a fragment shader are just bary.f instructions. The *actual* inputs
- * from the hw perspective are the frag_pos and optionally frag_coord and
- * frag_face.
- */
-static void
-fixup_frag_inputs(struct ir3_compile_context *ctx)
-{
- struct ir3_shader_variant *so = ctx->so;
- struct ir3_block *block = ctx->block;
- struct ir3_instruction **inputs;
- struct ir3_instruction *instr;
- int n, regid = 0;
-
- block->ninputs = 0;
-
- n = 4; /* always have frag_pos */
- n += COND(so->frag_face, 4);
- n += COND(so->frag_coord, 4);
-
- inputs = ir3_alloc(ctx->ir, n * (sizeof(struct ir3_instruction *)));
-
- if (so->frag_face) {
- /* this ultimately gets assigned to hr0.x so doesn't conflict
- * with frag_coord/frag_pos..
- */
- inputs[block->ninputs++] = ctx->frag_face;
- ctx->frag_face->regs[0]->num = 0;
-
- /* remaining channels not used, but let's avoid confusing
- * other parts that expect inputs to come in groups of vec4
- */
- inputs[block->ninputs++] = NULL;
- inputs[block->ninputs++] = NULL;
- inputs[block->ninputs++] = NULL;
- }
-
- /* since we don't know where to set the regid for frag_coord,
- * we have to use r0.x for it. But we don't want to *always*
- * use r1.x for frag_pos as that could increase the register
- * footprint on simple shaders:
- */
- if (so->frag_coord) {
- ctx->frag_coord[0]->regs[0]->num = regid++;
- ctx->frag_coord[1]->regs[0]->num = regid++;
- ctx->frag_coord[2]->regs[0]->num = regid++;
- ctx->frag_coord[3]->regs[0]->num = regid++;
-
- inputs[block->ninputs++] = ctx->frag_coord[0];
- inputs[block->ninputs++] = ctx->frag_coord[1];
- inputs[block->ninputs++] = ctx->frag_coord[2];
- inputs[block->ninputs++] = ctx->frag_coord[3];
- }
-
- /* we always have frag_pos: */
- so->pos_regid = regid;
-
- /* r0.x */
- instr = create_input(block, NULL, block->ninputs);
- instr->regs[0]->num = regid++;
- inputs[block->ninputs++] = instr;
- ctx->frag_pos->regs[1]->instr = instr;
-
- /* r0.y */
- instr = create_input(block, NULL, block->ninputs);
- instr->regs[0]->num = regid++;
- inputs[block->ninputs++] = instr;
- ctx->frag_pos->regs[2]->instr = instr;
-
- block->inputs = inputs;
-}
-
-static void
-compile_instructions(struct ir3_compile_context *ctx)
-{
- push_block(ctx);
-
- /* for fragment shader, we have a single input register (usually
- * r0.xy) which is used as the base for bary.f varying fetch instrs:
- */
- if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
- struct ir3_instruction *instr;
- instr = ir3_instr_create(ctx->block, -1, OPC_META_FI);
- ir3_reg_create(instr, 0, 0);
- ir3_reg_create(instr, 0, IR3_REG_SSA); /* r0.x */
- ir3_reg_create(instr, 0, IR3_REG_SSA); /* r0.y */
- ctx->frag_pos = instr;
- }
-
- while (!tgsi_parse_end_of_tokens(&ctx->parser)) {
- tgsi_parse_token(&ctx->parser);
-
- switch (ctx->parser.FullToken.Token.Type) {
- case TGSI_TOKEN_TYPE_DECLARATION: {
- struct tgsi_full_declaration *decl =
- &ctx->parser.FullToken.FullDeclaration;
- unsigned file = decl->Declaration.File;
- if (file == TGSI_FILE_OUTPUT) {
- decl_out(ctx, decl);
- } else if (file == TGSI_FILE_INPUT) {
- decl_in(ctx, decl);
- } else if (decl->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
- decl_sv(ctx, decl);
- }
-
- if ((file != TGSI_FILE_CONSTANT) && decl->Declaration.Array) {
- int aid = decl->Array.ArrayID + ctx->array_offsets[file];
-
- compile_assert(ctx, aid < ARRAY_SIZE(ctx->array));
-
- /* legacy ArrayID==0 stuff probably isn't going to work
- * well (and is at least untested).. let's just scream:
- */
- compile_assert(ctx, aid != 0);
-
- ctx->array[aid].first = decl->Range.First;
- ctx->array[aid].last = decl->Range.Last;
- }
- break;
- }
- case TGSI_TOKEN_TYPE_IMMEDIATE: {
- /* TODO: if we know the immediate is small enough, and only
- * used with instructions that can embed an immediate, we
- * can skip this:
- */
- struct tgsi_full_immediate *imm =
- &ctx->parser.FullToken.FullImmediate;
- unsigned n = ctx->so->immediates_count++;
- compile_assert(ctx, n < ARRAY_SIZE(ctx->so->immediates));
- memcpy(ctx->so->immediates[n].val, imm->u, 16);
- break;
- }
- case TGSI_TOKEN_TYPE_INSTRUCTION: {
- struct tgsi_full_instruction *inst =
- &ctx->parser.FullToken.FullInstruction;
- unsigned opc = inst->Instruction.Opcode;
- const struct instr_translater *t = &translaters[opc];
-
- if (t->fxn) {
- t->fxn(t, ctx, inst);
- ctx->num_internal_temps = 0;
-
- compile_assert(ctx, !ctx->using_tmp_dst);
- } else {
- compile_error(ctx, "unknown TGSI opc: %s\n",
- tgsi_get_opcode_name(opc));
- }
-
- switch (inst->Instruction.Saturate) {
- case TGSI_SAT_ZERO_ONE:
- create_clamp_imm(ctx, &inst->Dst[0].Register,
- fui(0.0), fui(1.0));
- break;
- case TGSI_SAT_MINUS_PLUS_ONE:
- create_clamp_imm(ctx, &inst->Dst[0].Register,
- fui(-1.0), fui(1.0));
- break;
- }
-
- instr_finish(ctx);
-
- break;
- }
- case TGSI_TOKEN_TYPE_PROPERTY: {
- struct tgsi_full_property *prop =
- &ctx->parser.FullToken.FullProperty;
- switch (prop->Property.PropertyName) {
- case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS:
- ctx->so->color0_mrt = !!prop->u[0].Data;
- break;
- }
- }
- default:
- break;
- }
- }
-}
-
-static void
-compile_dump(struct ir3_compile_context *ctx)
-{
- const char *name = (ctx->so->type == SHADER_VERTEX) ? "vert" : "frag";
- static unsigned n = 0;
- char fname[16];
- FILE *f;
- snprintf(fname, sizeof(fname), "%s-%04u.dot", name, n++);
- f = fopen(fname, "w");
- if (!f)
- return;
- ir3_block_depth(ctx->block);
- ir3_dump(ctx->ir, name, ctx->block, f);
- fclose(f);
-}
-
-int
-ir3_compile_shader(struct ir3_shader_variant *so,
- const struct tgsi_token *tokens, struct ir3_shader_key key,
- bool cp)
-{
- struct ir3_compile_context ctx;
- struct ir3_block *block;
- struct ir3_instruction **inputs;
- unsigned i, j, actual_in;
- int ret = 0, max_bary;
-
- assert(!so->ir);
-
- so->ir = ir3_create();
-
- assert(so->ir);
-
- if (compile_init(&ctx, so, tokens) != TGSI_PARSE_OK) {
- DBG("INIT failed!");
- ret = -1;
- goto out;
- }
-
- /* for now, until the edge cases are worked out: */
- if (ctx.info.indirect_files_written & (FM(TEMPORARY) | FM(INPUT) | FM(OUTPUT)))
- cp = false;
-
- compile_instructions(&ctx);
-
- block = ctx.block;
- so->ir->block = block;
-
- /* keep track of the inputs from TGSI perspective.. */
- inputs = block->inputs;
-
- /* but fixup actual inputs for frag shader: */
- if (ctx.type == TGSI_PROCESSOR_FRAGMENT)
- fixup_frag_inputs(&ctx);
-
- /* at this point, for binning pass, throw away unneeded outputs: */
- if (key.binning_pass) {
- for (i = 0, j = 0; i < so->outputs_count; i++) {
- unsigned name = sem2name(so->outputs[i].semantic);
- unsigned idx = sem2idx(so->outputs[i].semantic);
-
- /* throw away everything but first position/psize */
- if ((idx == 0) && ((name == TGSI_SEMANTIC_POSITION) ||
- (name == TGSI_SEMANTIC_PSIZE))) {
- if (i != j) {
- so->outputs[j] = so->outputs[i];
- block->outputs[(j*4)+0] = block->outputs[(i*4)+0];
- block->outputs[(j*4)+1] = block->outputs[(i*4)+1];
- block->outputs[(j*4)+2] = block->outputs[(i*4)+2];
- block->outputs[(j*4)+3] = block->outputs[(i*4)+3];
- }
- j++;
- }
- }
- so->outputs_count = j;
- block->noutputs = j * 4;
- }
-
- /* if we want half-precision outputs, mark the output registers
- * as half:
- */
- if (key.half_precision) {
- for (i = 0; i < block->noutputs; i++) {
- if (!block->outputs[i])
- continue;
- block->outputs[i]->regs[0]->flags |= IR3_REG_HALF;
- }
- }
-
- /* at this point, we want the kill's in the outputs array too,
- * so that they get scheduled (since they have no dst).. we've
- * already ensured that the array is big enough in push_block():
- */
- if (ctx.type == TGSI_PROCESSOR_FRAGMENT) {
- for (i = 0; i < ctx.kill_count; i++)
- block->outputs[block->noutputs++] = ctx.kill[i];
- }
-
- if (fd_mesa_debug & FD_DBG_OPTDUMP)
- compile_dump(&ctx);
-
- ret = ir3_block_flatten(block);
- if (ret < 0) {
- DBG("FLATTEN failed!");
- goto out;
- }
- if ((ret > 0) && (fd_mesa_debug & FD_DBG_OPTDUMP))
- compile_dump(&ctx);
-
- if (fd_mesa_debug & FD_DBG_OPTMSGS) {
- printf("BEFORE CP:\n");
- ir3_dump_instr_list(block->head);
- }
-
- ir3_block_depth(block);
-
- /* First remove all the extra mov's (which we could skip if the
- * front-end was clever enough not to insert them in the first
- * place). Then figure out left/right neighbors, re-inserting
- * extra mov's when needed to avoid conflicts.
- */
- if (cp && !(fd_mesa_debug & FD_DBG_NOCP))
- ir3_block_cp(block);
-
- if (fd_mesa_debug & FD_DBG_OPTMSGS) {
- printf("BEFORE GROUPING:\n");
- ir3_dump_instr_list(block->head);
- }
-
- /* Group left/right neighbors, inserting mov's where needed to
- * solve conflicts:
- */
- ir3_block_group(block);
-
- if (fd_mesa_debug & FD_DBG_OPTDUMP)
- compile_dump(&ctx);
-
- ir3_block_depth(block);
-
- if (fd_mesa_debug & FD_DBG_OPTMSGS) {
- printf("AFTER DEPTH:\n");
- ir3_dump_instr_list(block->head);
- }
-
- ret = ir3_block_sched(block);
- if (ret) {
- DBG("SCHED failed!");
- goto out;
- }
-
- if (fd_mesa_debug & FD_DBG_OPTMSGS) {
- printf("AFTER SCHED:\n");
- ir3_dump_instr_list(block->head);
- }
-
- ret = ir3_block_ra(block, so->type, so->frag_coord, so->frag_face);
- if (ret) {
- DBG("RA failed!");
- goto out;
- }
-
- if (fd_mesa_debug & FD_DBG_OPTMSGS) {
- printf("AFTER RA:\n");
- ir3_dump_instr_list(block->head);
- }
-
- ir3_block_legalize(block, &so->has_samp, &max_bary);
-
- /* fixup input/outputs: */
- for (i = 0; i < so->outputs_count; i++) {
- so->outputs[i].regid = block->outputs[i*4]->regs[0]->num;
- /* preserve hack for depth output.. tgsi writes depth to .z,
- * but what we give the hw is the scalar register:
- */
- if ((ctx.type == TGSI_PROCESSOR_FRAGMENT) &&
- (sem2name(so->outputs[i].semantic) == TGSI_SEMANTIC_POSITION))
- so->outputs[i].regid += 2;
- }
- /* Note that some or all channels of an input may be unused: */
- actual_in = 0;
- for (i = 0; i < so->inputs_count; i++) {
- unsigned j, regid = ~0, compmask = 0;
- so->inputs[i].ncomp = 0;
- for (j = 0; j < 4; j++) {
- struct ir3_instruction *in = inputs[(i*4) + j];
- if (in) {
- compmask |= (1 << j);
- regid = in->regs[0]->num - j;
- actual_in++;
- so->inputs[i].ncomp++;
- }
- }
- so->inputs[i].regid = regid;
- so->inputs[i].compmask = compmask;
- }
-
- /* fragment shader always gets full vec4's even if it doesn't
- * fetch all components, but vertex shader we need to update
- * with the actual number of components fetch, otherwise thing
- * will hang due to mismaptch between VFD_DECODE's and
- * TOTALATTRTOVS
- */
- if (so->type == SHADER_VERTEX)
- so->total_in = actual_in;
- else
- so->total_in = align(max_bary + 1, 4);
-
-out:
- if (ret) {
- ir3_destroy(so->ir);
- so->ir = NULL;
- }
- compile_free(&ctx);
-
- return ret;
+ ralloc_free(compiler);
}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.h b/src/gallium/drivers/freedreno/ir3/ir3_compiler.h
index 9213386e00c..86b1161d9cb 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler.h
@@ -31,12 +31,19 @@
#include "ir3_shader.h"
+struct ir3_ra_reg_set;
-int ir3_compile_shader_nir(struct ir3_shader_variant *so,
- const struct tgsi_token *tokens, struct ir3_shader_key key);
+struct ir3_compiler {
+ uint32_t gpu_id;
+ struct ir3_ra_reg_set *set;
+};
-int ir3_compile_shader(struct ir3_shader_variant *so,
+struct ir3_compiler * ir3_compiler_create(uint32_t gpu_id);
+void ir3_compiler_destroy(struct ir3_compiler *compiler);
+
+int ir3_compile_shader_nir(struct ir3_compiler *compiler,
+ struct ir3_shader_variant *so,
const struct tgsi_token *tokens,
- struct ir3_shader_key key, bool cp);
+ struct ir3_shader_key key);
#endif /* IR3_COMPILER_H_ */
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 05e7049ad55..48b1d8f3606 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -48,19 +48,19 @@
#include "ir3.h"
-static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val);
-
struct ir3_compile {
+ struct ir3_compiler *compiler;
+
const struct tgsi_token *tokens;
struct nir_shader *s;
struct ir3 *ir;
struct ir3_shader_variant *so;
- /* bitmask of which samplers are integer: */
- uint16_t integer_s;
+ struct ir3_block *block; /* the current block */
+ struct ir3_block *in_block; /* block created for shader inputs */
- struct ir3_block *block;
+ nir_function_impl *impl;
/* For fragment shaders, from the hw perspective the only
* actual input is r0.xy position register passed to bary.f.
@@ -92,6 +92,11 @@ struct ir3_compile {
*/
struct hash_table *addr_ht;
+ /* maps nir_block to ir3_block, mostly for the purposes of
+ * figuring out the blocks successors
+ */
+ struct hash_table *block_ht;
+
/* for calculating input/output positions/linkages: */
unsigned next_inloc;
@@ -104,6 +109,11 @@ struct ir3_compile {
*/
bool levels_add_one;
+ /* on a3xx, we need to scale up integer coords for isaml based
+ * on LoD:
+ */
+ bool unminify_coords;
+
/* for looking up which system value is which */
unsigned sysval_semantics[8];
@@ -118,6 +128,9 @@ struct ir3_compile {
};
+static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val);
+static struct ir3_block * get_block(struct ir3_compile *ctx, nir_block *nblock);
+
static struct nir_shader *to_nir(const struct tgsi_token *tokens)
{
struct nir_shader_compiler_options options = {
@@ -146,6 +159,7 @@ static struct nir_shader *to_nir(const struct tgsi_token *tokens)
nir_lower_vars_to_ssa(s);
nir_lower_alu_to_scalar(s);
+ nir_lower_phis_to_scalar(s);
progress |= nir_copy_prop(s);
progress |= nir_opt_dce(s);
@@ -170,7 +184,8 @@ static struct nir_shader *to_nir(const struct tgsi_token *tokens)
/* TODO nir doesn't lower everything for us yet, but ideally it would: */
static const struct tgsi_token *
-lower_tgsi(const struct tgsi_token *tokens, struct ir3_shader_variant *so)
+lower_tgsi(struct ir3_compile *ctx, const struct tgsi_token *tokens,
+ struct ir3_shader_variant *so)
{
struct tgsi_shader_info info;
struct tgsi_lowering_config lconfig = {
@@ -192,11 +207,7 @@ lower_tgsi(const struct tgsi_token *tokens, struct ir3_shader_variant *so)
break;
}
- if (!so->shader) {
- /* hack for standalone compiler which does not have
- * screen/context:
- */
- } else if (ir3_shader_gpuid(so->shader) >= 400) {
+ if (ctx->compiler->gpu_id >= 400) {
/* a4xx seems to have *no* sam.p */
lconfig.lower_TXP = ~0; /* lower all txp */
} else {
@@ -208,36 +219,26 @@ lower_tgsi(const struct tgsi_token *tokens, struct ir3_shader_variant *so)
}
static struct ir3_compile *
-compile_init(struct ir3_shader_variant *so,
+compile_init(struct ir3_compiler *compiler,
+ struct ir3_shader_variant *so,
const struct tgsi_token *tokens)
{
struct ir3_compile *ctx = rzalloc(NULL, struct ir3_compile);
const struct tgsi_token *lowered_tokens;
- if (!so->shader) {
- /* hack for standalone compiler which does not have
- * screen/context:
- */
- } else if (ir3_shader_gpuid(so->shader) >= 400) {
+ if (compiler->gpu_id >= 400) {
/* need special handling for "flat" */
ctx->flat_bypass = true;
ctx->levels_add_one = false;
+ ctx->unminify_coords = false;
} else {
/* no special handling for "flat" */
ctx->flat_bypass = false;
ctx->levels_add_one = true;
+ ctx->unminify_coords = true;
}
- switch (so->type) {
- case SHADER_FRAGMENT:
- case SHADER_COMPUTE:
- ctx->integer_s = so->key.finteger_s;
- break;
- case SHADER_VERTEX:
- ctx->integer_s = so->key.vinteger_s;
- break;
- }
-
+ ctx->compiler = compiler;
ctx->ir = so->ir;
ctx->so = so;
ctx->next_inloc = 8;
@@ -247,8 +248,10 @@ compile_init(struct ir3_shader_variant *so,
_mesa_hash_pointer, _mesa_key_pointer_equal);
ctx->addr_ht = _mesa_hash_table_create(ctx,
_mesa_hash_pointer, _mesa_key_pointer_equal);
+ ctx->block_ht = _mesa_hash_table_create(ctx,
+ _mesa_hash_pointer, _mesa_key_pointer_equal);
- lowered_tokens = lower_tgsi(tokens, so);
+ lowered_tokens = lower_tgsi(ctx, tokens, so);
if (!lowered_tokens)
lowered_tokens = tokens;
ctx->s = to_nir(lowered_tokens);
@@ -290,33 +293,206 @@ compile_free(struct ir3_compile *ctx)
ralloc_free(ctx);
}
-
+/* global per-array information: */
struct ir3_array {
unsigned length, aid;
+};
+
+/* per-block array state: */
+struct ir3_array_value {
+ /* TODO drop length/aid, and just have ptr back to ir3_array */
+ unsigned length, aid;
+ /* initial array element values are phi's, other than for the
+ * entry block. The phi src's get added later in a resolve step
+ * after we have visited all the blocks, to account for back
+ * edges in the cfg.
+ */
+ struct ir3_instruction **phis;
+ /* current array element values (as block is processed). When
+ * the array phi's are resolved, it will contain the array state
+ * at exit of block, so successor blocks can use it to add their
+ * phi srcs.
+ */
struct ir3_instruction *arr[];
};
+/* track array assignments per basic block. When an array is read
+ * outside of the same basic block, we can use NIR's dominance-frontier
+ * information to figure out where phi nodes are needed.
+ */
+struct ir3_nir_block_data {
+ unsigned foo;
+ /* indexed by array-id (aid): */
+ struct ir3_array_value *arrs[];
+};
+
+static struct ir3_nir_block_data *
+get_block_data(struct ir3_compile *ctx, struct ir3_block *block)
+{
+ if (!block->bd) {
+ struct ir3_nir_block_data *bd = ralloc_size(ctx, sizeof(*bd) +
+ ((ctx->num_arrays + 1) * sizeof(bd->arrs[0])));
+ block->bd = bd;
+ }
+ return block->bd;
+}
+
static void
declare_var(struct ir3_compile *ctx, nir_variable *var)
{
unsigned length = glsl_get_length(var->type) * 4; /* always vec4, at least with ttn */
- struct ir3_array *arr = ralloc_size(ctx, sizeof(*arr) +
- (length * sizeof(arr->arr[0])));
+ struct ir3_array *arr = ralloc(ctx, struct ir3_array);
arr->length = length;
arr->aid = ++ctx->num_arrays;
- /* Some shaders end up reading array elements without first writing..
- * so initialize things to prevent null instr ptrs later:
- */
- for (unsigned i = 0; i < length; i++)
- arr->arr[i] = create_immed(ctx->block, 0);
_mesa_hash_table_insert(ctx->var_ht, var, arr);
}
-static struct ir3_array *
+static nir_block *
+nir_block_pred(nir_block *block)
+{
+ assert(block->predecessors->entries < 2);
+ if (block->predecessors->entries == 0)
+ return NULL;
+ return (nir_block *)_mesa_set_next_entry(block->predecessors, NULL)->key;
+}
+
+static struct ir3_array_value *
get_var(struct ir3_compile *ctx, nir_variable *var)
{
struct hash_entry *entry = _mesa_hash_table_search(ctx->var_ht, var);
- return entry->data;
+ struct ir3_block *block = ctx->block;
+ struct ir3_nir_block_data *bd = get_block_data(ctx, block);
+ struct ir3_array *arr = entry->data;
+
+ if (!bd->arrs[arr->aid]) {
+ struct ir3_array_value *av = ralloc_size(bd, sizeof(*av) +
+ (arr->length * sizeof(av->arr[0])));
+ struct ir3_array_value *defn = NULL;
+ nir_block *pred_block;
+
+ av->length = arr->length;
+ av->aid = arr->aid;
+
+ /* For loops, we have to consider that we have not visited some
+ * of the blocks who should feed into the phi (ie. back-edges in
+ * the cfg).. for example:
+ *
+ * loop {
+ * block { load_var; ... }
+ * if then block {} else block {}
+ * block { store_var; ... }
+ * if then block {} else block {}
+ * block {...}
+ * }
+ *
+ * We can skip the phi if we can chase the block predecessors
+ * until finding the block previously defining the array without
+ * crossing a block that has more than one predecessor.
+ *
+ * Otherwise create phi's and resolve them as a post-pass after
+ * all the blocks have been visited (to handle back-edges).
+ */
+
+ for (pred_block = block->nblock;
+ pred_block && (pred_block->predecessors->entries < 2) && !defn;
+ pred_block = nir_block_pred(pred_block)) {
+ struct ir3_block *pblock = get_block(ctx, pred_block);
+ struct ir3_nir_block_data *pbd = pblock->bd;
+ if (!pbd)
+ continue;
+ defn = pbd->arrs[arr->aid];
+ }
+
+ if (defn) {
+ /* only one possible definer: */
+ for (unsigned i = 0; i < arr->length; i++)
+ av->arr[i] = defn->arr[i];
+ } else if (pred_block) {
+ /* not the first block, and multiple potential definers: */
+ av->phis = ralloc_size(av, arr->length * sizeof(av->phis[0]));
+
+ for (unsigned i = 0; i < arr->length; i++) {
+ struct ir3_instruction *phi;
+
+ phi = ir3_instr_create2(block, -1, OPC_META_PHI,
+ 1 + ctx->impl->num_blocks);
+ ir3_reg_create(phi, 0, 0); /* dst */
+
+ /* phi's should go at head of block: */
+ list_delinit(&phi->node);
+ list_add(&phi->node, &block->instr_list);
+
+ av->phis[i] = av->arr[i] = phi;
+ }
+ } else {
+ /* Some shaders end up reading array elements without
+ * first writing.. so initialize things to prevent null
+ * instr ptrs later:
+ */
+ for (unsigned i = 0; i < arr->length; i++)
+ av->arr[i] = create_immed(block, 0);
+ }
+
+ bd->arrs[arr->aid] = av;
+ }
+
+ return bd->arrs[arr->aid];
+}
+
+static void
+add_array_phi_srcs(struct ir3_compile *ctx, nir_block *nblock,
+ struct ir3_array_value *av, BITSET_WORD *visited)
+{
+ struct ir3_block *block;
+ struct ir3_nir_block_data *bd;
+
+ if (BITSET_TEST(visited, nblock->index))
+ return;
+
+ BITSET_SET(visited, nblock->index);
+
+ block = get_block(ctx, nblock);
+ bd = block->bd;
+
+ if (bd && bd->arrs[av->aid]) {
+ struct ir3_array_value *dav = bd->arrs[av->aid];
+ for (unsigned i = 0; i < av->length; i++) {
+ ir3_reg_create(av->phis[i], 0, IR3_REG_SSA)->instr =
+ dav->arr[i];
+ }
+ } else {
+ /* didn't find defn, recurse predecessors: */
+ struct set_entry *entry;
+ set_foreach(nblock->predecessors, entry) {
+ add_array_phi_srcs(ctx, (nir_block *)entry->key, av, visited);
+ }
+ }
+}
+
+static void
+resolve_array_phis(struct ir3_compile *ctx, struct ir3_block *block)
+{
+ struct ir3_nir_block_data *bd = block->bd;
+ unsigned bitset_words = BITSET_WORDS(ctx->impl->num_blocks);
+
+ if (!bd)
+ return;
+
+ /* TODO use nir dom_frontier to help us with this? */
+
+ for (unsigned i = 1; i <= ctx->num_arrays; i++) {
+ struct ir3_array_value *av = bd->arrs[i];
+ BITSET_WORD visited[bitset_words];
+ struct set_entry *entry;
+
+ if (!(av && av->phis))
+ continue;
+
+ memset(visited, 0, sizeof(visited));
+ set_foreach(block->nblock->predecessors, entry) {
+ add_array_phi_srcs(ctx, (nir_block *)entry->key, av, visited);
+ }
+ }
}
/* allocate a n element value array (to be populated by caller) and
@@ -393,7 +569,8 @@ create_addr(struct ir3_block *block, struct ir3_instruction *src)
instr->regs[1]->flags |= IR3_REG_HALF;
instr = ir3_MOV(block, instr, TYPE_S16);
- instr->regs[0]->flags |= IR3_REG_ADDR | IR3_REG_HALF;
+ instr->regs[0]->num = regid(REG_A0, 0);
+ instr->regs[0]->flags |= IR3_REG_HALF;
instr->regs[1]->flags |= IR3_REG_HALF;
return instr;
@@ -419,6 +596,22 @@ get_addr(struct ir3_compile *ctx, struct ir3_instruction *src)
}
static struct ir3_instruction *
+get_predicate(struct ir3_compile *ctx, struct ir3_instruction *src)
+{
+ struct ir3_block *b = ctx->block;
+ struct ir3_instruction *cond;
+
+ /* NOTE: only cmps.*.* can write p0.x: */
+ cond = ir3_CMPS_S(b, src, 0, create_immed(b, 0), 0);
+ cond->cat2.condition = IR3_COND_NE;
+
+ /* condition always goes in predicate register: */
+ cond->regs[0]->num = regid(REG_P0, 0);
+
+ return cond;
+}
+
+static struct ir3_instruction *
create_uniform(struct ir3_compile *ctx, unsigned n)
{
struct ir3_instruction *mov;
@@ -461,7 +654,7 @@ create_collect(struct ir3_block *block, struct ir3_instruction **arr,
return NULL;
collect = ir3_instr_create2(block, -1, OPC_META_FI, 1 + arrsz);
- ir3_reg_create(collect, 0, 0);
+ ir3_reg_create(collect, 0, 0); /* dst */
for (unsigned i = 0; i < arrsz; i++)
ir3_reg_create(collect, 0, IR3_REG_SSA)->instr = arr[i];
@@ -597,6 +790,7 @@ create_frag_face(struct ir3_compile *ctx, unsigned comp)
compile_assert(ctx, !ctx->frag_face);
ctx->frag_face = create_input(block, NULL, 0);
+ ctx->frag_face->regs[0]->flags |= IR3_REG_HALF;
/* for faceness, we always get -1 or 0 (int).. but TGSI expects
* positive vs negative float.. and piglit further seems to
@@ -628,10 +822,10 @@ create_frag_face(struct ir3_compile *ctx, unsigned comp)
*/
static void
split_dest(struct ir3_block *block, struct ir3_instruction **dst,
- struct ir3_instruction *src)
+ struct ir3_instruction *src, unsigned n)
{
struct ir3_instruction *prev = NULL;
- for (int i = 0, j = 0; i < 4; i++) {
+ for (int i = 0, j = 0; i < n; i++) {
struct ir3_instruction *split =
ir3_instr_create(block, -1, OPC_META_FO);
ir3_reg_create(split, 0, IR3_REG_SSA);
@@ -882,9 +1076,15 @@ emit_alu(struct ir3_compile *ctx, nir_alu_instr *alu)
case nir_op_imax:
dst[0] = ir3_MAX_S(b, src[0], 0, src[1], 0);
break;
+ case nir_op_umax:
+ dst[0] = ir3_MAX_U(b, src[0], 0, src[1], 0);
+ break;
case nir_op_imin:
dst[0] = ir3_MIN_S(b, src[0], 0, src[1], 0);
break;
+ case nir_op_umin:
+ dst[0] = ir3_MIN_U(b, src[0], 0, src[1], 0);
+ break;
case nir_op_imul:
/*
* dst = (al * bl) + (ah * bl << 16) + (al * bh << 16)
@@ -1030,7 +1230,7 @@ emit_intrinisic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
{
nir_deref_var *dvar = intr->variables[0];
nir_deref_array *darr = nir_deref_as_array(dvar->deref.child);
- struct ir3_array *arr = get_var(ctx, dvar->var);
+ struct ir3_array_value *arr = get_var(ctx, dvar->var);
compile_assert(ctx, dvar->deref.child &&
(dvar->deref.child->deref_type == nir_deref_type_array));
@@ -1070,7 +1270,7 @@ emit_intrinisic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
{
nir_deref_var *dvar = intr->variables[0];
nir_deref_array *darr = nir_deref_as_array(dvar->deref.child);
- struct ir3_array *arr = get_var(ctx, dvar->var);
+ struct ir3_array_value *arr = get_var(ctx, dvar->var);
struct ir3_instruction **src;
compile_assert(ctx, dvar->deref.child &&
@@ -1140,8 +1340,8 @@ static void add_sysval_input(struct ir3_compile *ctx, unsigned name,
so->inputs[n].interpolate = TGSI_INTERPOLATE_CONSTANT;
so->total_in++;
- ctx->block->ninputs = MAX2(ctx->block->ninputs, r + 1);
- ctx->block->inputs[r] = instr;
+ ctx->ir->ninputs = MAX2(ctx->ir->ninputs, r + 1);
+ ctx->ir->inputs[r] = instr;
}
static void
@@ -1154,18 +1354,18 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
if (info->has_dest) {
dst = get_dst(ctx, &intr->dest, intr->num_components);
+ } else {
+ dst = NULL;
}
switch (intr->intrinsic) {
case nir_intrinsic_load_uniform:
- compile_assert(ctx, intr->const_index[1] == 1);
for (int i = 0; i < intr->num_components; i++) {
unsigned n = idx * 4 + i;
dst[i] = create_uniform(ctx, n);
}
break;
case nir_intrinsic_load_uniform_indirect:
- compile_assert(ctx, intr->const_index[1] == 1);
src = get_src(ctx, &intr->src[0]);
for (int i = 0; i < intr->num_components; i++) {
unsigned n = idx * 4 + i;
@@ -1178,21 +1378,20 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
emit_intrinsic_load_ubo(ctx, intr, dst);
break;
case nir_intrinsic_load_input:
- compile_assert(ctx, intr->const_index[1] == 1);
for (int i = 0; i < intr->num_components; i++) {
unsigned n = idx * 4 + i;
- dst[i] = b->inputs[n];
+ dst[i] = ctx->ir->inputs[n];
}
break;
case nir_intrinsic_load_input_indirect:
- compile_assert(ctx, intr->const_index[1] == 1);
src = get_src(ctx, &intr->src[0]);
struct ir3_instruction *collect =
- create_collect(b, b->inputs, b->ninputs);
+ create_collect(b, ctx->ir->inputs, ctx->ir->ninputs);
struct ir3_instruction *addr = get_addr(ctx, src[0]);
for (int i = 0; i < intr->num_components; i++) {
unsigned n = idx * 4 + i;
- dst[i] = create_indirect_load(ctx, b->ninputs, n, addr, collect);
+ dst[i] = create_indirect_load(ctx, ctx->ir->ninputs,
+ n, addr, collect);
}
break;
case nir_intrinsic_load_var:
@@ -1202,11 +1401,10 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
emit_intrinisic_store_var(ctx, intr);
break;
case nir_intrinsic_store_output:
- compile_assert(ctx, intr->const_index[1] == 1);
src = get_src(ctx, &intr->src[0]);
for (int i = 0; i < intr->num_components; i++) {
unsigned n = idx * 4 + i;
- b->outputs[n] = src[i];
+ ctx->ir->outputs[n] = src[i];
}
break;
case nir_intrinsic_load_base_vertex:
@@ -1248,6 +1446,7 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
cond = create_immed(b, 1);
}
+ /* NOTE: only cmps.*.* can write p0.x: */
cond = ir3_CMPS_S(b, cond, 0, create_immed(b, 0), 0);
cond->cat2.condition = IR3_COND_NE;
@@ -1255,6 +1454,7 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
cond->regs[0]->num = regid(REG_P0, 0);
kill = ir3_KILL(b, cond, 0);
+ array_insert(ctx->ir->predicates, kill);
ctx->kill[ctx->kill_count++] = kill;
ctx->so->has_kill = true;
@@ -1318,6 +1518,8 @@ tex_info(nir_tex_instr *tex, unsigned *flagsp, unsigned *coordsp)
coords = 3;
flags |= IR3_INSTR_3D;
break;
+ default:
+ unreachable("bad sampler_dim");
}
if (tex->is_shadow)
@@ -1340,7 +1542,10 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
unsigned i, coords, flags;
unsigned nsrc0 = 0, nsrc1 = 0;
type_t type;
- opc_t opc;
+ opc_t opc = 0;
+
+ coord = off = ddx = ddy = NULL;
+ lod = proj = compare = NULL;
/* TODO: might just be one component for gathers? */
dst = get_dst(ctx, &tex->dest, 4);
@@ -1400,11 +1605,12 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
tex_info(tex, &flags, &coords);
/* scale up integer coords for TXF based on the LOD */
- if (opc == OPC_ISAML) {
+ if (ctx->unminify_coords && (opc == OPC_ISAML)) {
assert(has_lod);
for (i = 0; i < coords; i++)
coord[i] = ir3_SHL_B(b, coord[i], 0, lod, 0);
}
+
/*
* lay out the first argument in the proper order:
* - actual coordinates first
@@ -1484,6 +1690,8 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
case nir_type_bool:
type = TYPE_U32;
break;
+ default:
+ unreachable("bad dest_type");
}
sam = ir3_SAM(b, opc, type, TGSI_WRITEMASK_XYZW,
@@ -1491,7 +1699,7 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
create_collect(b, src0, nsrc0),
create_collect(b, src1, nsrc1));
- split_dest(b, dst, sam);
+ split_dest(b, dst, sam, 4);
}
static void
@@ -1508,7 +1716,7 @@ emit_tex_query_levels(struct ir3_compile *ctx, nir_tex_instr *tex)
/* even though there is only one component, since it ends
* up in .z rather than .x, we need a split_dest()
*/
- split_dest(b, dst, sam);
+ split_dest(b, dst, sam, 3);
/* The # of levels comes from getinfo.z. We need to add 1 to it, since
* the value in TEX_CONST_0 is zero-based.
@@ -1536,7 +1744,7 @@ emit_tex_txs(struct ir3_compile *ctx, nir_tex_instr *tex)
sam = ir3_SAM(b, OPC_GETSIZE, TYPE_U32, TGSI_WRITEMASK_XYZW, flags,
tex->sampler_index, tex->sampler_index, lod, NULL);
- split_dest(b, dst, sam);
+ split_dest(b, dst, sam, 4);
/* Array size actually ends up in .w rather than .z. This doesn't
* matter for miplevel 0, but for higher mips the value in z is
@@ -1553,6 +1761,71 @@ emit_tex_txs(struct ir3_compile *ctx, nir_tex_instr *tex)
}
static void
+emit_phi(struct ir3_compile *ctx, nir_phi_instr *nphi)
+{
+ struct ir3_instruction *phi, **dst;
+
+ /* NOTE: phi's should be lowered to scalar at this point */
+ compile_assert(ctx, nphi->dest.ssa.num_components == 1);
+
+ dst = get_dst(ctx, &nphi->dest, 1);
+
+ phi = ir3_instr_create2(ctx->block, -1, OPC_META_PHI,
+ 1 + exec_list_length(&nphi->srcs));
+ ir3_reg_create(phi, 0, 0); /* dst */
+ phi->phi.nphi = nphi;
+
+ dst[0] = phi;
+}
+
+/* phi instructions are left partially constructed. We don't resolve
+ * their srcs until the end of the block, since (eg. loops) one of
+ * the phi's srcs might be defined after the phi due to back edges in
+ * the CFG.
+ */
+static void
+resolve_phis(struct ir3_compile *ctx, struct ir3_block *block)
+{
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+ nir_phi_instr *nphi;
+
+ /* phi's only come at start of block: */
+ if (!(is_meta(instr) && (instr->opc == OPC_META_PHI)))
+ break;
+
+ if (!instr->phi.nphi)
+ break;
+
+ nphi = instr->phi.nphi;
+ instr->phi.nphi = NULL;
+
+ foreach_list_typed(nir_phi_src, nsrc, node, &nphi->srcs) {
+ struct ir3_instruction *src = get_src(ctx, &nsrc->src)[0];
+ ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
+ }
+ }
+
+ resolve_array_phis(ctx, block);
+}
+
+static void
+emit_jump(struct ir3_compile *ctx, nir_jump_instr *jump)
+{
+ switch (jump->type) {
+ case nir_jump_break:
+ case nir_jump_continue:
+ /* I *think* we can simply just ignore this, and use the
+ * successor block link to figure out where we need to
+ * jump to for break/continue
+ */
+ break;
+ default:
+ compile_error(ctx, "Unhandled NIR jump type: %d\n", jump->type);
+ break;
+ }
+}
+
+static void
emit_instr(struct ir3_compile *ctx, nir_instr *instr)
{
switch (instr->type) {
@@ -1585,45 +1858,112 @@ emit_instr(struct ir3_compile *ctx, nir_instr *instr)
}
break;
}
- case nir_instr_type_call:
- case nir_instr_type_jump:
case nir_instr_type_phi:
+ emit_phi(ctx, nir_instr_as_phi(instr));
+ break;
+ case nir_instr_type_jump:
+ emit_jump(ctx, nir_instr_as_jump(instr));
+ break;
+ case nir_instr_type_call:
case nir_instr_type_parallel_copy:
compile_error(ctx, "Unhandled NIR instruction type: %d\n", instr->type);
break;
}
}
+static struct ir3_block *
+get_block(struct ir3_compile *ctx, nir_block *nblock)
+{
+ struct ir3_block *block;
+ struct hash_entry *entry;
+ entry = _mesa_hash_table_search(ctx->block_ht, nblock);
+ if (entry)
+ return entry->data;
+
+ block = ir3_block_create(ctx->ir);
+ block->nblock = nblock;
+ _mesa_hash_table_insert(ctx->block_ht, nblock, block);
+
+ return block;
+}
+
static void
-emit_block(struct ir3_compile *ctx, nir_block *block)
+emit_block(struct ir3_compile *ctx, nir_block *nblock)
{
- nir_foreach_instr(block, instr) {
+ struct ir3_block *block = get_block(ctx, nblock);
+
+ for (int i = 0; i < ARRAY_SIZE(block->successors); i++) {
+ if (nblock->successors[i]) {
+ block->successors[i] =
+ get_block(ctx, nblock->successors[i]);
+ }
+ }
+
+ ctx->block = block;
+ list_addtail(&block->node, &ctx->ir->block_list);
+
+ nir_foreach_instr(nblock, instr) {
emit_instr(ctx, instr);
if (ctx->error)
return;
}
}
+static void emit_cf_list(struct ir3_compile *ctx, struct exec_list *list);
+
static void
-emit_function(struct ir3_compile *ctx, nir_function_impl *impl)
+emit_if(struct ir3_compile *ctx, nir_if *nif)
+{
+ struct ir3_instruction *condition = get_src(ctx, &nif->condition)[0];
+
+ ctx->block->condition =
+ get_predicate(ctx, ir3_b2n(condition->block, condition));
+
+ emit_cf_list(ctx, &nif->then_list);
+ emit_cf_list(ctx, &nif->else_list);
+}
+
+static void
+emit_loop(struct ir3_compile *ctx, nir_loop *nloop)
+{
+ emit_cf_list(ctx, &nloop->body);
+}
+
+static void
+emit_cf_list(struct ir3_compile *ctx, struct exec_list *list)
{
- foreach_list_typed(nir_cf_node, node, node, &impl->body) {
+ foreach_list_typed(nir_cf_node, node, node, list) {
switch (node->type) {
case nir_cf_node_block:
emit_block(ctx, nir_cf_node_as_block(node));
break;
case nir_cf_node_if:
+ emit_if(ctx, nir_cf_node_as_if(node));
+ break;
case nir_cf_node_loop:
+ emit_loop(ctx, nir_cf_node_as_loop(node));
+ break;
case nir_cf_node_function:
compile_error(ctx, "TODO\n");
break;
}
- if (ctx->error)
- return;
}
}
static void
+emit_function(struct ir3_compile *ctx, nir_function_impl *impl)
+{
+ emit_cf_list(ctx, &impl->body);
+ emit_block(ctx, impl->end_block);
+
+ /* at this point, we should have a single empty block,
+ * into which we emit the 'end' instruction.
+ */
+ compile_assert(ctx, list_empty(&ctx->block->instr_list));
+ ir3_END(ctx->block);
+}
+
+static void
setup_input(struct ir3_compile *ctx, nir_variable *in)
{
struct ir3_shader_variant *so = ctx->so;
@@ -1708,7 +2048,7 @@ setup_input(struct ir3_compile *ctx, nir_variable *in)
instr = create_input(ctx->block, NULL, idx);
}
- ctx->block->inputs[idx] = instr;
+ ctx->ir->inputs[idx] = instr;
}
if (so->inputs[n].bary || (ctx->so->type == SHADER_VERTEX)) {
@@ -1775,15 +2115,26 @@ setup_output(struct ir3_compile *ctx, nir_variable *out)
for (int i = 0; i < ncomp; i++) {
unsigned idx = (n * 4) + i;
- ctx->block->outputs[idx] = create_immed(ctx->block, fui(0.0));
+ ctx->ir->outputs[idx] = create_immed(ctx->block, fui(0.0));
}
}
static void
emit_instructions(struct ir3_compile *ctx)
{
- unsigned ninputs = exec_list_length(&ctx->s->inputs) * 4;
- unsigned noutputs = exec_list_length(&ctx->s->outputs) * 4;
+ unsigned ninputs, noutputs;
+ nir_function_impl *fxn = NULL;
+
+ /* Find the main function: */
+ nir_foreach_overload(ctx->s, overload) {
+ compile_assert(ctx, strcmp(overload->function->name, "main") == 0);
+ compile_assert(ctx, overload->impl);
+ fxn = overload->impl;
+ break;
+ }
+
+ ninputs = exec_list_length(&ctx->s->inputs) * 4;
+ noutputs = exec_list_length(&ctx->s->outputs) * 4;
/* we need to allocate big enough outputs array so that
* we can stuff the kill's at the end. Likewise for vtx
@@ -1795,12 +2146,17 @@ emit_instructions(struct ir3_compile *ctx)
ninputs += 8;
}
- ctx->block = ir3_block_create(ctx->ir, 0, ninputs, noutputs);
+ ctx->ir = ir3_create(ctx->compiler, ninputs, noutputs);
+
+ /* Create inputs in first block: */
+ ctx->block = get_block(ctx, fxn->start_block);
+ ctx->in_block = ctx->block;
+ list_addtail(&ctx->block->node, &ctx->ir->block_list);
if (ctx->so->type == SHADER_FRAGMENT) {
- ctx->block->noutputs -= ARRAY_SIZE(ctx->kill);
+ ctx->ir->noutputs -= ARRAY_SIZE(ctx->kill);
} else if (ctx->so->type == SHADER_VERTEX) {
- ctx->block->ninputs -= 8;
+ ctx->ir->ninputs -= 8;
}
/* for fragment shader, we have a single input register (usually
@@ -1831,13 +2187,12 @@ emit_instructions(struct ir3_compile *ctx)
declare_var(ctx, var);
}
- /* Find the main function and emit the body: */
- nir_foreach_overload(ctx->s, overload) {
- compile_assert(ctx, strcmp(overload->function->name, "main") == 0);
- compile_assert(ctx, overload->impl);
- emit_function(ctx, overload->impl);
- if (ctx->error)
- return;
+ /* And emit the body: */
+ ctx->impl = fxn;
+ emit_function(ctx, fxn);
+
+ list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
+ resolve_phis(ctx, block);
}
}
@@ -1850,12 +2205,12 @@ static void
fixup_frag_inputs(struct ir3_compile *ctx)
{
struct ir3_shader_variant *so = ctx->so;
- struct ir3_block *block = ctx->block;
+ struct ir3 *ir = ctx->ir;
struct ir3_instruction **inputs;
struct ir3_instruction *instr;
int n, regid = 0;
- block->ninputs = 0;
+ ir->ninputs = 0;
n = 4; /* always have frag_pos */
n += COND(so->frag_face, 4);
@@ -1867,15 +2222,15 @@ fixup_frag_inputs(struct ir3_compile *ctx)
/* this ultimately gets assigned to hr0.x so doesn't conflict
* with frag_coord/frag_pos..
*/
- inputs[block->ninputs++] = ctx->frag_face;
+ inputs[ir->ninputs++] = ctx->frag_face;
ctx->frag_face->regs[0]->num = 0;
/* remaining channels not used, but let's avoid confusing
* other parts that expect inputs to come in groups of vec4
*/
- inputs[block->ninputs++] = NULL;
- inputs[block->ninputs++] = NULL;
- inputs[block->ninputs++] = NULL;
+ inputs[ir->ninputs++] = NULL;
+ inputs[ir->ninputs++] = NULL;
+ inputs[ir->ninputs++] = NULL;
}
/* since we don't know where to set the regid for frag_coord,
@@ -1889,63 +2244,45 @@ fixup_frag_inputs(struct ir3_compile *ctx)
ctx->frag_coord[2]->regs[0]->num = regid++;
ctx->frag_coord[3]->regs[0]->num = regid++;
- inputs[block->ninputs++] = ctx->frag_coord[0];
- inputs[block->ninputs++] = ctx->frag_coord[1];
- inputs[block->ninputs++] = ctx->frag_coord[2];
- inputs[block->ninputs++] = ctx->frag_coord[3];
+ inputs[ir->ninputs++] = ctx->frag_coord[0];
+ inputs[ir->ninputs++] = ctx->frag_coord[1];
+ inputs[ir->ninputs++] = ctx->frag_coord[2];
+ inputs[ir->ninputs++] = ctx->frag_coord[3];
}
/* we always have frag_pos: */
so->pos_regid = regid;
/* r0.x */
- instr = create_input(block, NULL, block->ninputs);
+ instr = create_input(ctx->in_block, NULL, ir->ninputs);
instr->regs[0]->num = regid++;
- inputs[block->ninputs++] = instr;
+ inputs[ir->ninputs++] = instr;
ctx->frag_pos->regs[1]->instr = instr;
/* r0.y */
- instr = create_input(block, NULL, block->ninputs);
+ instr = create_input(ctx->in_block, NULL, ir->ninputs);
instr->regs[0]->num = regid++;
- inputs[block->ninputs++] = instr;
+ inputs[ir->ninputs++] = instr;
ctx->frag_pos->regs[2]->instr = instr;
- block->inputs = inputs;
-}
-
-static void
-compile_dump(struct ir3_compile *ctx)
-{
- const char *name = (ctx->so->type == SHADER_VERTEX) ? "vert" : "frag";
- static unsigned n = 0;
- char fname[16];
- FILE *f;
- snprintf(fname, sizeof(fname), "%s-%04u.dot", name, n++);
- f = fopen(fname, "w");
- if (!f)
- return;
- ir3_block_depth(ctx->block);
- ir3_dump(ctx->ir, name, ctx->block, f);
- fclose(f);
+ ir->inputs = inputs;
}
int
-ir3_compile_shader_nir(struct ir3_shader_variant *so,
- const struct tgsi_token *tokens, struct ir3_shader_key key)
+ir3_compile_shader_nir(struct ir3_compiler *compiler,
+ struct ir3_shader_variant *so,
+ const struct tgsi_token *tokens,
+ struct ir3_shader_key key)
{
struct ir3_compile *ctx;
- struct ir3_block *block;
+ struct ir3 *ir;
struct ir3_instruction **inputs;
unsigned i, j, actual_in;
int ret = 0, max_bary;
assert(!so->ir);
- so->ir = ir3_create();
-
- assert(so->ir);
-
- ctx = compile_init(so, tokens);
+ ctx = compile_init(compiler, so, tokens);
if (!ctx) {
DBG("INIT failed!");
ret = -1;
@@ -1960,11 +2297,10 @@ ir3_compile_shader_nir(struct ir3_shader_variant *so,
goto out;
}
- block = ctx->block;
- so->ir->block = block;
+ ir = so->ir = ctx->ir;
/* keep track of the inputs from TGSI perspective.. */
- inputs = block->inputs;
+ inputs = ir->inputs;
/* but fixup actual inputs for frag shader: */
if (so->type == SHADER_FRAGMENT)
@@ -1981,26 +2317,39 @@ ir3_compile_shader_nir(struct ir3_shader_variant *so,
(name == TGSI_SEMANTIC_PSIZE))) {
if (i != j) {
so->outputs[j] = so->outputs[i];
- block->outputs[(j*4)+0] = block->outputs[(i*4)+0];
- block->outputs[(j*4)+1] = block->outputs[(i*4)+1];
- block->outputs[(j*4)+2] = block->outputs[(i*4)+2];
- block->outputs[(j*4)+3] = block->outputs[(i*4)+3];
+ ir->outputs[(j*4)+0] = ir->outputs[(i*4)+0];
+ ir->outputs[(j*4)+1] = ir->outputs[(i*4)+1];
+ ir->outputs[(j*4)+2] = ir->outputs[(i*4)+2];
+ ir->outputs[(j*4)+3] = ir->outputs[(i*4)+3];
}
j++;
}
}
so->outputs_count = j;
- block->noutputs = j * 4;
+ ir->noutputs = j * 4;
}
/* if we want half-precision outputs, mark the output registers
* as half:
*/
if (key.half_precision) {
- for (i = 0; i < block->noutputs; i++) {
- if (!block->outputs[i])
+ for (i = 0; i < ir->noutputs; i++) {
+ struct ir3_instruction *out = ir->outputs[i];
+ if (!out)
continue;
- block->outputs[i]->regs[0]->flags |= IR3_REG_HALF;
+ out->regs[0]->flags |= IR3_REG_HALF;
+ /* output could be a fanout (ie. texture fetch output)
+ * in which case we need to propagate the half-reg flag
+ * up to the definer so that RA sees it:
+ */
+ if (is_meta(out) && (out->opc == OPC_META_FO)) {
+ out = out->regs[1]->instr;
+ out->regs[0]->flags |= IR3_REG_HALF;
+ }
+
+ if (out->category == 1) {
+ out->cat1.dst_type = half_type(out->cat1.dst_type);
+ }
}
}
@@ -2010,42 +2359,34 @@ ir3_compile_shader_nir(struct ir3_shader_variant *so,
*/
if (so->type == SHADER_FRAGMENT) {
for (i = 0; i < ctx->kill_count; i++)
- block->outputs[block->noutputs++] = ctx->kill[i];
+ ir->outputs[ir->noutputs++] = ctx->kill[i];
}
- if (fd_mesa_debug & FD_DBG_OPTDUMP)
- compile_dump(ctx);
-
if (fd_mesa_debug & FD_DBG_OPTMSGS) {
printf("BEFORE CP:\n");
- ir3_dump_instr_list(block->head);
+ ir3_print(ir);
}
- ir3_block_depth(block);
-
- ir3_block_cp(block);
+ ir3_cp(ir);
if (fd_mesa_debug & FD_DBG_OPTMSGS) {
printf("BEFORE GROUPING:\n");
- ir3_dump_instr_list(block->head);
+ ir3_print(ir);
}
/* Group left/right neighbors, inserting mov's where needed to
* solve conflicts:
*/
- ir3_block_group(block);
-
- if (fd_mesa_debug & FD_DBG_OPTDUMP)
- compile_dump(ctx);
+ ir3_group(ir);
- ir3_block_depth(block);
+ ir3_depth(ir);
if (fd_mesa_debug & FD_DBG_OPTMSGS) {
printf("AFTER DEPTH:\n");
- ir3_dump_instr_list(block->head);
+ ir3_print(ir);
}
- ret = ir3_block_sched(block);
+ ret = ir3_sched(ir);
if (ret) {
DBG("SCHED failed!");
goto out;
@@ -2053,10 +2394,10 @@ ir3_compile_shader_nir(struct ir3_shader_variant *so,
if (fd_mesa_debug & FD_DBG_OPTMSGS) {
printf("AFTER SCHED:\n");
- ir3_dump_instr_list(block->head);
+ ir3_print(ir);
}
- ret = ir3_block_ra(block, so->type, so->frag_coord, so->frag_face);
+ ret = ir3_ra(ir, so->type, so->frag_coord, so->frag_face);
if (ret) {
DBG("RA failed!");
goto out;
@@ -2064,14 +2405,19 @@ ir3_compile_shader_nir(struct ir3_shader_variant *so,
if (fd_mesa_debug & FD_DBG_OPTMSGS) {
printf("AFTER RA:\n");
- ir3_dump_instr_list(block->head);
+ ir3_print(ir);
}
- ir3_block_legalize(block, &so->has_samp, &max_bary);
+ ir3_legalize(ir, &so->has_samp, &max_bary);
+
+ if (fd_mesa_debug & FD_DBG_OPTMSGS) {
+ printf("AFTER LEGALIZE:\n");
+ ir3_print(ir);
+ }
/* fixup input/outputs: */
for (i = 0; i < so->outputs_count; i++) {
- so->outputs[i].regid = block->outputs[i*4]->regs[0]->num;
+ so->outputs[i].regid = ir->outputs[i*4]->regs[0]->num;
/* preserve hack for depth output.. tgsi writes depth to .z,
* but what we give the hw is the scalar register:
*/
@@ -2111,7 +2457,8 @@ ir3_compile_shader_nir(struct ir3_shader_variant *so,
out:
if (ret) {
- ir3_destroy(so->ir);
+ if (so->ir)
+ ir3_destroy(so->ir);
so->ir = NULL;
}
compile_free(ctx);
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
index fa7d363be7b..8c7c80f7aae 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
@@ -41,7 +41,7 @@ static bool is_eligible_mov(struct ir3_instruction *instr, bool allow_flags)
struct ir3_register *dst = instr->regs[0];
struct ir3_register *src = instr->regs[1];
struct ir3_instruction *src_instr = ssa(src);
- if (dst->flags & (IR3_REG_ADDR | IR3_REG_RELATIV))
+ if (dst->flags & IR3_REG_RELATIV)
return false;
if (src->flags & IR3_REG_RELATIV)
return false;
@@ -54,6 +54,13 @@ static bool is_eligible_mov(struct ir3_instruction *instr, bool allow_flags)
/* TODO: remove this hack: */
if (is_meta(src_instr) && (src_instr->opc == OPC_META_FO))
return false;
+ /* TODO: we currently don't handle left/right neighbors
+ * very well when inserting parallel-copies into phi..
+ * to avoid problems don't eliminate a mov coming out
+ * of phi..
+ */
+ if (is_meta(src_instr) && (src_instr->opc == OPC_META_PHI))
+ return false;
return true;
}
return false;
@@ -354,13 +361,6 @@ instr_cp(struct ir3_instruction *instr, unsigned *flags)
{
struct ir3_register *reg;
- /* stay within the block.. don't try to operate across
- * basic block boundaries or we'll have problems when
- * dealing with multiple basic blocks:
- */
- if (is_meta(instr) && (instr->opc == OPC_META_INPUT))
- return instr;
-
if (is_eligible_mov(instr, !!flags)) {
struct ir3_register *reg = instr->regs[1];
struct ir3_instruction *src_instr = ssa(reg);
@@ -394,22 +394,22 @@ instr_cp(struct ir3_instruction *instr, unsigned *flags)
return instr;
}
-static void block_cp(struct ir3_block *block)
+void
+ir3_cp(struct ir3 *ir)
{
- unsigned i;
+ ir3_clear_mark(ir);
- for (i = 0; i < block->noutputs; i++) {
- if (block->outputs[i]) {
+ for (unsigned i = 0; i < ir->noutputs; i++) {
+ if (ir->outputs[i]) {
struct ir3_instruction *out =
- instr_cp(block->outputs[i], NULL);
+ instr_cp(ir->outputs[i], NULL);
- block->outputs[i] = out;
+ ir->outputs[i] = out;
}
}
-}
-void ir3_block_cp(struct ir3_block *block)
-{
- ir3_clear_mark(block->shader);
- block_cp(block);
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ if (block->condition)
+ block->condition = instr_cp(block->condition, NULL);
+ }
}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_depth.c b/src/gallium/drivers/freedreno/ir3/ir3_depth.c
index b899c66b37e..3a108243479 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_depth.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_depth.c
@@ -84,25 +84,25 @@ int ir3_delayslots(struct ir3_instruction *assigner,
}
}
-static void insert_by_depth(struct ir3_instruction *instr)
+void
+ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list)
{
- struct ir3_block *block = instr->block;
- struct ir3_instruction *n = block->head;
- struct ir3_instruction *p = NULL;
-
- while (n && (n != instr) && (n->depth > instr->depth)) {
- p = n;
- n = n->next;
+ /* remove from existing spot in list: */
+ list_delinit(&instr->node);
+
+ /* find where to re-insert instruction: */
+ list_for_each_entry (struct ir3_instruction, pos, list, node) {
+ if (pos->depth > instr->depth) {
+ list_add(&instr->node, &pos->node);
+ return;
+ }
}
-
- instr->next = n;
- if (p)
- p->next = instr;
- else
- block->head = instr;
+ /* if we get here, we didn't find an insertion spot: */
+ list_addtail(&instr->node, list);
}
-static void ir3_instr_depth(struct ir3_instruction *instr)
+static void
+ir3_instr_depth(struct ir3_instruction *instr)
{
struct ir3_instruction *src;
@@ -123,47 +123,54 @@ static void ir3_instr_depth(struct ir3_instruction *instr)
instr->depth = MAX2(instr->depth, sd);
}
- /* meta-instructions don't add cycles, other than PHI.. which
- * might translate to a real instruction..
- *
- * well, not entirely true, fan-in/out, etc might need to need
- * to generate some extra mov's in edge cases, etc.. probably
- * we might want to do depth calculation considering the worst
- * case for these??
- */
if (!is_meta(instr))
instr->depth++;
- insert_by_depth(instr);
+ ir3_insert_by_depth(instr, &instr->block->instr_list);
+}
+
+static void
+remove_unused_by_block(struct ir3_block *block)
+{
+ list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) {
+ if (!ir3_instr_check_mark(instr)) {
+ if (is_flow(instr) && (instr->opc == OPC_END))
+ continue;
+ /* mark it, in case it is input, so we can
+ * remove unused inputs:
+ */
+ instr->depth = DEPTH_UNUSED;
+ /* and remove from instruction list: */
+ list_delinit(&instr->node);
+ }
+ }
}
-void ir3_block_depth(struct ir3_block *block)
+void
+ir3_depth(struct ir3 *ir)
{
unsigned i;
- block->head = NULL;
+ ir3_clear_mark(ir);
+ for (i = 0; i < ir->noutputs; i++)
+ if (ir->outputs[i])
+ ir3_instr_depth(ir->outputs[i]);
- ir3_clear_mark(block->shader);
- for (i = 0; i < block->noutputs; i++)
- if (block->outputs[i])
- ir3_instr_depth(block->outputs[i]);
+ /* We also need to account for if-condition: */
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ if (block->condition)
+ ir3_instr_depth(block->condition);
+ }
/* mark un-used instructions: */
- for (i = 0; i < block->shader->instrs_count; i++) {
- struct ir3_instruction *instr = block->shader->instrs[i];
-
- /* just consider instructions within this block: */
- if (instr->block != block)
- continue;
-
- if (!ir3_instr_check_mark(instr))
- instr->depth = DEPTH_UNUSED;
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ remove_unused_by_block(block);
}
/* cleanup unused inputs: */
- for (i = 0; i < block->ninputs; i++) {
- struct ir3_instruction *in = block->inputs[i];
+ for (i = 0; i < ir->ninputs; i++) {
+ struct ir3_instruction *in = ir->inputs[i];
if (in && (in->depth == DEPTH_UNUSED))
- block->inputs[i] = NULL;
+ ir->inputs[i] = NULL;
}
}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_dump.c b/src/gallium/drivers/freedreno/ir3/ir3_dump.c
deleted file mode 100644
index 1614d637b13..00000000000
--- a/src/gallium/drivers/freedreno/ir3/ir3_dump.c
+++ /dev/null
@@ -1,456 +0,0 @@
-/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
-
-/*
- * Copyright (C) 2014 Rob Clark <[email protected]>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- * Rob Clark <[email protected]>
- */
-
-#include <stdarg.h>
-
-#include "ir3.h"
-
-#define PTRID(x) ((unsigned long)(x))
-
-struct ir3_dump_ctx {
- FILE *f;
- bool verbose;
-};
-
-static void dump_instr_name(struct ir3_dump_ctx *ctx,
- struct ir3_instruction *instr)
-{
- /* for debugging: */
- if (ctx->verbose) {
-#ifdef DEBUG
- fprintf(ctx->f, "%04u:", instr->serialno);
-#endif
- fprintf(ctx->f, "%03u: ", instr->depth);
- }
-
- if (instr->flags & IR3_INSTR_SY)
- fprintf(ctx->f, "(sy)");
- if (instr->flags & IR3_INSTR_SS)
- fprintf(ctx->f, "(ss)");
-
- if (is_meta(instr)) {
- switch(instr->opc) {
- case OPC_META_PHI:
- fprintf(ctx->f, "&#934;");
- break;
- default:
- /* shouldn't hit here.. just for debugging: */
- switch (instr->opc) {
- case OPC_META_INPUT: fprintf(ctx->f, "_meta:in"); break;
- case OPC_META_OUTPUT: fprintf(ctx->f, "_meta:out"); break;
- case OPC_META_FO: fprintf(ctx->f, "_meta:fo"); break;
- case OPC_META_FI: fprintf(ctx->f, "_meta:fi"); break;
- case OPC_META_FLOW: fprintf(ctx->f, "_meta:flow"); break;
-
- default: fprintf(ctx->f, "_meta:%d", instr->opc); break;
- }
- break;
- }
- } else if (instr->category == 1) {
- static const char *type[] = {
- [TYPE_F16] = "f16",
- [TYPE_F32] = "f32",
- [TYPE_U16] = "u16",
- [TYPE_U32] = "u32",
- [TYPE_S16] = "s16",
- [TYPE_S32] = "s32",
- [TYPE_U8] = "u8",
- [TYPE_S8] = "s8",
- };
- if (instr->cat1.src_type == instr->cat1.dst_type)
- fprintf(ctx->f, "mov");
- else
- fprintf(ctx->f, "cov");
- fprintf(ctx->f, ".%s%s", type[instr->cat1.src_type], type[instr->cat1.dst_type]);
- } else {
- fprintf(ctx->f, "%s", ir3_instr_name(instr));
- if (instr->flags & IR3_INSTR_3D)
- fprintf(ctx->f, ".3d");
- if (instr->flags & IR3_INSTR_A)
- fprintf(ctx->f, ".a");
- if (instr->flags & IR3_INSTR_O)
- fprintf(ctx->f, ".o");
- if (instr->flags & IR3_INSTR_P)
- fprintf(ctx->f, ".p");
- if (instr->flags & IR3_INSTR_S)
- fprintf(ctx->f, ".s");
- if (instr->flags & IR3_INSTR_S2EN)
- fprintf(ctx->f, ".s2en");
- }
-}
-
-static void dump_reg_name(struct ir3_dump_ctx *ctx,
- struct ir3_register *reg, bool followssa)
-{
- if ((reg->flags & (IR3_REG_FABS | IR3_REG_SABS)) &&
- (reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT)))
- fprintf(ctx->f, "(absneg)");
- else if (reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT))
- fprintf(ctx->f, "(neg)");
- else if (reg->flags & (IR3_REG_FABS | IR3_REG_SABS))
- fprintf(ctx->f, "(abs)");
-
- if (reg->flags & IR3_REG_IMMED) {
- fprintf(ctx->f, "imm[%f,%d,0x%x]", reg->fim_val, reg->iim_val, reg->iim_val);
- } else if (reg->flags & IR3_REG_SSA) {
- if (ctx->verbose) {
- fprintf(ctx->f, "_");
- if (followssa) {
- fprintf(ctx->f, "[");
- dump_instr_name(ctx, reg->instr);
- fprintf(ctx->f, "]");
- }
- }
- } else if (reg->flags & IR3_REG_RELATIV) {
- if (reg->flags & IR3_REG_HALF)
- fprintf(ctx->f, "h");
- if (reg->flags & IR3_REG_CONST)
- fprintf(ctx->f, "c<a0.x + %u>", reg->num);
- else
- fprintf(ctx->f, "\x1b[0;31mr<a0.x + %u>\x1b[0m (%u)", reg->num, reg->size);
- } else {
- if (reg->flags & IR3_REG_HALF)
- fprintf(ctx->f, "h");
- if (reg->flags & IR3_REG_CONST)
- fprintf(ctx->f, "c%u.%c", reg_num(reg), "xyzw"[reg_comp(reg)]);
- else
- fprintf(ctx->f, "\x1b[0;31mr%u.%c\x1b[0m", reg_num(reg), "xyzw"[reg_comp(reg)]);
- }
-}
-
-static void ir3_instr_dump(struct ir3_dump_ctx *ctx,
- struct ir3_instruction *instr);
-static void ir3_block_dump(struct ir3_dump_ctx *ctx,
- struct ir3_block *block, const char *name);
-
-static void dump_instr(struct ir3_dump_ctx *ctx,
- struct ir3_instruction *instr)
-{
- /* if we've already visited this instruction, bail now: */
- if (ir3_instr_check_mark(instr))
- return;
-
- /* some meta-instructions need to be handled specially: */
- if (is_meta(instr)) {
- if ((instr->opc == OPC_META_FO) ||
- (instr->opc == OPC_META_FI)) {
- struct ir3_instruction *src;
- foreach_ssa_src(src, instr)
- dump_instr(ctx, src);
- } else if (instr->opc == OPC_META_FLOW) {
- struct ir3_register *reg = instr->regs[1];
- ir3_block_dump(ctx, instr->flow.if_block, "if");
- if (instr->flow.else_block)
- ir3_block_dump(ctx, instr->flow.else_block, "else");
- if (reg->flags & IR3_REG_SSA)
- dump_instr(ctx, reg->instr);
- } else if (instr->opc == OPC_META_PHI) {
- /* treat like a normal instruction: */
- ir3_instr_dump(ctx, instr);
- }
- } else {
- ir3_instr_dump(ctx, instr);
- }
-}
-
-/* arrarraggh! if link is to something outside of the current block, we
- * need to defer emitting the link until the end of the block, since the
- * edge triggers pre-creation of the node it links to inside the cluster,
- * even though it is meant to be outside..
- */
-static struct {
- char buf[40960];
- unsigned n;
-} edge_buf;
-
-/* helper to print or defer: */
-static void printdef(struct ir3_dump_ctx *ctx,
- bool defer, const char *fmt, ...)
-{
- va_list ap;
- va_start(ap, fmt);
- if (defer) {
- unsigned n = edge_buf.n;
- n += vsnprintf(&edge_buf.buf[n], sizeof(edge_buf.buf) - n,
- fmt, ap);
- edge_buf.n = n;
- } else {
- vfprintf(ctx->f, fmt, ap);
- }
- va_end(ap);
-}
-
-static void dump_link2(struct ir3_dump_ctx *ctx,
- struct ir3_instruction *instr, const char *target, bool defer)
-{
- /* some meta-instructions need to be handled specially: */
- if (is_meta(instr)) {
- if (instr->opc == OPC_META_INPUT) {
- printdef(ctx, defer, "input%lx:<in%u>:w -> %s",
- PTRID(instr->inout.block),
- instr->regs[0]->num, target);
- } else if (instr->opc == OPC_META_FO) {
- struct ir3_register *reg = instr->regs[1];
- dump_link2(ctx, reg->instr, target, defer);
- printdef(ctx, defer, "[label=\".%c\"]",
- "xyzw"[instr->fo.off & 0x3]);
- } else if (instr->opc == OPC_META_FI) {
- struct ir3_instruction *src;
-
- foreach_ssa_src_n(src, i, instr) {
- dump_link2(ctx, src, target, defer);
- printdef(ctx, defer, "[label=\".%c\"]",
- "xyzw"[i & 0x3]);
- }
- } else if (instr->opc == OPC_META_OUTPUT) {
- printdef(ctx, defer, "output%lx:<out%u>:w -> %s",
- PTRID(instr->inout.block),
- instr->regs[0]->num, target);
- } else if (instr->opc == OPC_META_PHI) {
- /* treat like a normal instruction: */
- printdef(ctx, defer, "instr%lx:<dst0> -> %s", PTRID(instr), target);
- }
- } else {
- printdef(ctx, defer, "instr%lx:<dst0> -> %s", PTRID(instr), target);
- }
-}
-
-static void dump_link(struct ir3_dump_ctx *ctx,
- struct ir3_instruction *instr,
- struct ir3_block *block, const char *target)
-{
- bool defer = instr->block != block;
- dump_link2(ctx, instr, target, defer);
- printdef(ctx, defer, "\n");
-}
-
-static struct ir3_register *follow_flow(struct ir3_register *reg)
-{
- if (reg->flags & IR3_REG_SSA) {
- struct ir3_instruction *instr = reg->instr;
- /* go with the flow.. */
- if (is_meta(instr) && (instr->opc == OPC_META_FLOW))
- return instr->regs[1];
- }
- return reg;
-}
-
-static void ir3_instr_dump(struct ir3_dump_ctx *ctx,
- struct ir3_instruction *instr)
-{
- struct ir3_register *src;
-
- fprintf(ctx->f, "instr%lx [shape=record,style=filled,fillcolor=lightgrey,label=\"{",
- PTRID(instr));
- dump_instr_name(ctx, instr);
-
- /* destination register: */
- fprintf(ctx->f, "|<dst0>");
-
- /* source register(s): */
- foreach_src_n(src, i, instr) {
- struct ir3_register *reg = follow_flow(src);
-
- fprintf(ctx->f, "|");
-
- if (reg->flags & IR3_REG_SSA)
- fprintf(ctx->f, "<src%u> ", i);
-
- dump_reg_name(ctx, reg, true);
- }
-
- fprintf(ctx->f, "}\"];\n");
-
- /* and recursively dump dependent instructions: */
- foreach_src_n(src, i, instr) {
- struct ir3_register *reg = follow_flow(src);
- char target[32]; /* link target */
-
- if (!(reg->flags & IR3_REG_SSA))
- continue;
-
- snprintf(target, sizeof(target), "instr%lx:<src%u>",
- PTRID(instr), i);
-
- dump_instr(ctx, reg->instr);
- dump_link(ctx, reg->instr, instr->block, target);
- }
-}
-
-static void ir3_block_dump(struct ir3_dump_ctx *ctx,
- struct ir3_block *block, const char *name)
-{
- unsigned i, n;
-
- n = edge_buf.n;
-
- fprintf(ctx->f, "subgraph cluster%lx {\n", PTRID(block));
- fprintf(ctx->f, "label=\"%s\";\n", name);
-
- /* draw inputs: */
- fprintf(ctx->f, "input%lx [shape=record,label=\"inputs", PTRID(block));
- for (i = 0; i < block->ninputs; i++)
- if (block->inputs[i])
- fprintf(ctx->f, "|<in%u> i%u.%c", i, (i >> 2), "xyzw"[i & 0x3]);
- fprintf(ctx->f, "\"];\n");
-
- /* draw instruction graph: */
- for (i = 0; i < block->noutputs; i++)
- if (block->outputs[i])
- dump_instr(ctx, block->outputs[i]);
-
- /* draw outputs: */
- fprintf(ctx->f, "output%lx [shape=record,label=\"outputs", PTRID(block));
- for (i = 0; i < block->noutputs; i++)
- fprintf(ctx->f, "|<out%u> o%u.%c", i, (i >> 2), "xyzw"[i & 0x3]);
- fprintf(ctx->f, "\"];\n");
-
- /* and links to outputs: */
- for (i = 0; i < block->noutputs; i++) {
- char target[32]; /* link target */
-
- /* NOTE: there could be outputs that are never assigned,
- * so skip them
- */
- if (!block->outputs[i])
- continue;
-
- snprintf(target, sizeof(target), "output%lx:<out%u>:e",
- PTRID(block), i);
-
- dump_link(ctx, block->outputs[i], block, target);
- }
-
- fprintf(ctx->f, "}\n");
-
- /* and links to inputs: */
- if (block->parent) {
- for (i = 0; i < block->ninputs; i++) {
- char target[32]; /* link target */
-
- if (!block->inputs[i])
- continue;
-
- dump_instr(ctx, block->inputs[i]);
-
- snprintf(target, sizeof(target), "input%lx:<in%u>:e",
- PTRID(block), i);
-
- dump_link(ctx, block->inputs[i], block, target);
- }
- }
-
- /* dump deferred edges: */
- if (edge_buf.n > n) {
- fprintf(ctx->f, "%*s", edge_buf.n - n, &edge_buf.buf[n]);
- edge_buf.n = n;
- }
-}
-
-void ir3_dump(struct ir3 *shader, const char *name,
- struct ir3_block *block /* XXX maybe 'block' ptr should move to ir3? */,
- FILE *f)
-{
- struct ir3_dump_ctx ctx = {
- .f = f,
- };
- ir3_clear_mark(shader);
- fprintf(ctx.f, "digraph G {\n");
- fprintf(ctx.f, "rankdir=RL;\n");
- fprintf(ctx.f, "nodesep=0.25;\n");
- fprintf(ctx.f, "ranksep=1.5;\n");
- ir3_block_dump(&ctx, block, name);
- fprintf(ctx.f, "}\n");
-}
-
-/*
- * For Debugging:
- */
-
-void
-ir3_dump_instr_single(struct ir3_instruction *instr)
-{
- struct ir3_dump_ctx ctx = {
- .f = stdout,
- .verbose = true,
- };
- unsigned i;
-
- dump_instr_name(&ctx, instr);
- for (i = 0; i < instr->regs_count; i++) {
- struct ir3_register *reg = instr->regs[i];
- printf(i ? ", " : " ");
- dump_reg_name(&ctx, reg, !!i);
- }
-
- if (instr->address) {
- fprintf(ctx.f, ", address=_");
- fprintf(ctx.f, "[");
- dump_instr_name(&ctx, instr->address);
- fprintf(ctx.f, "]");
- }
-
- if (instr->fanin) {
- fprintf(ctx.f, ", fanin=_");
- fprintf(ctx.f, "[");
- dump_instr_name(&ctx, instr->fanin);
- fprintf(ctx.f, "]");
- }
-
- if (is_meta(instr)) {
- if (instr->opc == OPC_META_FO) {
- printf(", off=%d", instr->fo.off);
- } else if ((instr->opc == OPC_META_FI) && instr->fi.aid) {
- printf(", aid=%d", instr->fi.aid);
- }
- }
-
- printf("\n");
-}
-
-void
-ir3_dump_instr_list(struct ir3_instruction *instr)
-{
- struct ir3_block *block = instr->block;
- unsigned n = 0;
-
- while (instr) {
- ir3_dump_instr_single(instr);
- if (!is_meta(instr))
- n++;
- instr = instr->next;
- }
- printf("%u instructions\n", n);
-
- for (n = 0; n < block->noutputs; n++) {
- if (!block->outputs[n])
- continue;
- printf("out%d: ", n);
- ir3_dump_instr_single(block->outputs[n]);
- }
-}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_flatten.c b/src/gallium/drivers/freedreno/ir3/ir3_flatten.c
deleted file mode 100644
index 419cd9dfcd4..00000000000
--- a/src/gallium/drivers/freedreno/ir3/ir3_flatten.c
+++ /dev/null
@@ -1,152 +0,0 @@
-/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
-
-/*
- * Copyright (C) 2014 Rob Clark <[email protected]>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- * Rob Clark <[email protected]>
- */
-
-#include <stdarg.h>
-
-#include "ir3.h"
-
-/*
- * Flatten: flatten out legs of if/else, etc
- *
- * TODO probably should use some heuristic to decide to not flatten
- * if one side of the other is too large / deeply nested / whatever?
- */
-
-struct ir3_flatten_ctx {
- struct ir3_block *block;
- unsigned cnt;
-};
-
-static struct ir3_register *unwrap(struct ir3_register *reg)
-{
-
- if (reg->flags & IR3_REG_SSA) {
- struct ir3_instruction *instr = reg->instr;
- if (is_meta(instr)) {
- switch (instr->opc) {
- case OPC_META_OUTPUT:
- case OPC_META_FLOW:
- if (instr->regs_count > 1)
- return instr->regs[1];
- return NULL;
- default:
- break;
- }
- }
- }
- return reg;
-}
-
-static void ir3_instr_flatten(struct ir3_flatten_ctx *ctx,
- struct ir3_instruction *instr)
-{
- struct ir3_instruction *src;
-
- /* if we've already visited this instruction, bail now: */
- if (ir3_instr_check_mark(instr))
- return;
-
- instr->block = ctx->block;
-
- /* TODO: maybe some threshold to decide whether to
- * flatten or not??
- */
- if (is_meta(instr)) {
- if (instr->opc == OPC_META_PHI) {
- struct ir3_register *cond, *t, *f;
-
- cond = unwrap(instr->regs[1]);
- t = unwrap(instr->regs[2]); /* true val */
- f = unwrap(instr->regs[3]); /* false val */
-
- /* must have cond, but t or f may be null if only written
- * one one side of the if/else (in which case we can just
- * convert the PHI to a simple move).
- */
- assert(cond);
- assert(t || f);
-
- if (t && f) {
- /* convert the PHI instruction to sel.{b16,b32} */
- instr->category = 3;
-
- /* instruction type based on dst size: */
- if (instr->regs[0]->flags & IR3_REG_HALF)
- instr->opc = OPC_SEL_B16;
- else
- instr->opc = OPC_SEL_B32;
-
- instr->regs[1] = t;
- instr->regs[2] = cond;
- instr->regs[3] = f;
- } else {
- /* convert to simple mov: */
- instr->category = 1;
- instr->cat1.dst_type = TYPE_F32;
- instr->cat1.src_type = TYPE_F32;
- instr->regs_count = 2;
- instr->regs[1] = t ? t : f;
- }
-
- ctx->cnt++;
- } else if ((instr->opc == OPC_META_INPUT) &&
- (instr->regs_count == 2)) {
- type_t ftype;
-
- if (instr->regs[0]->flags & IR3_REG_HALF)
- ftype = TYPE_F16;
- else
- ftype = TYPE_F32;
-
- /* convert meta:input to mov: */
- instr->category = 1;
- instr->cat1.src_type = ftype;
- instr->cat1.dst_type = ftype;
- }
- }
-
- /* recursively visit children: */
- foreach_ssa_src(src, instr)
- ir3_instr_flatten(ctx, src);
-}
-
-/* return >= 0 is # of phi's flattened, < 0 is error */
-int ir3_block_flatten(struct ir3_block *block)
-{
- struct ir3_flatten_ctx ctx = {
- .block = block,
- };
- unsigned i;
-
- ir3_clear_mark(block->shader);
- for(i = 0; i < block->noutputs; i++)
- if (block->outputs[i])
- ir3_instr_flatten(&ctx, block->outputs[i]);
-
- return ctx.cnt;
-}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_group.c b/src/gallium/drivers/freedreno/ir3/ir3_group.c
index 782f6e87e56..70d9b08e019 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_group.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_group.c
@@ -34,35 +34,6 @@
* Find/group instruction neighbors:
*/
-/* stop condition for iteration: */
-static bool check_stop(struct ir3_instruction *instr)
-{
- if (ir3_instr_check_mark(instr))
- return true;
-
- /* stay within the block.. don't try to operate across
- * basic block boundaries or we'll have problems when
- * dealing with multiple basic blocks:
- */
- if (is_meta(instr) && (instr->opc == OPC_META_INPUT))
- return true;
-
- return false;
-}
-
-static struct ir3_instruction * create_mov(struct ir3_instruction *instr)
-{
- struct ir3_instruction *mov;
-
- mov = ir3_instr_create(instr->block, 1, 0);
- mov->cat1.src_type = TYPE_F32;
- mov->cat1.dst_type = TYPE_F32;
- ir3_reg_create(mov, 0, 0); /* dst */
- ir3_reg_create(mov, 0, IR3_REG_SSA)->instr = instr;
-
- return mov;
-}
-
/* bleh.. we need to do the same group_n() thing for both inputs/outputs
* (where we have a simple instr[] array), and fanin nodes (where we have
* an extra indirection via reg->instr).
@@ -78,7 +49,8 @@ static struct ir3_instruction *arr_get(void *arr, int idx)
}
static void arr_insert_mov_out(void *arr, int idx, struct ir3_instruction *instr)
{
- ((struct ir3_instruction **)arr)[idx] = create_mov(instr);
+ ((struct ir3_instruction **)arr)[idx] =
+ ir3_MOV(instr->block, instr, TYPE_F32);
}
static void arr_insert_mov_in(void *arr, int idx, struct ir3_instruction *instr)
{
@@ -111,14 +83,17 @@ static struct ir3_instruction *instr_get(void *arr, int idx)
{
return ssa(((struct ir3_instruction *)arr)->regs[idx+1]);
}
-static void instr_insert_mov(void *arr, int idx, struct ir3_instruction *instr)
+static void
+instr_insert_mov(void *arr, int idx, struct ir3_instruction *instr)
{
- ((struct ir3_instruction *)arr)->regs[idx+1]->instr = create_mov(instr);
+ ((struct ir3_instruction *)arr)->regs[idx+1]->instr =
+ ir3_MOV(instr->block, instr, TYPE_F32);
}
static struct group_ops instr_ops = { instr_get, instr_insert_mov };
-static void group_n(struct group_ops *ops, void *arr, unsigned n)
+static void
+group_n(struct group_ops *ops, void *arr, unsigned n)
{
unsigned i, j;
@@ -141,6 +116,10 @@ restart:
conflict = conflicts(instr->cp.left, left) ||
conflicts(instr->cp.right, right);
+ /* RA can't yet deal very well w/ group'd phi's: */
+ if (is_meta(instr) && (instr->opc == OPC_META_PHI))
+ conflict = true;
+
/* we also can't have an instr twice in the group: */
for (j = i + 1; (j < n) && !conflict; j++)
if (ops->get(arr, j) == instr)
@@ -181,11 +160,12 @@ restart:
}
}
-static void instr_find_neighbors(struct ir3_instruction *instr)
+static void
+instr_find_neighbors(struct ir3_instruction *instr)
{
struct ir3_instruction *src;
- if (check_stop(instr))
+ if (ir3_instr_check_mark(instr))
return;
if (is_meta(instr) && (instr->opc == OPC_META_FI))
@@ -200,7 +180,8 @@ static void instr_find_neighbors(struct ir3_instruction *instr)
* we need to insert dummy/padding instruction for grouping, and
* then take it back out again before anyone notices.
*/
-static void pad_and_group_input(struct ir3_instruction **input, unsigned n)
+static void
+pad_and_group_input(struct ir3_instruction **input, unsigned n)
{
int i, mask = 0;
struct ir3_block *block = NULL;
@@ -210,8 +191,8 @@ static void pad_and_group_input(struct ir3_instruction **input, unsigned n)
if (instr) {
block = instr->block;
} else if (block) {
- instr = ir3_instr_create(block, 0, OPC_NOP);
- ir3_reg_create(instr, 0, IR3_REG_SSA); /* dst */
+ instr = ir3_NOP(block);
+ ir3_reg_create(instr, 0, IR3_REG_SSA); /* dummy dst */
input[i] = instr;
mask |= (1 << i);
}
@@ -225,42 +206,41 @@ static void pad_and_group_input(struct ir3_instruction **input, unsigned n)
}
}
-static void block_find_neighbors(struct ir3_block *block)
+static void
+find_neighbors(struct ir3 *ir)
{
unsigned i;
- for (i = 0; i < block->noutputs; i++) {
- if (block->outputs[i]) {
- struct ir3_instruction *instr = block->outputs[i];
- instr_find_neighbors(instr);
- }
- }
-
/* shader inputs/outputs themselves must be contiguous as well:
+ *
+ * NOTE: group inputs first, since we only insert mov's
+ * *before* the conflicted instr (and that would go badly
+ * for inputs). By doing inputs first, we should never
+ * have a conflict on inputs.. pushing any conflict to
+ * resolve to the outputs, for stuff like:
+ *
+ * MOV OUT[n], IN[m].wzyx
+ *
+ * NOTE: we assume here inputs/outputs are grouped in vec4.
+ * This logic won't quite cut it if we don't align smaller
+ * on vec4 boundaries
*/
- if (!block->parent) {
- /* NOTE: group inputs first, since we only insert mov's
- * *before* the conflicted instr (and that would go badly
- * for inputs). By doing inputs first, we should never
- * have a conflict on inputs.. pushing any conflict to
- * resolve to the outputs, for stuff like:
- *
- * MOV OUT[n], IN[m].wzyx
- *
- * NOTE: we assume here inputs/outputs are grouped in vec4.
- * This logic won't quite cut it if we don't align smaller
- * on vec4 boundaries
- */
- for (i = 0; i < block->ninputs; i += 4)
- pad_and_group_input(&block->inputs[i], 4);
- for (i = 0; i < block->noutputs; i += 4)
- group_n(&arr_ops_out, &block->outputs[i], 4);
-
+ for (i = 0; i < ir->ninputs; i += 4)
+ pad_and_group_input(&ir->inputs[i], 4);
+ for (i = 0; i < ir->noutputs; i += 4)
+ group_n(&arr_ops_out, &ir->outputs[i], 4);
+
+ for (i = 0; i < ir->noutputs; i++) {
+ if (ir->outputs[i]) {
+ struct ir3_instruction *instr = ir->outputs[i];
+ instr_find_neighbors(instr);
+ }
}
}
-void ir3_block_group(struct ir3_block *block)
+void
+ir3_group(struct ir3 *ir)
{
- ir3_clear_mark(block->shader);
- block_find_neighbors(block);
+ ir3_clear_mark(ir);
+ find_neighbors(ir);
}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
index 2455f7e4efc..f4a4223ae17 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
@@ -26,7 +26,6 @@
* Rob Clark <[email protected]>
*/
-#include "pipe/p_shader_tokens.h"
#include "util/u_math.h"
#include "freedreno_util.h"
@@ -43,20 +42,31 @@
*/
struct ir3_legalize_ctx {
- struct ir3_block *block;
bool has_samp;
int max_bary;
};
-static void legalize(struct ir3_legalize_ctx *ctx)
+/* We want to evaluate each block from the position of any other
+ * predecessor block, in order that the flags set are the union
+ * of all possible program paths. For stopping condition, we
+ * want to stop when the pair of <pred-block, current-block> has
+ * been visited already.
+ *
+ * XXX is that completely true? We could have different needs_xyz
+ * flags set depending on path leading to pred-block.. we could
+ * do *most* of this based on chasing src instructions ptrs (and
+ * following all phi srcs).. except the write-after-read hazard.
+ *
+ * For now we just set ss/sy flag on first instruction on block,
+ * and handle everything within the block as before.
+ */
+
+static void
+legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
{
- struct ir3_block *block = ctx->block;
- struct ir3_instruction *n;
- struct ir3 *shader = block->shader;
- struct ir3_instruction *end =
- ir3_instr_create(block, 0, OPC_END);
struct ir3_instruction *last_input = NULL;
struct ir3_instruction *last_rel = NULL;
+ struct list_head instr_list;
regmask_t needs_ss_war; /* write after read */
regmask_t needs_ss;
regmask_t needs_sy;
@@ -65,9 +75,13 @@ static void legalize(struct ir3_legalize_ctx *ctx)
regmask_init(&needs_ss);
regmask_init(&needs_sy);
- shader->instrs_count = 0;
+ /* remove all the instructions from the list, we'll be adding
+ * them back in as we go
+ */
+ list_replace(&block->instr_list, &instr_list);
+ list_inithead(&block->instr_list);
- for (n = block->head; n; n = n->next) {
+ list_for_each_entry_safe (struct ir3_instruction, n, &instr_list, node) {
struct ir3_register *reg;
unsigned i;
@@ -134,18 +148,18 @@ static void legalize(struct ir3_legalize_ctx *ctx)
*/
if ((n->flags & IR3_INSTR_SS) && (n->category >= 5)) {
struct ir3_instruction *nop;
- nop = ir3_instr_create(block, 0, OPC_NOP);
+ nop = ir3_NOP(block);
nop->flags |= IR3_INSTR_SS;
n->flags &= ~IR3_INSTR_SS;
}
/* need to be able to set (ss) on first instruction: */
- if ((shader->instrs_count == 0) && (n->category >= 5))
- ir3_instr_create(block, 0, OPC_NOP);
+ if (list_empty(&block->instr_list) && (n->category >= 5))
+ ir3_NOP(block);
- if (is_nop(n) && shader->instrs_count) {
- struct ir3_instruction *last =
- shader->instrs[shader->instrs_count-1];
+ if (is_nop(n) && !list_empty(&block->instr_list)) {
+ struct ir3_instruction *last = list_last_entry(&block->instr_list,
+ struct ir3_instruction, node);
if (is_nop(last) && (last->repeat < 5)) {
last->repeat++;
last->flags |= n->flags;
@@ -153,7 +167,7 @@ static void legalize(struct ir3_legalize_ctx *ctx)
}
}
- shader->instrs[shader->instrs_count++] = n;
+ list_addtail(&n->node, &block->instr_list);
if (is_sfu(n))
regmask_set(&needs_ss, n->regs[0]);
@@ -192,35 +206,20 @@ static void legalize(struct ir3_legalize_ctx *ctx)
* the (ei) flag:
*/
if (is_mem(last_input) && (last_input->opc == OPC_LDLV)) {
- int i, cnt;
-
- /* note that ir3_instr_create() inserts into
- * shader->instrs[] and increments the count..
- * so we need to bump up the cnt initially (to
- * avoid it clobbering the last real instr) and
- * restore it after.
- */
- cnt = ++shader->instrs_count;
+ struct ir3_instruction *baryf;
- /* inserting instructions would be a bit nicer if list.. */
- for (i = cnt - 2; i >= 0; i--) {
- if (shader->instrs[i] == last_input) {
+ /* (ss)bary.f (ei)r63.x, 0, r0.x */
+ baryf = ir3_instr_create(block, 2, OPC_BARY_F);
+ baryf->flags |= IR3_INSTR_SS;
+ ir3_reg_create(baryf, regid(63, 0), 0);
+ ir3_reg_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0;
+ ir3_reg_create(baryf, regid(0, 0), 0);
- /* (ss)bary.f (ei)r63.x, 0, r0.x */
- last_input = ir3_instr_create(block, 2, OPC_BARY_F);
- last_input->flags |= IR3_INSTR_SS;
- ir3_reg_create(last_input, regid(63, 0), 0);
- ir3_reg_create(last_input, 0, IR3_REG_IMMED)->iim_val = 0;
- ir3_reg_create(last_input, regid(0, 0), 0);
+ /* insert the dummy bary.f after last_input: */
+ list_delinit(&baryf->node);
+ list_add(&baryf->node, &last_input->node);
- shader->instrs[i + 1] = last_input;
-
- break;
- }
- shader->instrs[i + 1] = shader->instrs[i];
- }
-
- shader->instrs_count = cnt;
+ last_input = baryf;
}
last_input->regs[0]->flags |= IR3_REG_EI;
}
@@ -228,21 +227,177 @@ static void legalize(struct ir3_legalize_ctx *ctx)
if (last_rel)
last_rel->flags |= IR3_INSTR_UL;
- shader->instrs[shader->instrs_count++] = end;
+ list_first_entry(&block->instr_list, struct ir3_instruction, node)
+ ->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
+}
+
+/* NOTE: branch instructions are always the last instruction(s)
+ * in the block. We take advantage of this as we resolve the
+ * branches, since "if (foo) break;" constructs turn into
+ * something like:
+ *
+ * block3 {
+ * ...
+ * 0029:021: mov.s32s32 r62.x, r1.y
+ * 0082:022: br !p0.x, target=block5
+ * 0083:023: br p0.x, target=block4
+ * // succs: if _[0029:021: mov.s32s32] block4; else block5;
+ * }
+ * block4 {
+ * 0084:024: jump, target=block6
+ * // succs: block6;
+ * }
+ * block5 {
+ * 0085:025: jump, target=block7
+ * // succs: block7;
+ * }
+ *
+ * ie. only instruction in block4/block5 is a jump, so when
+ * resolving branches we can easily detect this by checking
+ * that the first instruction in the target block is itself
+ * a jump, and setup the br directly to the jump's target
+ * (and strip back out the now unreached jump)
+ *
+ * TODO sometimes we end up with things like:
+ *
+ * br !p0.x, #2
+ * br p0.x, #12
+ * add.u r0.y, r0.y, 1
+ *
+ * If we swapped the order of the branches, we could drop one.
+ */
+static struct ir3_block *
+resolve_dest_block(struct ir3_block *block)
+{
+ /* special case for last block: */
+ if (!block->successors[0])
+ return block;
+
+ /* NOTE that we may or may not have inserted the jump
+ * in the target block yet, so conditions to resolve
+ * the dest to the dest block's successor are:
+ *
+ * (1) successor[1] == NULL &&
+ * (2) (block-is-empty || only-instr-is-jump)
+ */
+ if (block->successors[1] == NULL) {
+ if (list_empty(&block->instr_list)) {
+ return block->successors[0];
+ } else if (list_length(&block->instr_list) == 1) {
+ struct ir3_instruction *instr = list_first_entry(
+ &block->instr_list, struct ir3_instruction, node);
+ if (is_flow(instr) && (instr->opc == OPC_JUMP))
+ return block->successors[0];
+ }
+ }
+ return block;
+}
+
+static bool
+resolve_jump(struct ir3_instruction *instr)
+{
+ struct ir3_block *tblock =
+ resolve_dest_block(instr->cat0.target);
+ struct ir3_instruction *target;
+
+ if (tblock != instr->cat0.target) {
+ list_delinit(&instr->cat0.target->node);
+ instr->cat0.target = tblock;
+ return true;
+ }
+
+ target = list_first_entry(&tblock->instr_list,
+ struct ir3_instruction, node);
+
+ if ((!target) || (target->ip == (instr->ip + 1))) {
+ list_delinit(&instr->node);
+ return true;
+ } else {
+ instr->cat0.immed =
+ (int)target->ip - (int)instr->ip;
+ }
+ return false;
+}
+
+/* resolve jumps, removing jumps/branches to immediately following
+ * instruction which we end up with from earlier stages. Since
+ * removing an instruction can invalidate earlier instruction's
+ * branch offsets, we need to do this iteratively until no more
+ * branches are removed.
+ */
+static bool
+resolve_jumps(struct ir3 *ir)
+{
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node)
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node)
+ if (is_flow(instr) && instr->cat0.target)
+ if (resolve_jump(instr))
+ return true;
+
+ return false;
+}
- shader->instrs[0]->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
+/* we want to mark points where divergent flow control re-converges
+ * with (jp) flags. For now, since we don't do any optimization for
+ * things that start out as a 'do {} while()', re-convergence points
+ * will always be a branch or jump target. Note that this is overly
+ * conservative, since unconditional jump targets are not convergence
+ * points, we are just assuming that the other path to reach the jump
+ * target was divergent. If we were clever enough to optimize the
+ * jump at end of a loop back to a conditional branch into a single
+ * conditional branch, ie. like:
+ *
+ * add.f r1.w, r0.x, (neg)(r)c2.x <= loop start
+ * mul.f r1.z, r1.z, r0.x
+ * mul.f r1.y, r1.y, r0.x
+ * mul.f r0.z, r1.x, r0.x
+ * mul.f r0.w, r0.y, r0.x
+ * cmps.f.ge r0.x, (r)c2.y, (r)r1.w
+ * add.s r0.x, (r)r0.x, (r)-1
+ * sel.f32 r0.x, (r)c3.y, (r)r0.x, c3.x
+ * cmps.f.eq p0.x, r0.x, c3.y
+ * mov.f32f32 r0.x, r1.w
+ * mov.f32f32 r0.y, r0.w
+ * mov.f32f32 r1.x, r0.z
+ * (rpt2)nop
+ * br !p0.x, #-13
+ * (jp)mul.f r0.x, c263.y, r1.y
+ *
+ * Then we'd have to be more clever, as the convergence point is no
+ * longer a branch or jump target.
+ */
+static void
+mark_convergence_points(struct ir3 *ir)
+{
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+ if (is_flow(instr) && instr->cat0.target) {
+ struct ir3_instruction *target =
+ list_first_entry(&instr->cat0.target->instr_list,
+ struct ir3_instruction, node);
+ target->flags |= IR3_INSTR_JP;
+ }
+ }
+ }
}
-void ir3_block_legalize(struct ir3_block *block,
- bool *has_samp, int *max_bary)
+void
+ir3_legalize(struct ir3 *ir, bool *has_samp, int *max_bary)
{
struct ir3_legalize_ctx ctx = {
- .block = block,
.max_bary = -1,
};
- legalize(&ctx);
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ legalize_block(&ctx, block);
+ }
*has_samp = ctx.has_samp;
*max_bary = ctx.max_bary;
+
+ do {
+ ir3_count_instructions(ir);
+ } while(resolve_jumps(ir));
+
+ mark_convergence_points(ir);
}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c b/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c
index ae36019ed5f..dc9e4626f27 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c
@@ -74,14 +74,13 @@ valid_dest(nir_block *block, nir_dest *dest)
* (so this is run iteratively in a loop). Therefore if
* we get this far, it should not have any if_uses:
*/
- assert(dest->ssa.if_uses->entries == 0);
+ assert(list_empty(&dest->ssa.if_uses));
/* The only uses of this definition must be phi's in the
* successor or in the current block
*/
- struct set_entry *entry;
- set_foreach(dest->ssa.uses, entry) {
- const nir_instr *dest_instr = entry->key;
+ nir_foreach_use(&dest->ssa, use) {
+ nir_instr *dest_instr = use->parent_instr;
if (dest_instr->block == block)
continue;
if ((dest_instr->type == nir_instr_type_phi) &&
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_print.c b/src/gallium/drivers/freedreno/ir3/ir3_print.c
new file mode 100644
index 00000000000..f377982dd5e
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/ir3_print.c
@@ -0,0 +1,237 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Rob Clark <[email protected]>
+ */
+
+#include <stdarg.h>
+#include <stdio.h>
+
+#include "ir3.h"
+
+#define PTRID(x) ((unsigned long)(x))
+
+static void print_instr_name(struct ir3_instruction *instr)
+{
+#ifdef DEBUG
+ printf("%04u:", instr->serialno);
+#endif
+ printf("%03u: ", instr->depth);
+
+ if (instr->flags & IR3_INSTR_SY)
+ printf("(sy)");
+ if (instr->flags & IR3_INSTR_SS)
+ printf("(ss)");
+
+ if (is_meta(instr)) {
+ switch(instr->opc) {
+ case OPC_META_PHI:
+ printf("&#934;");
+ break;
+ default:
+ /* shouldn't hit here.. just for debugging: */
+ switch (instr->opc) {
+ case OPC_META_INPUT: printf("_meta:in"); break;
+ case OPC_META_FO: printf("_meta:fo"); break;
+ case OPC_META_FI: printf("_meta:fi"); break;
+
+ default: printf("_meta:%d", instr->opc); break;
+ }
+ break;
+ }
+ } else if (instr->category == 1) {
+ static const char *type[] = {
+ [TYPE_F16] = "f16",
+ [TYPE_F32] = "f32",
+ [TYPE_U16] = "u16",
+ [TYPE_U32] = "u32",
+ [TYPE_S16] = "s16",
+ [TYPE_S32] = "s32",
+ [TYPE_U8] = "u8",
+ [TYPE_S8] = "s8",
+ };
+ if (instr->cat1.src_type == instr->cat1.dst_type)
+ printf("mov");
+ else
+ printf("cov");
+ printf(".%s%s", type[instr->cat1.src_type], type[instr->cat1.dst_type]);
+ } else {
+ printf("%s", ir3_instr_name(instr));
+ if (instr->flags & IR3_INSTR_3D)
+ printf(".3d");
+ if (instr->flags & IR3_INSTR_A)
+ printf(".a");
+ if (instr->flags & IR3_INSTR_O)
+ printf(".o");
+ if (instr->flags & IR3_INSTR_P)
+ printf(".p");
+ if (instr->flags & IR3_INSTR_S)
+ printf(".s");
+ if (instr->flags & IR3_INSTR_S2EN)
+ printf(".s2en");
+ }
+}
+
+static void print_reg_name(struct ir3_register *reg, bool followssa)
+{
+ if ((reg->flags & (IR3_REG_FABS | IR3_REG_SABS)) &&
+ (reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT)))
+ printf("(absneg)");
+ else if (reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT))
+ printf("(neg)");
+ else if (reg->flags & (IR3_REG_FABS | IR3_REG_SABS))
+ printf("(abs)");
+
+ if (reg->flags & IR3_REG_IMMED) {
+ printf("imm[%f,%d,0x%x]", reg->fim_val, reg->iim_val, reg->iim_val);
+ } else if (reg->flags & IR3_REG_SSA) {
+ printf("_");
+ if (followssa) {
+ printf("[");
+ print_instr_name(reg->instr);
+ printf("]");
+ }
+ } else if (reg->flags & IR3_REG_RELATIV) {
+ if (reg->flags & IR3_REG_HALF)
+ printf("h");
+ if (reg->flags & IR3_REG_CONST)
+ printf("c<a0.x + %u>", reg->num);
+ else
+ printf("\x1b[0;31mr<a0.x + %u>\x1b[0m (%u)", reg->num, reg->size);
+ } else {
+ if (reg->flags & IR3_REG_HALF)
+ printf("h");
+ if (reg->flags & IR3_REG_CONST)
+ printf("c%u.%c", reg_num(reg), "xyzw"[reg_comp(reg)]);
+ else
+ printf("\x1b[0;31mr%u.%c\x1b[0m", reg_num(reg), "xyzw"[reg_comp(reg)]);
+ }
+}
+
+static void
+tab(int lvl)
+{
+ for (int i = 0; i < lvl; i++)
+ printf("\t");
+}
+
+static uint32_t
+block_id(struct ir3_block *block)
+{
+#ifdef DEBUG
+ return block->serialno;
+#else
+ return (uint32_t)(uint64_t)block;
+#endif
+}
+
+static void
+print_instr(struct ir3_instruction *instr, int lvl)
+{
+ unsigned i;
+
+ tab(lvl);
+
+ print_instr_name(instr);
+ for (i = 0; i < instr->regs_count; i++) {
+ struct ir3_register *reg = instr->regs[i];
+ printf(i ? ", " : " ");
+ print_reg_name(reg, !!i);
+ }
+
+ if (instr->address) {
+ printf(", address=_");
+ printf("[");
+ print_instr_name(instr->address);
+ printf("]");
+ }
+
+ if (instr->fanin) {
+ printf(", fanin=_");
+ printf("[");
+ print_instr_name(instr->fanin);
+ printf("]");
+ }
+
+ if (is_meta(instr)) {
+ if (instr->opc == OPC_META_FO) {
+ printf(", off=%d", instr->fo.off);
+ } else if ((instr->opc == OPC_META_FI) && instr->fi.aid) {
+ printf(", aid=%d", instr->fi.aid);
+ }
+ }
+
+ if (is_flow(instr) && instr->cat0.target) {
+ /* the predicate register src is implied: */
+ if (instr->opc == OPC_BR) {
+ printf(" %sp0.x", instr->cat0.inv ? "!" : "");
+ }
+ printf(", target=block%u", block_id(instr->cat0.target));
+ }
+
+ printf("\n");
+}
+
+void ir3_print_instr(struct ir3_instruction *instr)
+{
+ print_instr(instr, 0);
+}
+
+static void
+print_block(struct ir3_block *block, int lvl)
+{
+ tab(lvl); printf("block%u {\n", block_id(block));
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+ print_instr(instr, lvl+1);
+ }
+ if (block->successors[1]) {
+ /* leading into if/else: */
+ tab(lvl+1);
+ printf("/* succs: if _[");
+ print_instr_name(block->condition);
+ printf("] block%u; else block%u; */\n",
+ block_id(block->successors[0]),
+ block_id(block->successors[1]));
+ } else if (block->successors[0]) {
+ tab(lvl+1);
+ printf("/* succs: block%u; */\n",
+ block_id(block->successors[0]));
+ }
+ tab(lvl); printf("}\n");
+}
+
+void
+ir3_print(struct ir3 *ir)
+{
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node)
+ print_block(block, 0);
+
+ for (unsigned i = 0; i < ir->noutputs; i++) {
+ if (!ir->outputs[i])
+ continue;
+ printf("out%d: ", i);
+ print_instr(ir->outputs[i], 0);
+ }
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
index a4235a77a15..e5aba859fab 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
@@ -26,284 +26,702 @@
* Rob Clark <[email protected]>
*/
-#include "pipe/p_shader_tokens.h"
#include "util/u_math.h"
+#include "util/register_allocate.h"
+#include "util/ralloc.h"
+#include "util/bitset.h"
#include "ir3.h"
+#include "ir3_compiler.h"
/*
* Register Assignment:
*
- * NOTE: currently only works on a single basic block.. need to think
- * about how multiple basic blocks are going to get scheduled. But
- * I think I want to re-arrange how blocks work, ie. get rid of the
- * block nesting thing..
+ * Uses the register_allocate util, which implements graph coloring
+ * algo with interference classes. To handle the cases where we need
+ * consecutive registers (for example, texture sample instructions),
+ * we model these as larger (double/quad/etc) registers which conflict
+ * with the corresponding registers in other classes.
*
- * NOTE: we could do register coalescing (eliminate moves) as part of
- * the RA step.. OTOH I think we need to do scheduling before register
- * assignment. And if we remove a mov that effects scheduling (unless
- * we leave a placeholder nop, which seems lame), so I'm not really
- * sure how practical this is to do both in a single stage. But OTOH
- * I'm not really sure a sane way for the CP stage to realize when it
- * cannot remove a mov due to multi-register constraints..
+ * Additionally we create additional classes for half-regs, which
+ * do not conflict with the full-reg classes. We do need at least
+ * sizes 1-4 (to deal w/ texture sample instructions output to half-
+ * reg). At the moment we don't create the higher order half-reg
+ * classes as half-reg frequently does not have enough precision
+ * for texture coords at higher resolutions.
*
- * NOTE: http://scopesconf.org/scopes-01/paper/session1_2.ps.gz has
- * some ideas to handle array allocation with a more conventional
- * graph coloring algorithm for register assignment, which might be
- * a good alternative to the current algo. However afaict it cannot
- * handle overlapping arrays, which is a scenario that we have to
- * deal with
+ * There are some additional cases that we need to handle specially,
+ * as the graph coloring algo doesn't understand "partial writes".
+ * For example, a sequence like:
+ *
+ * add r0.z, ...
+ * sam (f32)(xy)r0.x, ...
+ * ...
+ * sam (f32)(xyzw)r0.w, r0.x, ... ; 3d texture, so r0.xyz are coord
+ *
+ * In this scenario, we treat r0.xyz as class size 3, which is written
+ * (from a use/def perspective) at the 'add' instruction and ignore the
+ * subsequent partial writes to r0.xy. So the 'add r0.z, ...' is the
+ * defining instruction, as it is the first to partially write r0.xyz.
+ *
+ * Note i965 has a similar scenario, which they solve with a virtual
+ * LOAD_PAYLOAD instruction which gets turned into multiple MOV's after
+ * register assignment. But for us that is horrible from a scheduling
+ * standpoint. Instead what we do is use idea of 'definer' instruction.
+ * Ie. the first instruction (lowest ip) to write to the array is the
+ * one we consider from use/def perspective when building interference
+ * graph. (Other instructions which write other array elements just
+ * define the variable some more.)
+ */
+
+static const unsigned class_sizes[] = {
+ 1, 2, 3, 4,
+ 4 + 4, /* txd + 1d/2d */
+ 4 + 6, /* txd + 3d */
+ /* temporary: until we can assign arrays, create classes so we
+ * can round up array to fit. NOTE with tgsi arrays should
+ * really all be multiples of four:
+ */
+ 4 * 4,
+ 4 * 8,
+ 4 * 16,
+ 4 * 32,
+
+};
+#define class_count ARRAY_SIZE(class_sizes)
+
+static const unsigned half_class_sizes[] = {
+ 1, 2, 3, 4,
+};
+#define half_class_count ARRAY_SIZE(half_class_sizes)
+#define total_class_count (class_count + half_class_count)
+
+/* Below a0.x are normal regs. RA doesn't need to assign a0.x/p0.x. */
+#define NUM_REGS (4 * (REG_A0 - 1))
+/* Number of virtual regs in a given class: */
+#define CLASS_REGS(i) (NUM_REGS - (class_sizes[i] - 1))
+#define HALF_CLASS_REGS(i) (NUM_REGS - (half_class_sizes[i] - 1))
+
+/* register-set, created one time, used for all shaders: */
+struct ir3_ra_reg_set {
+ struct ra_regs *regs;
+ unsigned int classes[class_count];
+ unsigned int half_classes[half_class_count];
+ /* maps flat virtual register space to base gpr: */
+ uint16_t *ra_reg_to_gpr;
+ /* maps cls,gpr to flat virtual register space: */
+ uint16_t **gpr_to_ra_reg;
+};
+
+/* One-time setup of RA register-set, which describes all the possible
+ * "virtual" registers and their interferences. Ie. double register
+ * occupies (and conflicts with) two single registers, and so forth.
+ * Since registers do not need to be aligned to their class size, they
+ * can conflict with other registers in the same class too. Ie:
+ *
+ * Single (base) | Double
+ * --------------+---------------
+ * R0 | D0
+ * R1 | D0 D1
+ * R2 | D1 D2
+ * R3 | D2
+ * .. and so on..
+ *
+ * (NOTE the disassembler uses notation like r0.x/y/z/w but those are
+ * really just four scalar registers. Don't let that confuse you.)
*/
+struct ir3_ra_reg_set *
+ir3_ra_alloc_reg_set(void *memctx)
+{
+ struct ir3_ra_reg_set *set = rzalloc(memctx, struct ir3_ra_reg_set);
+ unsigned ra_reg_count, reg, first_half_reg;
+ unsigned int **q_values;
+
+ /* calculate # of regs across all classes: */
+ ra_reg_count = 0;
+ for (unsigned i = 0; i < class_count; i++)
+ ra_reg_count += CLASS_REGS(i);
+ for (unsigned i = 0; i < half_class_count; i++)
+ ra_reg_count += HALF_CLASS_REGS(i);
+
+ /* allocate and populate q_values: */
+ q_values = ralloc_array(set, unsigned *, total_class_count);
+ for (unsigned i = 0; i < class_count; i++) {
+ q_values[i] = rzalloc_array(q_values, unsigned, total_class_count);
+
+ /* From register_allocate.c:
+ *
+ * q(B,C) (indexed by C, B is this register class) in
+ * Runeson/Nyström paper. This is "how many registers of B could
+ * the worst choice register from C conflict with".
+ *
+ * If we just let the register allocation algorithm compute these
+ * values, it is extremely expensive. However, since all of our
+ * registers are laid out, we can very easily compute them
+ * ourselves. View the register from C as fixed starting at GRF n
+ * somewhere in the middle, and the register from B as sliding back
+ * and forth. Then the first register to conflict from B is the
+ * one starting at n - class_size[B] + 1 and the last register to
+ * conflict will start at n + class_size[B] - 1. Therefore, the
+ * number of conflicts from B is class_size[B] + class_size[C] - 1.
+ *
+ * +-+-+-+-+-+-+ +-+-+-+-+-+-+
+ * B | | | | | |n| --> | | | | | | |
+ * +-+-+-+-+-+-+ +-+-+-+-+-+-+
+ * +-+-+-+-+-+
+ * C |n| | | | |
+ * +-+-+-+-+-+
+ *
+ * (Idea copied from brw_fs_reg_allocate.cpp)
+ */
+ for (unsigned j = 0; j < class_count; j++)
+ q_values[i][j] = class_sizes[i] + class_sizes[j] - 1;
+ }
+
+ for (unsigned i = class_count; i < total_class_count; i++) {
+ q_values[i] = ralloc_array(q_values, unsigned, total_class_count);
+
+ /* see comment above: */
+ for (unsigned j = class_count; j < total_class_count; j++) {
+ q_values[i][j] = half_class_sizes[i - class_count] +
+ half_class_sizes[j - class_count] - 1;
+ }
+ }
+ /* allocate the reg-set.. */
+ set->regs = ra_alloc_reg_set(set, ra_reg_count);
+ set->ra_reg_to_gpr = ralloc_array(set, uint16_t, ra_reg_count);
+ set->gpr_to_ra_reg = ralloc_array(set, uint16_t *, total_class_count);
+
+ /* .. and classes */
+ reg = 0;
+ for (unsigned i = 0; i < class_count; i++) {
+ set->classes[i] = ra_alloc_reg_class(set->regs);
+
+ set->gpr_to_ra_reg[i] = ralloc_array(set, uint16_t, CLASS_REGS(i));
+
+ for (unsigned j = 0; j < CLASS_REGS(i); j++) {
+ ra_class_add_reg(set->regs, set->classes[i], reg);
+
+ set->ra_reg_to_gpr[reg] = j;
+ set->gpr_to_ra_reg[i][j] = reg;
+
+ for (unsigned br = j; br < j + class_sizes[i]; br++)
+ ra_add_transitive_reg_conflict(set->regs, br, reg);
+
+ reg++;
+ }
+ }
+
+ first_half_reg = reg;
+
+ for (unsigned i = 0; i < half_class_count; i++) {
+ set->half_classes[i] = ra_alloc_reg_class(set->regs);
+
+ set->gpr_to_ra_reg[class_count + i] =
+ ralloc_array(set, uint16_t, CLASS_REGS(i));
+
+ for (unsigned j = 0; j < HALF_CLASS_REGS(i); j++) {
+ ra_class_add_reg(set->regs, set->half_classes[i], reg);
+
+ set->ra_reg_to_gpr[reg] = j;
+ set->gpr_to_ra_reg[class_count + i][j] = reg;
+
+ for (unsigned br = j; br < j + half_class_sizes[i]; br++)
+ ra_add_transitive_reg_conflict(set->regs, br + first_half_reg, reg);
+
+ reg++;
+ }
+ }
+
+ ra_set_finalize(set->regs, q_values);
+
+ ralloc_free(q_values);
+
+ return set;
+}
+
+/* register-assign context, per-shader */
struct ir3_ra_ctx {
- struct ir3_block *block;
+ struct ir3 *ir;
enum shader_t type;
- bool frag_coord;
bool frag_face;
- int cnt;
- bool error;
- struct {
- unsigned base;
- unsigned size;
- } arrays[MAX_ARRAYS];
+
+ struct ir3_ra_reg_set *set;
+ struct ra_graph *g;
+ unsigned alloc_count;
+ unsigned class_alloc_count[total_class_count];
+ unsigned class_base[total_class_count];
+ unsigned instr_cnt;
+ unsigned *def, *use; /* def/use table */
};
-#ifdef DEBUG
-# include "freedreno_util.h"
-# define ra_debug (fd_mesa_debug & FD_DBG_OPTMSGS)
-#else
-# define ra_debug 0
-#endif
-
-#define ra_dump_list(msg, n) do { \
- if (ra_debug) { \
- debug_printf("-- " msg); \
- ir3_dump_instr_list(n); \
- } \
- } while (0)
-
-#define ra_dump_instr(msg, n) do { \
- if (ra_debug) { \
- debug_printf(">> " msg); \
- ir3_dump_instr_single(n); \
- } \
- } while (0)
-
-#define ra_assert(ctx, x) do { \
- debug_assert(x); \
- if (!(x)) { \
- debug_printf("RA: failed assert: %s\n", #x); \
- (ctx)->error = true; \
- }; \
- } while (0)
-
-
-/* sorta ugly way to retrofit half-precision support.. rather than
- * passing extra param around, just OR in a high bit. All the low
- * value arithmetic (ie. +/- offset within a contiguous vec4, etc)
- * will continue to work as long as you don't underflow (and that
- * would go badly anyways).
- */
-#define REG_HALF 0x8000
+/* additional block-data (per-block) */
+struct ir3_ra_block_data {
+ BITSET_WORD *def; /* variables defined before used in block */
+ BITSET_WORD *use; /* variables used before defined in block */
+ BITSET_WORD *livein; /* which defs reach entry point of block */
+ BITSET_WORD *liveout; /* which defs reach exit point of block */
+};
+
+static bool
+is_half(struct ir3_instruction *instr)
+{
+ return !!(instr->regs[0]->flags & IR3_REG_HALF);
+}
-#define REG(n, wm, f) (struct ir3_register){ \
- .flags = (f), \
- .num = (n), \
- .wrmask = TGSI_WRITEMASK_ ## wm, \
+static int
+size_to_class(unsigned sz, bool half)
+{
+ if (half) {
+ for (unsigned i = 0; i < half_class_count; i++)
+ if (half_class_sizes[i] >= sz)
+ return i + class_count;
+ } else {
+ for (unsigned i = 0; i < class_count; i++)
+ if (class_sizes[i] >= sz)
+ return i;
}
+ debug_assert(0);
+ return -1;
+}
-/* check that the register exists, is a GPR and is not special (a0/p0) */
-static struct ir3_register * reg_check(struct ir3_instruction *instr, unsigned n)
+static bool
+is_temp(struct ir3_register *reg)
{
- if ((n < instr->regs_count) && reg_gpr(instr->regs[n]) &&
- !(instr->regs[n]->flags & IR3_REG_SSA))
- return instr->regs[n];
- return NULL;
+ if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
+ return false;
+ if (reg->flags & IR3_REG_RELATIV) // TODO
+ return false;
+ if ((reg->num == regid(REG_A0, 0)) ||
+ (reg->num == regid(REG_P0, 0)))
+ return false;
+ return true;
}
-/* figure out if an unassigned src register points back to the instr we
- * are assigning:
- */
-static bool instr_used_by(struct ir3_instruction *instr,
- struct ir3_register *src)
+static bool
+writes_gpr(struct ir3_instruction *instr)
{
- struct ir3_instruction *src_instr = ssa(src);
- unsigned i;
- if (instr == src_instr)
- return true;
- if (src_instr && is_meta(src_instr))
- for (i = 1; i < src_instr->regs_count; i++)
- if (instr_used_by(instr, src_instr->regs[i]))
- return true;
-
- return false;
+ if (is_store(instr))
+ return false;
+ /* is dest a normal temp register: */
+ return is_temp(instr->regs[0]);
}
-static bool instr_is_output(struct ir3_instruction *instr)
+static struct ir3_instruction *
+get_definer(struct ir3_instruction *instr, int *sz, int *off)
{
- struct ir3_block *block = instr->block;
- unsigned i;
+ struct ir3_instruction *d = NULL;
+ if (is_meta(instr) && (instr->opc == OPC_META_FI)) {
+ /* What about the case where collect is subset of array, we
+ * need to find the distance between where actual array starts
+ * and fanin.. that probably doesn't happen currently.
+ */
+ struct ir3_register *src;
- for (i = 0; i < block->noutputs; i++)
- if (instr == block->outputs[i])
- return true;
+ /* note: don't use foreach_ssa_src as this gets called once
+ * while assigning regs (which clears SSA flag)
+ */
+ foreach_src(src, instr) {
+ if (!src->instr)
+ continue;
+ if ((!d) || (src->instr->ip < d->ip))
+ d = src->instr;
+ }
- return false;
-}
+ *sz = instr->regs_count - 1;
+ *off = 0;
-static void mark_sources(struct ir3_instruction *instr,
- struct ir3_instruction *n, regmask_t *liveregs, regmask_t *written)
-{
- unsigned i;
+ } else if (instr->cp.right || instr->cp.left) {
+ /* covers also the meta:fo case, which ends up w/ single
+ * scalar instructions for each component:
+ */
+ struct ir3_instruction *f = ir3_neighbor_first(instr);
+
+ /* by definition, the entire sequence forms one linked list
+ * of single scalar register nodes (even if some of them may
+ * be fanouts from a texture sample (for example) instr. We
+ * just need to walk the list finding the first element of
+ * the group defined (lowest ip)
+ */
+ int cnt = 0;
+
+ d = f;
+ while (f) {
+ if (f->ip < d->ip)
+ d = f;
+ if (f == instr)
+ *off = cnt;
+ f = f->cp.right;
+ cnt++;
+ }
+
+ *sz = cnt;
+
+ } else {
+ /* second case is looking directly at the instruction which
+ * produces multiple values (eg, texture sample), rather
+ * than the fanout nodes that point back to that instruction.
+ * This isn't quite right, because it may be part of a larger
+ * group, such as:
+ *
+ * sam (f32)(xyzw)r0.x, ...
+ * add r1.x, ...
+ * add r1.y, ...
+ * sam (f32)(xyzw)r2.x, r0.w <-- (r0.w, r1.x, r1.y)
+ *
+ * need to come up with a better way to handle that case.
+ */
+ if (instr->address) {
+ *sz = instr->regs[0]->size;
+ } else {
+ *sz = util_last_bit(instr->regs[0]->wrmask);
+ }
+ *off = 0;
+ d = instr;
+ }
+
+ if (d->regs[0]->flags & IR3_REG_PHI_SRC) {
+ struct ir3_instruction *phi = d->regs[0]->instr;
+ struct ir3_instruction *dd;
+ int dsz, doff;
+
+ dd = get_definer(phi, &dsz, &doff);
+
+ *sz = MAX2(*sz, dsz);
+ *off = doff;
+
+ if (dd->ip < d->ip) {
+ d = dd;
+ }
+ }
- for (i = 1; i < n->regs_count; i++) {
- struct ir3_register *r = reg_check(n, i);
- if (r)
- regmask_set_if_not(liveregs, r, written);
+ if (is_meta(d) && (d->opc == OPC_META_PHI)) {
+ /* we have already inserted parallel-copies into
+ * the phi, so we don't need to chase definers
+ */
+ struct ir3_register *src;
- /* if any src points back to the instruction(s) in
- * the block of neighbors that we are assigning then
- * mark any written (clobbered) registers as live:
+ /* note: don't use foreach_ssa_src as this gets called once
+ * while assigning regs (which clears SSA flag)
*/
- if (instr_used_by(instr, n->regs[i]))
- regmask_or(liveregs, liveregs, written);
+ foreach_src(src, d) {
+ if (!src->instr)
+ continue;
+ if (src->instr->ip < d->ip)
+ d = src->instr;
+ }
}
+ if (is_meta(d) && (d->opc == OPC_META_FO)) {
+ struct ir3_instruction *dd;
+ int dsz, doff;
+
+ dd = get_definer(d->regs[1]->instr, &dsz, &doff);
+
+ /* by definition, should come before: */
+ debug_assert(dd->ip < d->ip);
+
+ *sz = MAX2(*sz, dsz);
+
+ /* Fanout's are grouped, so *off should already valid */
+
+ d = dd;
+ }
+
+ return d;
}
-/* live means read before written */
-static void compute_liveregs(struct ir3_ra_ctx *ctx,
- struct ir3_instruction *instr, regmask_t *liveregs)
+/* give each instruction a name (and ip), and count up the # of names
+ * of each class
+ */
+static void
+ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
{
- struct ir3_block *block = instr->block;
- struct ir3_instruction *n;
- regmask_t written;
- unsigned i;
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+ struct ir3_instruction *defn;
+ int cls, sz, off;
- regmask_init(&written);
+ ctx->instr_cnt++;
- for (n = instr->next; n; n = n->next) {
- struct ir3_register *r;
-
- if (is_meta(n))
+ if (instr->regs_count == 0)
continue;
- /* check first src's read: */
- mark_sources(instr, n, liveregs, &written);
+ if (!writes_gpr(instr))
+ continue;
- /* for instructions that write to an array, we need to
- * capture the dependency on the array elements:
- */
- if (n->fanin)
- mark_sources(instr, n->fanin, liveregs, &written);
+ defn = get_definer(instr, &sz, &off);
- /* meta-instructions don't actually get scheduled,
- * so don't let it's write confuse us.. what we
- * really care about is when the src to the meta
- * instr was written:
- */
- if (is_meta(n))
+ if (defn != instr)
continue;
- /* then dst written (if assigned already): */
- r = reg_check(n, 0);
- if (r) {
- /* if an instruction *is* an output, then it is live */
- if (!instr_is_output(n))
- regmask_set(&written, r);
+ /* arrays which don't fit in one of the pre-defined class
+ * sizes are pre-colored:
+ *
+ * TODO but we still need to allocate names for them, don't we??
+ */
+ cls = size_to_class(sz, is_half(defn));
+ if (cls >= 0) {
+ instr->name = ctx->class_alloc_count[cls]++;
+ ctx->alloc_count++;
}
-
}
+}
- /* be sure to account for output registers too: */
- for (i = 0; i < block->noutputs; i++) {
- struct ir3_register *r;
- if (!block->outputs[i])
- continue;
- r = reg_check(block->outputs[i], 0);
- if (r)
- regmask_set_if_not(liveregs, r, &written);
+static void
+ra_init(struct ir3_ra_ctx *ctx)
+{
+ ir3_clear_mark(ctx->ir);
+ ir3_count_instructions(ctx->ir);
+
+ list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
+ ra_block_name_instructions(ctx, block);
}
- /* if instruction is output, we need a reg that isn't written
- * before the end.. equiv to the instr_used_by() check above
- * in the loop body
- * TODO maybe should follow fanin/fanout?
+ /* figure out the base register name for each class. The
+ * actual ra name is class_base[cls] + instr->name;
*/
- if (instr_is_output(instr))
- regmask_or(liveregs, liveregs, &written);
+ ctx->class_base[0] = 0;
+ for (unsigned i = 1; i < total_class_count; i++) {
+ ctx->class_base[i] = ctx->class_base[i-1] +
+ ctx->class_alloc_count[i-1];
+ }
+
+ ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count);
+ ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
+ ctx->use = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
+}
+
+static unsigned
+ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn)
+{
+ unsigned name;
+ debug_assert(cls >= 0);
+ name = ctx->class_base[cls] + defn->name;
+ debug_assert(name < ctx->alloc_count);
+ return name;
}
-static int find_available(regmask_t *liveregs, int size, bool half)
+static void
+ra_destroy(struct ir3_ra_ctx *ctx)
{
- unsigned i;
- unsigned f = half ? IR3_REG_HALF : 0;
- for (i = 0; i < MAX_REG - size; i++) {
- if (!regmask_get(liveregs, &REG(i, X, f))) {
- unsigned start = i++;
- for (; (i < MAX_REG) && ((i - start) < size); i++)
- if (regmask_get(liveregs, &REG(i, X, f)))
- break;
- if ((i - start) >= size)
- return start;
+ ralloc_free(ctx->g);
+}
+
+static void
+ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
+{
+ struct ir3_ra_block_data *bd;
+ unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
+
+ bd = rzalloc(ctx->g, struct ir3_ra_block_data);
+
+ bd->def = rzalloc_array(bd, BITSET_WORD, bitset_words);
+ bd->use = rzalloc_array(bd, BITSET_WORD, bitset_words);
+ bd->livein = rzalloc_array(bd, BITSET_WORD, bitset_words);
+ bd->liveout = rzalloc_array(bd, BITSET_WORD, bitset_words);
+
+ block->bd = bd;
+
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+ struct ir3_instruction *src;
+
+ if (instr->regs_count == 0)
+ continue;
+
+ /* There are a couple special cases to deal with here:
+ *
+ * fanout: used to split values from a higher class to a lower
+ * class, for example split the results of a texture fetch
+ * into individual scalar values; We skip over these from
+ * a 'def' perspective, and for a 'use' we walk the chain
+ * up to the defining instruction.
+ *
+ * fanin: used to collect values from lower class and assemble
+ * them together into a higher class, for example arguments
+ * to texture sample instructions; We consider these to be
+ * defined at the earliest fanin source.
+ *
+ * phi: used to merge values from different flow control paths
+ * to the same reg. Consider defined at earliest phi src,
+ * and update all the other phi src's (which may come later
+ * in the program) as users to extend the var's live range.
+ *
+ * Most of this, other than phi, is completely handled in the
+ * get_definer() helper.
+ *
+ * In either case, we trace the instruction back to the original
+ * definer and consider that as the def/use ip.
+ */
+
+ if (writes_gpr(instr)) {
+ struct ir3_instruction *defn;
+ int cls, sz, off;
+
+ defn = get_definer(instr, &sz, &off);
+ if (defn == instr) {
+ /* arrays which don't fit in one of the pre-defined class
+ * sizes are pre-colored:
+ */
+ cls = size_to_class(sz, is_half(defn));
+ if (cls >= 0) {
+ unsigned name = ra_name(ctx, cls, defn);
+
+ ctx->def[name] = defn->ip;
+ ctx->use[name] = defn->ip;
+
+ /* since we are in SSA at this point: */
+ debug_assert(!BITSET_TEST(bd->use, name));
+
+ BITSET_SET(bd->def, name);
+
+ if (is_half(defn)) {
+ ra_set_node_class(ctx->g, name,
+ ctx->set->half_classes[cls - class_count]);
+ } else {
+ ra_set_node_class(ctx->g, name,
+ ctx->set->classes[cls]);
+ }
+
+ /* extend the live range for phi srcs, which may come
+ * from the bottom of the loop
+ */
+ if (defn->regs[0]->flags & IR3_REG_PHI_SRC) {
+ struct ir3_instruction *phi = defn->regs[0]->instr;
+ foreach_ssa_src(src, phi) {
+ /* if src is after phi, then we need to extend
+ * the liverange to the end of src's block:
+ */
+ if (src->ip > phi->ip) {
+ struct ir3_instruction *last =
+ list_last_entry(&src->block->instr_list,
+ struct ir3_instruction, node);
+ ctx->use[name] = MAX2(ctx->use[name], last->ip);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ foreach_ssa_src(src, instr) {
+ if (writes_gpr(src)) {
+ struct ir3_instruction *srcdefn;
+ int cls, sz, off;
+
+ srcdefn = get_definer(src, &sz, &off);
+ cls = size_to_class(sz, is_half(srcdefn));
+ if (cls >= 0) {
+ unsigned name = ra_name(ctx, cls, srcdefn);
+ ctx->use[name] = MAX2(ctx->use[name], instr->ip);
+ if (!BITSET_TEST(bd->def, name))
+ BITSET_SET(bd->use, name);
+ }
+ }
}
}
- assert(0);
- return -1;
}
-static int alloc_block(struct ir3_ra_ctx *ctx,
- struct ir3_instruction *instr, int size)
+static bool
+ra_compute_livein_liveout(struct ir3_ra_ctx *ctx)
{
- struct ir3_register *dst = instr->regs[0];
- struct ir3_instruction *n;
- regmask_t liveregs;
- unsigned name;
+ unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
+ bool progress = false;
- /* should only ever be called w/ head of neighbor list: */
- debug_assert(!instr->cp.left);
+ list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
+ struct ir3_ra_block_data *bd = block->bd;
- regmask_init(&liveregs);
+ /* update livein: */
+ for (unsigned i = 0; i < bitset_words; i++) {
+ BITSET_WORD new_livein =
+ (bd->use[i] | (bd->liveout[i] & ~bd->def[i]));
- for (n = instr; n; n = n->cp.right)
- compute_liveregs(ctx, n, &liveregs);
+ if (new_livein & ~bd->livein[i]) {
+ bd->livein[i] |= new_livein;
+ progress = true;
+ }
+ }
- /* because we do assignment on fanout nodes for wrmask!=0x1, we
- * need to handle this special case, where the fanout nodes all
- * appear after one or more of the consumers of the src node:
- *
- * 0098:009: sam _, r2.x
- * 0028:010: mul.f r3.z, r4.x, c13.x
- * ; we start assigning here for '0098:009: sam'.. but
- * ; would miss the usage at '0028:010: mul.f'
- * 0101:009: _meta:fo _, _[0098:009: sam], off=2
- */
- if (is_meta(instr) && (instr->opc == OPC_META_FO))
- compute_liveregs(ctx, instr->regs[1]->instr, &liveregs);
+ /* update liveout: */
+ for (unsigned j = 0; j < ARRAY_SIZE(block->successors); j++) {
+ struct ir3_block *succ = block->successors[j];
+ struct ir3_ra_block_data *succ_bd;
+
+ if (!succ)
+ continue;
- name = find_available(&liveregs, size,
- !!(dst->flags & IR3_REG_HALF));
+ succ_bd = succ->bd;
- if (dst->flags & IR3_REG_HALF)
- name |= REG_HALF;
+ for (unsigned i = 0; i < bitset_words; i++) {
+ BITSET_WORD new_liveout =
+ (succ_bd->livein[i] & ~bd->liveout[i]);
- return name;
+ if (new_liveout) {
+ bd->liveout[i] |= new_liveout;
+ progress = true;
+ }
+ }
+ }
+ }
+
+ return progress;
}
-static type_t half_type(type_t type)
+static void
+ra_add_interference(struct ir3_ra_ctx *ctx)
{
- switch (type) {
- case TYPE_F32: return TYPE_F16;
- case TYPE_U32: return TYPE_U16;
- case TYPE_S32: return TYPE_S16;
- /* instructions may already be fixed up: */
- case TYPE_F16:
- case TYPE_U16:
- case TYPE_S16:
- return type;
- default:
- assert(0);
- return ~0;
+ struct ir3 *ir = ctx->ir;
+
+ /* compute live ranges (use/def) on a block level, also updating
+ * block's def/use bitmasks (used below to calculate per-block
+ * livein/liveout):
+ */
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ ra_block_compute_live_ranges(ctx, block);
+ }
+
+ /* update per-block livein/liveout: */
+ while (ra_compute_livein_liveout(ctx)) {}
+
+ /* extend start/end ranges based on livein/liveout info from cfg: */
+ unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ struct ir3_ra_block_data *bd = block->bd;
+
+ for (unsigned i = 0; i < bitset_words; i++) {
+ if (BITSET_TEST(bd->livein, i)) {
+ ctx->def[i] = MIN2(ctx->def[i], block->start_ip);
+ ctx->use[i] = MAX2(ctx->use[i], block->start_ip);
+ }
+
+ if (BITSET_TEST(bd->liveout, i)) {
+ ctx->def[i] = MIN2(ctx->def[i], block->end_ip);
+ ctx->use[i] = MAX2(ctx->use[i], block->end_ip);
+ }
+ }
+ }
+
+ /* need to fix things up to keep outputs live: */
+ for (unsigned i = 0; i < ir->noutputs; i++) {
+ struct ir3_instruction *instr = ir->outputs[i];
+ struct ir3_instruction *defn;
+ int cls, sz, off;
+
+ defn = get_definer(instr, &sz, &off);
+ cls = size_to_class(sz, is_half(defn));
+ if (cls >= 0) {
+ unsigned name = ra_name(ctx, cls, defn);
+ ctx->use[name] = ctx->instr_cnt;
+ }
+ }
+
+ for (unsigned i = 0; i < ctx->alloc_count; i++) {
+ for (unsigned j = 0; j < ctx->alloc_count; j++) {
+ if (!((ctx->def[i] >= ctx->use[j]) ||
+ (ctx->def[j] >= ctx->use[i]))) {
+ ra_add_node_interference(ctx->g, i, j);
+ }
+ }
}
}
@@ -358,302 +776,124 @@ static void fixup_half_instr_src(struct ir3_instruction *instr)
}
}
-static void reg_assign(struct ir3_instruction *instr,
- unsigned r, unsigned name)
+static void
+reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg,
+ struct ir3_instruction *instr)
{
- struct ir3_register *reg = instr->regs[r];
-
- reg->flags &= ~IR3_REG_SSA;
- reg->num = name & ~REG_HALF;
-
- if (name & REG_HALF) {
- reg->flags |= IR3_REG_HALF;
- /* if dst reg being assigned, patch up the instr: */
- if (reg == instr->regs[0])
- fixup_half_instr_dst(instr);
- else
- fixup_half_instr_src(instr);
- }
-}
-
-static void instr_assign(struct ir3_ra_ctx *ctx,
- struct ir3_instruction *instr, unsigned name);
+ struct ir3_instruction *defn;
+ int cls, sz, off;
-static void instr_assign_src(struct ir3_ra_ctx *ctx,
- struct ir3_instruction *instr, unsigned r, unsigned name)
-{
- struct ir3_register *reg = instr->regs[r];
+ defn = get_definer(instr, &sz, &off);
+ cls = size_to_class(sz, is_half(defn));
+ if (cls >= 0) {
+ unsigned name = ra_name(ctx, cls, defn);
+ unsigned r = ra_get_node_reg(ctx->g, name);
+ unsigned num = ctx->set->ra_reg_to_gpr[r] + off;
- if (reg->flags & IR3_REG_RELATIV)
- name += reg->offset;
+ if (reg->flags & IR3_REG_RELATIV)
+ num += reg->offset;
- reg_assign(instr, r, name);
+ reg->num = num;
+ reg->flags &= ~(IR3_REG_SSA | IR3_REG_PHI_SRC);
- if (is_meta(instr)) {
- switch (instr->opc) {
- case OPC_META_INPUT:
- /* shader-input does not have a src, only block input: */
- debug_assert(instr->regs_count == 2);
- instr_assign(ctx, instr, name);
- return;
- case OPC_META_FO:
- instr_assign(ctx, instr, name + instr->fo.off);
- return;
- case OPC_META_FI:
- instr_assign(ctx, instr, name - (r - 1));
- return;
- default:
- break;
- }
+ if (is_half(defn))
+ reg->flags |= IR3_REG_HALF;
}
}
-static void instr_assign_srcs(struct ir3_ra_ctx *ctx,
- struct ir3_instruction *instr, unsigned name)
+static void
+ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block)
{
- struct ir3_instruction *n, *src;
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+ struct ir3_register *reg;
- for (n = instr->next; n && !ctx->error; n = n->next) {
- foreach_ssa_src_n(src, i, n) {
- unsigned r = i + 1;
-
- /* skip address / etc (non real sources): */
- if (r >= n->regs_count)
- continue;
+ if (instr->regs_count == 0)
+ continue;
- if (src == instr)
- instr_assign_src(ctx, n, r, name);
+ if (writes_gpr(instr)) {
+ reg_assign(ctx, instr->regs[0], instr);
+ if (instr->regs[0]->flags & IR3_REG_HALF)
+ fixup_half_instr_dst(instr);
}
- }
-}
-
-static void instr_assign(struct ir3_ra_ctx *ctx,
- struct ir3_instruction *instr, unsigned name)
-{
- struct ir3_register *reg = instr->regs[0];
-
- if (reg->flags & IR3_REG_RELATIV)
- return;
-
- /* check if already assigned: */
- if (!(reg->flags & IR3_REG_SSA)) {
- /* ... and if so, sanity check: */
- ra_assert(ctx, reg->num == (name & ~REG_HALF));
- return;
- }
-
- /* rename this instructions dst register: */
- reg_assign(instr, 0, name);
-
- /* and rename any subsequent use of result of this instr: */
- instr_assign_srcs(ctx, instr, name);
-
- /* To simplify the neighbor logic, and to "avoid" dealing with
- * instructions which write more than one output, we actually
- * do register assignment for instructions that produce multiple
- * outputs on the fanout nodes and propagate up the assignment
- * to the actual instruction:
- */
- if (is_meta(instr) && (instr->opc == OPC_META_FO)) {
- struct ir3_instruction *src;
- debug_assert(name >= instr->fo.off);
-
- foreach_ssa_src(src, instr)
- instr_assign(ctx, src, name - instr->fo.off);
- }
-}
+ foreach_src_n(reg, n, instr) {
+ struct ir3_instruction *src = reg->instr;
+ if (!src)
+ continue;
-/* check neighbor list to see if it is already partially (or completely)
- * assigned, in which case register block is already allocated and we
- * just need to complete the assignment:
- */
-static int check_partial_assignment(struct ir3_ra_ctx *ctx,
- struct ir3_instruction *instr)
-{
- struct ir3_instruction *n;
- int off = 0;
-
- debug_assert(!instr->cp.left);
-
- for (n = instr; n; n = n->cp.right) {
- struct ir3_register *dst = n->regs[0];
- if ((n->depth != DEPTH_UNUSED) &&
- !(dst->flags & IR3_REG_SSA)) {
- int name = dst->num - off;
- debug_assert(name >= 0);
- return name;
+ reg_assign(ctx, instr->regs[n+1], src);
+ if (instr->regs[n+1]->flags & IR3_REG_HALF)
+ fixup_half_instr_src(instr);
}
- off++;
}
-
- return -1;
}
-/* allocate register name(s) for a list of neighboring instructions;
- * instr should point to leftmost neighbor (head of list)
- */
-static void instr_alloc_and_assign(struct ir3_ra_ctx *ctx,
- struct ir3_instruction *instr)
+static int
+ra_alloc(struct ir3_ra_ctx *ctx)
{
- struct ir3_instruction *n;
- struct ir3_register *dst;
- int name;
-
- debug_assert(!instr->cp.left);
-
- if (instr->regs_count == 0)
- return;
-
- dst = instr->regs[0];
-
- /* For indirect dst, take the register assignment from the
- * fanin and propagate it forward.
- */
- if (dst->flags & IR3_REG_RELATIV) {
- /* NOTE can be grouped, if for example outputs:
- * for now disable cp if indirect writes
- */
- instr_alloc_and_assign(ctx, instr->fanin);
-
- dst->num += instr->fanin->regs[0]->num;
- dst->flags &= ~IR3_REG_SSA;
-
- instr_assign_srcs(ctx, instr, instr->fanin->regs[0]->num);
-
- return;
- }
-
- /* for instructions w/ fanouts, do the actual register assignment
- * on the group of fanout neighbor nodes and propagate the reg
- * name back up to the texture instruction.
- */
- if (dst->wrmask != 0x1)
- return;
-
- name = check_partial_assignment(ctx, instr);
-
- /* allocate register(s): */
- if (name >= 0) {
- /* already partially assigned, just finish the job */
- } else if (reg_gpr(dst)) {
- int size;
- /* number of consecutive registers to assign: */
- size = ir3_neighbor_count(instr);
- if (dst->wrmask != 0x1)
- size = MAX2(size, ffs(~dst->wrmask) - 1);
- name = alloc_block(ctx, instr, size);
- } else if (dst->flags & IR3_REG_ADDR) {
- debug_assert(!instr->cp.right);
- dst->flags &= ~IR3_REG_ADDR;
- name = regid(REG_A0, 0) | REG_HALF;
- } else {
- debug_assert(!instr->cp.right);
- /* predicate register (p0).. etc */
- name = regid(REG_P0, 0);
- debug_assert(dst->num == name);
- }
-
- ra_assert(ctx, name >= 0);
-
- for (n = instr; n && !ctx->error; n = n->cp.right) {
- instr_assign(ctx, n, name);
- name++;
- }
-}
-
-static void instr_assign_array(struct ir3_ra_ctx *ctx,
- struct ir3_instruction *instr)
-{
- struct ir3_instruction *src;
- int name, aid = instr->fi.aid;
-
- if (ctx->arrays[aid].base == ~0) {
- int size = instr->regs_count - 1;
- ctx->arrays[aid].base = alloc_block(ctx, instr, size);
- ctx->arrays[aid].size = size;
- }
-
- name = ctx->arrays[aid].base;
-
- foreach_ssa_src_n(src, i, instr) {
- unsigned r = i + 1;
-
- /* skip address / etc (non real sources): */
- if (r >= instr->regs_count)
- break;
-
- instr_assign(ctx, src, name);
- name++;
- }
-
-}
-
-static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)
-{
- struct ir3_instruction *n;
-
/* frag shader inputs get pre-assigned, since we have some
* constraints/unknowns about setup for some of these regs:
*/
- if ((ctx->type == SHADER_FRAGMENT) && !block->parent) {
+ if (ctx->type == SHADER_FRAGMENT) {
+ struct ir3 *ir = ctx->ir;
unsigned i = 0, j;
- if (ctx->frag_face && (i < block->ninputs) && block->inputs[i]) {
+ if (ctx->frag_face && (i < ir->ninputs) && ir->inputs[i]) {
+ struct ir3_instruction *instr = ir->inputs[i];
+ int cls = size_to_class(1, true);
+ unsigned name = ra_name(ctx, cls, instr);
+ unsigned reg = ctx->set->gpr_to_ra_reg[cls][0];
+
/* if we have frag_face, it gets hr0.x */
- instr_assign(ctx, block->inputs[i], REG_HALF | 0);
+ ra_set_node_reg(ctx->g, name, reg);
i += 4;
}
- for (j = 0; i < block->ninputs; i++, j++)
- if (block->inputs[i])
- instr_assign(ctx, block->inputs[i], j);
- }
- ra_dump_list("-------\n", block->head);
+ for (j = 0; i < ir->ninputs; i++) {
+ struct ir3_instruction *instr = ir->inputs[i];
+ if (instr) {
+ struct ir3_instruction *defn;
+ int cls, sz, off;
- /* first pass, assign arrays: */
- for (n = block->head; n && !ctx->error; n = n->next) {
- if (is_meta(n) && (n->opc == OPC_META_FI) && n->fi.aid) {
- debug_assert(!n->cp.left); /* don't think this should happen */
- ra_dump_instr("ASSIGN ARRAY: ", n);
- instr_assign_array(ctx, n);
- ra_dump_list("-------\n", block->head);
+ defn = get_definer(instr, &sz, &off);
+ if (defn == instr) {
+ unsigned name, reg;
+
+ cls = size_to_class(sz, is_half(defn));
+ name = ra_name(ctx, cls, defn);
+ reg = ctx->set->gpr_to_ra_reg[cls][j];
+
+ ra_set_node_reg(ctx->g, name, reg);
+ j += sz;
+ }
+ }
}
}
- for (n = block->head; n && !ctx->error; n = n->next) {
- ra_dump_instr("ASSIGN: ", n);
- instr_alloc_and_assign(ctx, ir3_neighbor_first(n));
- ra_dump_list("-------\n", block->head);
+ if (!ra_allocate(ctx->g))
+ return -1;
+
+ list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
+ ra_block_alloc(ctx, block);
}
- return ctx->error ? -1 : 0;
+ return 0;
}
-int ir3_block_ra(struct ir3_block *block, enum shader_t type,
+int ir3_ra(struct ir3 *ir, enum shader_t type,
bool frag_coord, bool frag_face)
{
- struct ir3_instruction *n;
struct ir3_ra_ctx ctx = {
- .block = block,
+ .ir = ir,
.type = type,
- .frag_coord = frag_coord,
.frag_face = frag_face,
+ .set = ir->compiler->set,
};
int ret;
- memset(&ctx.arrays, ~0, sizeof(ctx.arrays));
-
- /* mark dst registers w/ SSA flag so we can see which
- * have been assigned so far:
- * NOTE: we really should set SSA flag consistently on
- * every dst register in the frontend.
- */
- for (n = block->head; n; n = n->next)
- if (n->regs_count > 0)
- n->regs[0]->flags |= IR3_REG_SSA;
-
- ir3_clear_mark(block->shader);
- ret = block_ra(&ctx, block);
+ ra_init(&ctx);
+ ra_add_interference(&ctx);
+ ret = ra_alloc(&ctx);
+ ra_destroy(&ctx);
return ret;
}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
index a790cba129b..49a4426d163 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_sched.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
@@ -31,23 +31,14 @@
#include "ir3.h"
-enum {
- SCHEDULED = -1,
- DELAYED = -2,
-};
-
/*
* Instruction Scheduling:
*
- * Using the depth sorted list from depth pass, attempt to recursively
- * schedule deepest unscheduled path. The first instruction that cannot
- * be scheduled, returns the required delay slots it needs, at which
- * point we return back up to the top and attempt to schedule by next
- * highest depth. After a sufficient number of instructions have been
- * scheduled, return back to beginning of list and start again. If you
- * reach the end of depth sorted list without being able to insert any
- * instruction, insert nop's. Repeat until no more unscheduled
- * instructions.
+ * A priority-queue based scheduling algo. Add eligible instructions,
+ * ie. ones with all their dependencies scheduled, to the priority
+ * (depth) sorted queue (list). Pop highest priority instruction off
+ * the queue and schedule it, add newly eligible instructions to the
+ * priority queue, rinse, repeat.
*
* There are a few special cases that need to be handled, since sched
* is currently independent of register allocation. Usages of address
@@ -60,90 +51,33 @@ enum {
*/
struct ir3_sched_ctx {
- struct ir3_instruction *scheduled; /* last scheduled instr */
+ struct ir3_block *block; /* the current block */
+ struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
struct ir3_instruction *addr; /* current a0.x user, if any */
struct ir3_instruction *pred; /* current p0.x user, if any */
- unsigned cnt;
bool error;
};
-static struct ir3_instruction *
-deepest(struct ir3_instruction **srcs, unsigned nsrcs)
-{
- struct ir3_instruction *d = NULL;
- unsigned i = 0, id = 0;
-
- while ((i < nsrcs) && !(d = srcs[id = i]))
- i++;
-
- if (!d)
- return NULL;
-
- for (; i < nsrcs; i++)
- if (srcs[i] && (srcs[i]->depth > d->depth))
- d = srcs[id = i];
-
- srcs[id] = NULL;
-
- return d;
-}
-
-static unsigned distance(struct ir3_sched_ctx *ctx,
- struct ir3_instruction *instr, unsigned maxd)
-{
- struct ir3_instruction *n = ctx->scheduled;
- unsigned d = 0;
- while (n && (n != instr) && (d < maxd)) {
- if (is_alu(n) || is_flow(n))
- d++;
- n = n->next;
- }
- return d;
-}
-
-/* TODO maybe we want double linked list? */
-static struct ir3_instruction * prev(struct ir3_instruction *instr)
-{
- struct ir3_instruction *p = instr->block->head;
- while (p && (p->next != instr))
- p = p->next;
- return p;
-}
-
static bool is_sfu_or_mem(struct ir3_instruction *instr)
{
return is_sfu(instr) || is_mem(instr);
}
-static void schedule(struct ir3_sched_ctx *ctx,
- struct ir3_instruction *instr, bool remove)
+static void
+schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
- struct ir3_block *block = instr->block;
+ debug_assert(ctx->block == instr->block);
/* maybe there is a better way to handle this than just stuffing
* a nop.. ideally we'd know about this constraint in the
* scheduling and depth calculation..
*/
if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
- schedule(ctx, ir3_instr_create(block, 0, OPC_NOP), false);
+ ir3_NOP(ctx->block);
/* remove from depth list:
*/
- if (remove) {
- struct ir3_instruction *p = prev(instr);
-
- /* NOTE: this can happen for inputs which are not
- * read.. in that case there is no need to schedule
- * the input, so just bail:
- */
- if (instr != (p ? p->next : block->head))
- return;
-
- if (p)
- p->next = instr->next;
- else
- block->head = instr->next;
- }
+ list_delinit(&instr->node);
if (writes_addr(instr)) {
assert(ctx->addr == NULL);
@@ -157,18 +91,30 @@ static void schedule(struct ir3_sched_ctx *ctx,
instr->flags |= IR3_INSTR_MARK;
- instr->next = ctx->scheduled;
+ list_addtail(&instr->node, &instr->block->instr_list);
ctx->scheduled = instr;
-
- ctx->cnt++;
}
-/*
- * Delay-slot calculation. Follows fanin/fanout.
- */
+static unsigned
+distance(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr,
+ unsigned maxd)
+{
+ struct list_head *instr_list = &ctx->block->instr_list;
+ unsigned d = 0;
+
+ list_for_each_entry_rev (struct ir3_instruction, n, instr_list, node) {
+ if ((n == instr) || (d >= maxd))
+ break;
+ if (is_alu(n) || is_flow(n))
+ d++;
+ }
+
+ return d;
+}
/* calculate delay for specified src: */
-static unsigned delay_calc_srcn(struct ir3_sched_ctx *ctx,
+static unsigned
+delay_calc_srcn(struct ir3_sched_ctx *ctx,
struct ir3_instruction *assigner,
struct ir3_instruction *consumer, unsigned srcn)
{
@@ -177,7 +123,10 @@ static unsigned delay_calc_srcn(struct ir3_sched_ctx *ctx,
if (is_meta(assigner)) {
struct ir3_instruction *src;
foreach_ssa_src(src, assigner) {
- unsigned d = delay_calc_srcn(ctx, src, consumer, srcn);
+ unsigned d;
+ if (src->block != assigner->block)
+ break;
+ d = delay_calc_srcn(ctx, src, consumer, srcn);
delay = MAX2(delay, d);
}
} else {
@@ -189,48 +138,87 @@ static unsigned delay_calc_srcn(struct ir3_sched_ctx *ctx,
}
/* calculate delay for instruction (maximum of delay for all srcs): */
-static unsigned delay_calc(struct ir3_sched_ctx *ctx,
- struct ir3_instruction *instr)
+static unsigned
+delay_calc(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
unsigned delay = 0;
struct ir3_instruction *src;
foreach_ssa_src_n(src, i, instr) {
- unsigned d = delay_calc_srcn(ctx, src, instr, i);
+ unsigned d;
+ if (src->block != instr->block)
+ continue;
+ d = delay_calc_srcn(ctx, src, instr, i);
delay = MAX2(delay, d);
}
return delay;
}
-/* A negative return value signals that an instruction has been newly
- * SCHEDULED (or DELAYED due to address or predicate register already
- * in use), return back up to the top of the stack (to block_sched())
+struct ir3_sched_notes {
+ /* there is at least one kill which could be scheduled, except
+ * for unscheduled bary.f's:
+ */
+ bool blocked_kill;
+ /* there is at least one instruction that could be scheduled,
+ * except for conflicting address/predicate register usage:
+ */
+ bool addr_conflict, pred_conflict;
+};
+
+static bool is_scheduled(struct ir3_instruction *instr)
+{
+ return !!(instr->flags & IR3_INSTR_MARK);
+}
+
+static bool
+check_conflict(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
+ struct ir3_instruction *instr)
+{
+ /* if this is a write to address/predicate register, and that
+ * register is currently in use, we need to defer until it is
+ * free:
+ */
+ if (writes_addr(instr) && ctx->addr) {
+ assert(ctx->addr != instr);
+ notes->addr_conflict = true;
+ return true;
+ }
+
+ if (writes_pred(instr) && ctx->pred) {
+ assert(ctx->pred != instr);
+ notes->pred_conflict = true;
+ return true;
+ }
+
+ return false;
+}
+
+/* is this instruction ready to be scheduled? Return negative for not
+ * ready (updating notes if needed), or >= 0 to indicate number of
+ * delay slots needed.
*/
-static int trysched(struct ir3_sched_ctx *ctx,
+static int
+instr_eligibility(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
struct ir3_instruction *instr)
{
- struct ir3_instruction *srcs[64];
struct ir3_instruction *src;
- unsigned delay, nsrcs = 0;
+ unsigned delay = 0;
- /* if already scheduled: */
- if (instr->flags & IR3_INSTR_MARK)
+ /* Phi instructions can have a dependency on something not
+ * scheduled yet (for ex, loops). But OTOH we don't really
+ * care. By definition phi's should appear at the top of
+ * the block, and it's sources should be values from the
+ * previously executing block, so they are always ready to
+ * be scheduled:
+ */
+ if (is_meta(instr) && (instr->opc == OPC_META_PHI))
return 0;
- /* figure out our src's, copy 'em out into an array for sorting: */
foreach_ssa_src(src, instr) {
- debug_assert(nsrcs < ARRAY_SIZE(srcs));
- srcs[nsrcs++] = src;
- }
-
- /* for each src register in sorted order:
- */
- delay = 0;
- while ((src = deepest(srcs, nsrcs))) {
- delay = trysched(ctx, src);
- if (delay)
- return delay;
+ /* if dependency not scheduled, we aren't ready yet: */
+ if (!is_scheduled(src))
+ return -1;
}
/* all our dependents are scheduled, figure out if
@@ -255,216 +243,276 @@ static int trysched(struct ir3_sched_ctx *ctx,
*/
if (is_kill(instr)) {
struct ir3 *ir = instr->block->shader;
- unsigned i;
- for (i = 0; i < ir->baryfs_count; i++) {
+ for (unsigned i = 0; i < ir->baryfs_count; i++) {
struct ir3_instruction *baryf = ir->baryfs[i];
if (baryf->depth == DEPTH_UNUSED)
continue;
- delay = trysched(ctx, baryf);
- if (delay)
- return delay;
+ if (!is_scheduled(baryf)) {
+ notes->blocked_kill = true;
+ return -1;
+ }
}
}
- /* if this is a write to address/predicate register, and that
- * register is currently in use, we need to defer until it is
- * free:
- */
- if (writes_addr(instr) && ctx->addr) {
- assert(ctx->addr != instr);
- return DELAYED;
- }
- if (writes_pred(instr) && ctx->pred) {
- assert(ctx->pred != instr);
- return DELAYED;
- }
+ if (check_conflict(ctx, notes, instr))
+ return -1;
- schedule(ctx, instr, true);
- return SCHEDULED;
+ return 0;
}
-static struct ir3_instruction * reverse(struct ir3_instruction *instr)
+/* move eligible instructions to the priority list: */
+static unsigned
+add_eligible_instrs(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
+ struct list_head *prio_queue, struct list_head *unscheduled_list)
{
- struct ir3_instruction *reversed = NULL;
- while (instr) {
- struct ir3_instruction *next = instr->next;
- instr->next = reversed;
- reversed = instr;
- instr = next;
+ unsigned min_delay = ~0;
+
+ list_for_each_entry_safe (struct ir3_instruction, instr, unscheduled_list, node) {
+ int e = instr_eligibility(ctx, notes, instr);
+ if (e < 0)
+ continue;
+ min_delay = MIN2(min_delay, e);
+ if (e == 0) {
+ /* remove from unscheduled list and into priority queue: */
+ list_delinit(&instr->node);
+ ir3_insert_by_depth(instr, prio_queue);
+ }
}
- return reversed;
-}
-static bool uses_current_addr(struct ir3_sched_ctx *ctx,
- struct ir3_instruction *instr)
-{
- return instr->address && (ctx->addr == instr->address);
+ return min_delay;
}
-static bool uses_current_pred(struct ir3_sched_ctx *ctx,
- struct ir3_instruction *instr)
+/* "spill" the address register by remapping any unscheduled
+ * instructions which depend on the current address register
+ * to a clone of the instruction which wrote the address reg.
+ */
+static void
+split_addr(struct ir3_sched_ctx *ctx)
{
- struct ir3_instruction *src;
- foreach_ssa_src(src, instr)
- if (ctx->pred == src)
- return true;
- return false;
+ struct ir3 *ir = ctx->addr->block->shader;
+ struct ir3_instruction *new_addr = NULL;
+ unsigned i;
+
+ debug_assert(ctx->addr);
+
+ for (i = 0; i < ir->indirects_count; i++) {
+ struct ir3_instruction *indirect = ir->indirects[i];
+
+ /* skip instructions already scheduled: */
+ if (indirect->flags & IR3_INSTR_MARK)
+ continue;
+
+ /* remap remaining instructions using current addr
+ * to new addr:
+ */
+ if (indirect->address == ctx->addr) {
+ if (!new_addr) {
+ new_addr = ir3_instr_clone(ctx->addr);
+ /* original addr is scheduled, but new one isn't: */
+ new_addr->flags &= ~IR3_INSTR_MARK;
+ }
+ indirect->address = new_addr;
+ }
+ }
+
+ /* all remaining indirects remapped to new addr: */
+ ctx->addr = NULL;
}
-/* when we encounter an instruction that writes to the address register
- * when it is in use, we delay that instruction and try to schedule all
- * other instructions using the current address register:
+/* "spill" the predicate register by remapping any unscheduled
+ * instructions which depend on the current predicate register
+ * to a clone of the instruction which wrote the predicate reg.
*/
-static int block_sched_undelayed(struct ir3_sched_ctx *ctx,
- struct ir3_block *block)
+static void
+split_pred(struct ir3_sched_ctx *ctx)
{
- struct ir3_instruction *instr = block->head;
- bool addr_in_use = false;
- bool pred_in_use = false;
- bool all_delayed = true;
- unsigned cnt = ~0, attempted = 0;
-
- while (instr) {
- struct ir3_instruction *next = instr->next;
- bool addr = uses_current_addr(ctx, instr);
- bool pred = uses_current_pred(ctx, instr);
-
- if (addr || pred) {
- int ret = trysched(ctx, instr);
-
- if (ret != DELAYED)
- all_delayed = false;
-
- if (ret == SCHEDULED)
- cnt = 0;
- else if (ret > 0)
- cnt = MIN2(cnt, ret);
- if (addr)
- addr_in_use = true;
- if (pred)
- pred_in_use = true;
-
- attempted++;
- }
+ struct ir3 *ir = ctx->pred->block->shader;
+ struct ir3_instruction *new_pred = NULL;
+ unsigned i;
- instr = next;
- }
+ debug_assert(ctx->pred);
- if (!addr_in_use)
- ctx->addr = NULL;
+ for (i = 0; i < ir->predicates_count; i++) {
+ struct ir3_instruction *predicated = ir->predicates[i];
- if (!pred_in_use)
- ctx->pred = NULL;
+ /* skip instructions already scheduled: */
+ if (predicated->flags & IR3_INSTR_MARK)
+ continue;
- /* detect if we've gotten ourselves into an impossible situation
- * and bail if needed
- */
- if (all_delayed && (attempted > 0)) {
- if (pred_in_use) {
- /* TODO we probably need to keep a list of instructions
- * that reference predicate, similar to indirects
- */
- ctx->error = true;
- return DELAYED;
- }
- if (addr_in_use) {
- struct ir3 *ir = ctx->addr->block->shader;
- struct ir3_instruction *new_addr =
- ir3_instr_clone(ctx->addr);
- unsigned i;
-
- /* original addr is scheduled, but new one isn't: */
- new_addr->flags &= ~IR3_INSTR_MARK;
-
- for (i = 0; i < ir->indirects_count; i++) {
- struct ir3_instruction *indirect = ir->indirects[i];
-
- /* skip instructions already scheduled: */
- if (indirect->flags & IR3_INSTR_MARK)
- continue;
-
- /* remap remaining instructions using current addr
- * to new addr:
- */
- if (indirect->address == ctx->addr)
- indirect->address = new_addr;
+ /* remap remaining instructions using current pred
+ * to new pred:
+ *
+ * TODO is there ever a case when pred isn't first
+ * (and only) src?
+ */
+ if (ssa(predicated->regs[1]) == ctx->pred) {
+ if (!new_pred) {
+ new_pred = ir3_instr_clone(ctx->pred);
+ /* original pred is scheduled, but new one isn't: */
+ new_pred->flags &= ~IR3_INSTR_MARK;
}
-
- /* all remaining indirects remapped to new addr: */
- ctx->addr = NULL;
-
- /* not really, but this will trigger us to go back to
- * main trysched() loop now that we've resolved the
- * conflict by duplicating the instr that writes to
- * the address register.
- */
- return SCHEDULED;
+ predicated->regs[1]->instr = new_pred;
}
}
- return cnt;
+ /* all remaining predicated remapped to new pred: */
+ ctx->pred = NULL;
}
-static void block_sched(struct ir3_sched_ctx *ctx, struct ir3_block *block)
+static void
+sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
- struct ir3_instruction *instr;
+ struct list_head unscheduled_list, prio_queue;
- /* schedule all the shader input's (meta-instr) first so that
- * the RA step sees that the input registers contain a value
- * from the start of the shader:
+ ctx->block = block;
+
+ /* move all instructions to the unscheduled list, and
+ * empty the block's instruction list (to which we will
+ * be inserting).
*/
- if (!block->parent) {
- unsigned i;
- for (i = 0; i < block->ninputs; i++) {
- struct ir3_instruction *in = block->inputs[i];
- if (in)
- schedule(ctx, in, true);
+ list_replace(&block->instr_list, &unscheduled_list);
+ list_inithead(&block->instr_list);
+ list_inithead(&prio_queue);
+
+ /* first a pre-pass to schedule all meta:input/phi instructions
+ * (which need to appear first so that RA knows the register is
+ * occupied):
+ */
+ list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
+ if (is_meta(instr) && ((instr->opc == OPC_META_INPUT) ||
+ (instr->opc == OPC_META_PHI)))
+ schedule(ctx, instr);
+ }
+
+ while (!(list_empty(&unscheduled_list) &&
+ list_empty(&prio_queue))) {
+ struct ir3_sched_notes notes = {0};
+ unsigned delay;
+
+ delay = add_eligible_instrs(ctx, &notes, &prio_queue, &unscheduled_list);
+
+ if (!list_empty(&prio_queue)) {
+ struct ir3_instruction *instr = list_last_entry(&prio_queue,
+ struct ir3_instruction, node);
+ /* ugg, this is a bit ugly, but between the time when
+ * the instruction became eligible and now, a new
+ * conflict may have arisen..
+ */
+ if (check_conflict(ctx, &notes, instr)) {
+ list_del(&instr->node);
+ list_addtail(&instr->node, &unscheduled_list);
+ continue;
+ }
+
+ schedule(ctx, instr);
+ } else if (delay == ~0) {
+ /* nothing available to schedule.. if we are blocked on
+ * address/predicate register conflict, then break the
+ * deadlock by cloning the instruction that wrote that
+ * reg:
+ */
+ if (notes.addr_conflict) {
+ split_addr(ctx);
+ } else if (notes.pred_conflict) {
+ split_pred(ctx);
+ } else {
+ debug_assert(0);
+ ctx->error = true;
+ return;
+ }
+ } else {
+ /* and if we run out of instructions that can be scheduled,
+ * then it is time for nop's:
+ */
+ debug_assert(delay <= 6);
+ while (delay > 0) {
+ ir3_NOP(block);
+ delay--;
+ }
}
}
- while ((instr = block->head) && !ctx->error) {
- /* NOTE: always grab next *before* trysched(), in case the
- * instruction is actually scheduled (and therefore moved
- * from depth list into scheduled list)
- */
- struct ir3_instruction *next = instr->next;
- int cnt = trysched(ctx, instr);
+ /* And lastly, insert branch/jump instructions to take us to
+ * the next block. Later we'll strip back out the branches
+ * that simply jump to next instruction.
+ */
+ if (block->successors[1]) {
+ /* if/else, conditional branches to "then" or "else": */
+ struct ir3_instruction *br;
+ unsigned delay = 6;
- if (cnt == DELAYED)
- cnt = block_sched_undelayed(ctx, block);
+ debug_assert(ctx->pred);
+ debug_assert(block->condition);
- /* -1 is signal to return up stack, but to us means same as 0: */
- cnt = MAX2(0, cnt);
- cnt += ctx->cnt;
- instr = next;
+ delay -= distance(ctx, ctx->pred, delay);
- /* if deepest remaining instruction cannot be scheduled, try
- * the increasingly more shallow instructions until needed
- * number of delay slots is filled:
- */
- while (instr && (cnt > ctx->cnt)) {
- next = instr->next;
- trysched(ctx, instr);
- instr = next;
+ while (delay > 0) {
+ ir3_NOP(block);
+ delay--;
}
- /* and if we run out of instructions that can be scheduled,
- * then it is time for nop's:
+ /* create "else" branch first (since "then" block should
+ * frequently/always end up being a fall-thru):
+ */
+ br = ir3_BR(block);
+ br->cat0.inv = true;
+ br->cat0.target = block->successors[1];
+
+ /* NOTE: we have to hard code delay of 6 above, since
+ * we want to insert the nop's before constructing the
+ * branch. Throw in an assert so we notice if this
+ * ever breaks on future generation:
*/
- while (cnt > ctx->cnt)
- schedule(ctx, ir3_instr_create(block, 0, OPC_NOP), false);
+ debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6);
+
+ br = ir3_BR(block);
+ br->cat0.target = block->successors[0];
+
+ } else if (block->successors[0]) {
+ /* otherwise unconditional jump to next block: */
+ struct ir3_instruction *jmp;
+
+ jmp = ir3_JUMP(block);
+ jmp->cat0.target = block->successors[0];
}
- /* at this point, scheduled list is in reverse order, so fix that: */
- block->head = reverse(ctx->scheduled);
+ /* NOTE: if we kept track of the predecessors, we could do a better
+ * job w/ (jp) flags.. every node w/ > 1 predecessor is a join point.
+ * Note that as we eliminate blocks which contain only an unconditional
+ * jump we probably need to propagate (jp) flag..
+ */
}
-int ir3_block_sched(struct ir3_block *block)
+/* this is needed to ensure later RA stage succeeds: */
+static void
+sched_insert_parallel_copies(struct ir3_block *block)
+{
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+ if (is_meta(instr) && (instr->opc == OPC_META_PHI)) {
+ struct ir3_register *reg;
+ foreach_src(reg, instr) {
+ struct ir3_instruction *src = reg->instr;
+ struct ir3_instruction *mov =
+ ir3_MOV(src->block, src, TYPE_U32);
+ mov->regs[0]->flags |= IR3_REG_PHI_SRC;
+ mov->regs[0]->instr = instr;
+ reg->instr = mov;
+ }
+ }
+ }
+}
+
+int ir3_sched(struct ir3 *ir)
{
struct ir3_sched_ctx ctx = {0};
- ir3_clear_mark(block->shader);
- block_sched(&ctx, block);
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ sched_insert_parallel_copies(block);
+ }
+ ir3_clear_mark(ir);
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ sched_block(&ctx, block);
+ }
if (ctx.error)
return -1;
return 0;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
index 9bf4e64c7f1..b5b038100cc 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
@@ -127,7 +127,7 @@ static void
assemble_variant(struct ir3_shader_variant *v)
{
struct fd_context *ctx = fd_context(v->shader->pctx);
- uint32_t gpu_id = ir3_shader_gpuid(v->shader);
+ uint32_t gpu_id = v->shader->compiler->gpu_id;
uint32_t sz, *bin;
bin = ir3_shader_assemble(v, gpu_id);
@@ -146,17 +146,6 @@ assemble_variant(struct ir3_shader_variant *v)
v->ir = NULL;
}
-/* reset before attempting to compile again.. */
-static void reset_variant(struct ir3_shader_variant *v, const char *msg)
-{
- debug_error(msg);
- v->inputs_count = 0;
- v->outputs_count = 0;
- v->total_in = 0;
- v->has_samp = false;
- v->immediates_count = 0;
-}
-
static struct ir3_shader_variant *
create_variant(struct ir3_shader *shader, struct ir3_shader_key key)
{
@@ -177,22 +166,7 @@ create_variant(struct ir3_shader *shader, struct ir3_shader_key key)
tgsi_dump(tokens, 0);
}
- if (fd_mesa_debug & FD_DBG_NIR) {
- ret = ir3_compile_shader_nir(v, tokens, key);
- if (ret)
- reset_variant(v, "NIR compiler failed, fallback to TGSI!");
- } else {
- ret = -1;
- }
-
- if (ret) {
- ret = ir3_compile_shader(v, tokens, key, true);
- if (ret) {
- reset_variant(v, "new compiler failed, trying without copy propagation!");
- ret = ir3_compile_shader(v, tokens, key, false);
- }
- }
-
+ ret = ir3_compile_shader_nir(shader->compiler, v, tokens, key);
if (ret) {
debug_error("compile failed!");
goto fail;
@@ -217,13 +191,6 @@ fail:
return NULL;
}
-uint32_t
-ir3_shader_gpuid(struct ir3_shader *shader)
-{
- struct fd_context *ctx = fd_context(shader->pctx);
- return ctx->screen->gpu_id;
-}
-
struct ir3_shader_variant *
ir3_shader_variant(struct ir3_shader *shader, struct ir3_shader_key key)
{
@@ -286,6 +253,7 @@ ir3_shader_create(struct pipe_context *pctx, const struct tgsi_token *tokens,
enum shader_t type)
{
struct ir3_shader *shader = CALLOC_STRUCT(ir3_shader);
+ shader->compiler = fd_context(pctx)->screen->compiler;
shader->pctx = pctx;
shader->type = type;
shader->tokens = tgsi_dup_tokens(tokens);
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
index e5410bf88b2..9f1b0769180 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
@@ -86,10 +86,6 @@ struct ir3_shader_key {
* shader:
*/
uint16_t fsaturate_s, fsaturate_t, fsaturate_r;
-
- /* bitmask of sampler which produces integer outputs:
- */
- uint16_t vinteger_s, finteger_s;
};
static inline bool
@@ -196,6 +192,8 @@ struct ir3_shader_variant {
struct ir3_shader {
enum shader_t type;
+ struct ir3_compiler *compiler;
+
struct pipe_context *pctx;
const struct tgsi_token *tokens;
@@ -212,7 +210,6 @@ void * ir3_shader_assemble(struct ir3_shader_variant *v, uint32_t gpu_id);
struct ir3_shader * ir3_shader_create(struct pipe_context *pctx,
const struct tgsi_token *tokens, enum shader_t type);
void ir3_shader_destroy(struct ir3_shader *shader);
-uint32_t ir3_shader_gpuid(struct ir3_shader *shader);
struct ir3_shader_variant * ir3_shader_variant(struct ir3_shader *shader,
struct ir3_shader_key key);
@@ -220,6 +217,8 @@ struct ir3_shader_variant * ir3_shader_variant(struct ir3_shader *shader,
* Helper/util:
*/
+#include "pipe/p_shader_tokens.h"
+
static inline int
ir3_find_output(const struct ir3_shader_variant *so, ir3_semantic semantic)
{
diff --git a/src/gallium/drivers/i915/i915_fpc_optimize.c b/src/gallium/drivers/i915/i915_fpc_optimize.c
index e0134a7c4ee..83bb64918d4 100644
--- a/src/gallium/drivers/i915/i915_fpc_optimize.c
+++ b/src/gallium/drivers/i915/i915_fpc_optimize.c
@@ -552,7 +552,7 @@ static boolean i915_fpc_useless_mov(union tgsi_full_token *tgsi_current)
if ( current.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION &&
current.FullInstruction.Instruction.Opcode == TGSI_OPCODE_MOV &&
op_has_dst(current.FullInstruction.Instruction.Opcode) &&
- current.FullInstruction.Instruction.Saturate == TGSI_SAT_NONE &&
+ !current.FullInstruction.Instruction.Saturate &&
current.FullInstruction.Src[0].Register.Absolute == 0 &&
current.FullInstruction.Src[0].Register.Negate == 0 &&
is_unswizzled(&current.FullInstruction.Src[0], current.FullInstruction.Dst[0].Register.WriteMask) &&
@@ -582,7 +582,7 @@ static void i915_fpc_optimize_useless_mov_after_inst(struct i915_optimize_contex
next->Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION &&
next->FullInstruction.Instruction.Opcode == TGSI_OPCODE_MOV &&
op_has_dst(current->FullInstruction.Instruction.Opcode) &&
- next->FullInstruction.Instruction.Saturate == TGSI_SAT_NONE &&
+ !next->FullInstruction.Instruction.Saturate &&
next->FullInstruction.Src[0].Register.Absolute == 0 &&
next->FullInstruction.Src[0].Register.Negate == 0 &&
unused_from(ctx, &current->FullInstruction.Dst[0], index) &&
diff --git a/src/gallium/drivers/i915/i915_fpc_translate.c b/src/gallium/drivers/i915/i915_fpc_translate.c
index b74f8239bb4..38a33888166 100644
--- a/src/gallium/drivers/i915/i915_fpc_translate.c
+++ b/src/gallium/drivers/i915/i915_fpc_translate.c
@@ -329,7 +329,7 @@ get_result_flags(const struct i915_full_instruction *inst)
= inst->Dst[0].Register.WriteMask;
uint flags = 0x0;
- if (inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE)
+ if (inst->Instruction.Saturate)
flags |= A0_DEST_SATURATE;
if (writeMask & TGSI_WRITEMASK_X)
diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c
index 7216160bb22..0590da07b9a 100644
--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -165,6 +165,7 @@ i915_get_shader_param(struct pipe_screen *screen, unsigned shader, enum pipe_sha
case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+ case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
return 0;
default:
debug_printf("%s: Unknown cap %u.\n", __FUNCTION__, cap);
@@ -241,6 +242,7 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap)
case PIPE_CAP_POLYGON_OFFSET_CLAMP:
case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
+ case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
return 0;
case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
diff --git a/src/gallium/drivers/ilo/Makefile.sources b/src/gallium/drivers/ilo/Makefile.sources
index 91a6f65f2e9..e1bbb9a0781 100644
--- a/src/gallium/drivers/ilo/Makefile.sources
+++ b/src/gallium/drivers/ilo/Makefile.sources
@@ -15,14 +15,34 @@ C_SOURCES := \
core/ilo_debug.h \
core/ilo_dev.c \
core/ilo_dev.h \
- core/ilo_format.c \
- core/ilo_format.h \
- core/ilo_fence.h \
core/ilo_image.c \
core/ilo_image.h \
- core/ilo_state_3d.h \
- core/ilo_state_3d_bottom.c \
- core/ilo_state_3d_top.c \
+ core/ilo_state_cc.c \
+ core/ilo_state_cc.h \
+ core/ilo_state_compute.c \
+ core/ilo_state_compute.h \
+ core/ilo_state_raster.c \
+ core/ilo_state_raster.h \
+ core/ilo_state_sampler.c \
+ core/ilo_state_sampler.h \
+ core/ilo_state_sbe.c \
+ core/ilo_state_sbe.h \
+ core/ilo_state_shader.c \
+ core/ilo_state_shader_ps.c \
+ core/ilo_state_shader.h \
+ core/ilo_state_sol.c \
+ core/ilo_state_sol.h \
+ core/ilo_state_surface.c \
+ core/ilo_state_surface_format.c \
+ core/ilo_state_surface.h \
+ core/ilo_state_urb.c \
+ core/ilo_state_urb.h \
+ core/ilo_state_vf.c \
+ core/ilo_state_vf.h \
+ core/ilo_state_viewport.c \
+ core/ilo_state_viewport.h \
+ core/ilo_state_zs.c \
+ core/ilo_state_zs.h \
core/intel_winsys.h \
ilo_blit.c \
ilo_blit.h \
@@ -38,6 +58,8 @@ C_SOURCES := \
ilo_cp.h \
ilo_draw.c \
ilo_draw.h \
+ ilo_format.c \
+ ilo_format.h \
ilo_gpgpu.c \
ilo_gpgpu.h \
ilo_public.h \
diff --git a/src/gallium/drivers/ilo/core/ilo_buffer.h b/src/gallium/drivers/ilo/core/ilo_buffer.h
index 50f97d10bd7..ca3c61ff890 100644
--- a/src/gallium/drivers/ilo/core/ilo_buffer.h
+++ b/src/gallium/drivers/ilo/core/ilo_buffer.h
@@ -31,11 +31,13 @@
#include "intel_winsys.h"
#include "ilo_core.h"
+#include "ilo_debug.h"
#include "ilo_dev.h"
struct ilo_buffer {
unsigned bo_size;
+ /* managed by users */
struct intel_bo *bo;
};
@@ -43,6 +45,8 @@ static inline void
ilo_buffer_init(struct ilo_buffer *buf, const struct ilo_dev *dev,
unsigned size, uint32_t bind, uint32_t flags)
{
+ assert(ilo_is_zeroed(buf, sizeof(*buf)));
+
buf->bo_size = size;
/*
@@ -55,36 +59,6 @@ ilo_buffer_init(struct ilo_buffer *buf, const struct ilo_dev *dev,
*/
if (bind & PIPE_BIND_SAMPLER_VIEW)
buf->bo_size = align(buf->bo_size, 256) + 16;
-
- if ((bind & PIPE_BIND_VERTEX_BUFFER) && ilo_dev_gen(dev) < ILO_GEN(7.5)) {
- /*
- * As noted in ilo_format_translate(), we treat some 3-component formats
- * as 4-component formats to work around hardware limitations. Imagine
- * the case where the vertex buffer holds a single
- * PIPE_FORMAT_R16G16B16_FLOAT vertex, and buf->bo_size is 6. The
- * hardware would fail to fetch it at boundary check because the vertex
- * buffer is expected to hold a PIPE_FORMAT_R16G16B16A16_FLOAT vertex
- * and that takes at least 8 bytes.
- *
- * For the workaround to work, we should add 2 to the bo size. But that
- * would waste a page when the bo size is already page aligned. Let's
- * round it to page size for now and revisit this when needed.
- */
- buf->bo_size = align(buf->bo_size, 4096);
- }
-}
-
-static inline void
-ilo_buffer_cleanup(struct ilo_buffer *buf)
-{
- intel_bo_unref(buf->bo);
-}
-
-static inline void
-ilo_buffer_set_bo(struct ilo_buffer *buf, struct intel_bo *bo)
-{
- intel_bo_unref(buf->bo);
- buf->bo = intel_bo_ref(bo);
}
#endif /* ILO_BUFFER_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_builder.c b/src/gallium/drivers/ilo/core/ilo_builder.c
index 3c5eef9bcbc..4e05a3aca1e 100644
--- a/src/gallium/drivers/ilo/core/ilo_builder.c
+++ b/src/gallium/drivers/ilo/core/ilo_builder.c
@@ -333,7 +333,7 @@ ilo_builder_init(struct ilo_builder *builder,
{
int i;
- memset(builder, 0, sizeof(*builder));
+ assert(ilo_is_zeroed(builder, sizeof(*builder)));
builder->dev = dev;
builder->winsys = winsys;
diff --git a/src/gallium/drivers/ilo/core/ilo_builder_3d.h b/src/gallium/drivers/ilo/core/ilo_builder_3d.h
index 6cf1732ee1c..fb8b53cbe23 100644
--- a/src/gallium/drivers/ilo/core/ilo_builder_3d.h
+++ b/src/gallium/drivers/ilo/core/ilo_builder_3d.h
@@ -35,45 +35,45 @@
#include "ilo_builder_3d_top.h"
#include "ilo_builder_3d_bottom.h"
+struct gen6_3dprimitive_info {
+ enum gen_3dprim_type topology;
+ bool indexed;
+
+ uint32_t vertex_count;
+ uint32_t vertex_start;
+ uint32_t instance_count;
+ uint32_t instance_start;
+ int32_t vertex_base;
+};
+
static inline void
gen6_3DPRIMITIVE(struct ilo_builder *builder,
- const struct pipe_draw_info *info,
- const struct ilo_ib_state *ib)
+ const struct gen6_3dprimitive_info *info)
{
const uint8_t cmd_len = 6;
- const int prim = gen6_3d_translate_pipe_prim(info->mode);
- const int vb_access = (info->indexed) ?
- GEN6_3DPRIM_DW0_ACCESS_RANDOM : GEN6_3DPRIM_DW0_ACCESS_SEQUENTIAL;
- const uint32_t vb_start = info->start +
- ((info->indexed) ? ib->draw_start_offset : 0);
uint32_t *dw;
ILO_DEV_ASSERT(builder->dev, 6, 6);
ilo_builder_batch_pointer(builder, cmd_len, &dw);
- dw[0] = GEN6_RENDER_CMD(3D, 3DPRIMITIVE) |
- vb_access |
- prim << GEN6_3DPRIM_DW0_TYPE__SHIFT |
- (cmd_len - 2);
- dw[1] = info->count;
- dw[2] = vb_start;
+ dw[0] = GEN6_RENDER_CMD(3D, 3DPRIMITIVE) | (cmd_len - 2) |
+ info->topology << GEN6_3DPRIM_DW0_TYPE__SHIFT;
+ if (info->indexed)
+ dw[0] |= GEN6_3DPRIM_DW0_ACCESS_RANDOM;
+
+ dw[1] = info->vertex_count;
+ dw[2] = info->vertex_start;
dw[3] = info->instance_count;
- dw[4] = info->start_instance;
- dw[5] = info->index_bias;
+ dw[4] = info->instance_start;
+ dw[5] = info->vertex_base;
}
static inline void
gen7_3DPRIMITIVE(struct ilo_builder *builder,
- const struct pipe_draw_info *info,
- const struct ilo_ib_state *ib)
+ const struct gen6_3dprimitive_info *info)
{
const uint8_t cmd_len = 7;
- const int prim = gen6_3d_translate_pipe_prim(info->mode);
- const int vb_access = (info->indexed) ?
- GEN7_3DPRIM_DW1_ACCESS_RANDOM : GEN7_3DPRIM_DW1_ACCESS_SEQUENTIAL;
- const uint32_t vb_start = info->start +
- ((info->indexed) ? ib->draw_start_offset : 0);
uint32_t *dw;
ILO_DEV_ASSERT(builder->dev, 7, 8);
@@ -81,12 +81,16 @@ gen7_3DPRIMITIVE(struct ilo_builder *builder,
ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DPRIMITIVE) | (cmd_len - 2);
- dw[1] = vb_access | prim;
- dw[2] = info->count;
- dw[3] = vb_start;
+
+ dw[1] = info->topology << GEN7_3DPRIM_DW1_TYPE__SHIFT;
+ if (info->indexed)
+ dw[1] |= GEN7_3DPRIM_DW1_ACCESS_RANDOM;
+
+ dw[2] = info->vertex_count;
+ dw[3] = info->vertex_start;
dw[4] = info->instance_count;
- dw[5] = info->start_instance;
- dw[6] = info->index_bias;
+ dw[5] = info->instance_start;
+ dw[6] = info->vertex_base;
}
#endif /* ILO_BUILDER_3D_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_builder_3d_bottom.h b/src/gallium/drivers/ilo/core/ilo_builder_3d_bottom.h
index 16ec4afd15b..6d9e3699125 100644
--- a/src/gallium/drivers/ilo/core/ilo_builder_3d_bottom.h
+++ b/src/gallium/drivers/ilo/core/ilo_builder_3d_bottom.h
@@ -29,335 +29,121 @@
#define ILO_BUILDER_3D_BOTTOM_H
#include "genhw/genhw.h"
-#include "../ilo_shader.h"
#include "intel_winsys.h"
#include "ilo_core.h"
#include "ilo_dev.h"
-#include "ilo_format.h"
+#include "ilo_state_cc.h"
+#include "ilo_state_raster.h"
+#include "ilo_state_sbe.h"
+#include "ilo_state_shader.h"
+#include "ilo_state_viewport.h"
+#include "ilo_state_zs.h"
#include "ilo_builder.h"
#include "ilo_builder_3d_top.h"
static inline void
gen6_3DSTATE_CLIP(struct ilo_builder *builder,
- const struct ilo_rasterizer_state *rasterizer,
- const struct ilo_shader_state *fs,
- bool enable_guardband,
- int num_viewports)
-{
- const uint8_t cmd_len = 4;
- uint32_t dw1, dw2, dw3, *dw;
- int interps;
-
- ILO_DEV_ASSERT(builder->dev, 6, 8);
-
- dw1 = rasterizer->clip.payload[0];
- dw2 = rasterizer->clip.payload[1];
- dw3 = rasterizer->clip.payload[2];
-
- if (enable_guardband && rasterizer->clip.can_enable_guardband)
- dw2 |= GEN6_CLIP_DW2_GB_TEST_ENABLE;
-
- interps = (fs) ? ilo_shader_get_kernel_param(fs,
- ILO_KERNEL_FS_BARYCENTRIC_INTERPOLATIONS) : 0;
-
- if (interps & (GEN6_INTERP_NONPERSPECTIVE_PIXEL |
- GEN6_INTERP_NONPERSPECTIVE_CENTROID |
- GEN6_INTERP_NONPERSPECTIVE_SAMPLE))
- dw2 |= GEN6_CLIP_DW2_NONPERSPECTIVE_BARYCENTRIC_ENABLE;
-
- dw3 |= GEN6_CLIP_DW3_RTAINDEX_FORCED_ZERO |
- (num_viewports - 1);
-
- ilo_builder_batch_pointer(builder, cmd_len, &dw);
-
- dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_CLIP) | (cmd_len - 2);
- dw[1] = dw1;
- dw[2] = dw2;
- dw[3] = dw3;
-}
-
-static inline void
-gen6_disable_3DSTATE_CLIP(struct ilo_builder *builder)
+ const struct ilo_state_raster *rs)
{
const uint8_t cmd_len = 4;
uint32_t *dw;
- ILO_DEV_ASSERT(builder->dev, 6, 7.5);
+ ILO_DEV_ASSERT(builder->dev, 6, 8);
ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_CLIP) | (cmd_len - 2);
- dw[1] = 0;
- dw[2] = 0;
- dw[3] = 0;
-}
-
-static inline void
-gen7_internal_3dstate_sf(struct ilo_builder *builder,
- uint8_t cmd_len, uint32_t *dw,
- const struct ilo_rasterizer_sf *sf,
- int num_samples)
-{
- ILO_DEV_ASSERT(builder->dev, 6, 7.5);
-
- assert(cmd_len == 7);
-
- dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_SF) | (cmd_len - 2);
-
- if (!sf) {
- dw[1] = 0;
- dw[2] = (num_samples > 1) ? GEN7_SF_DW2_MSRASTMODE_ON_PATTERN : 0;
- dw[3] = 0;
- dw[4] = 0;
- dw[5] = 0;
- dw[6] = 0;
-
- return;
- }
-
- /* see rasterizer_init_sf_gen6() */
- STATIC_ASSERT(Elements(sf->payload) >= 3);
- dw[1] = sf->payload[0];
- dw[2] = sf->payload[1];
- dw[3] = sf->payload[2];
-
- if (num_samples > 1)
- dw[2] |= sf->dw_msaa;
-
- dw[4] = sf->dw_depth_offset_const;
- dw[5] = sf->dw_depth_offset_scale;
- dw[6] = sf->dw_depth_offset_clamp;
-}
-
-static inline void
-gen8_internal_3dstate_sbe(struct ilo_builder *builder,
- uint8_t cmd_len, uint32_t *dw,
- const struct ilo_shader_state *fs,
- int sprite_coord_mode)
-{
- const struct ilo_kernel_routing *routing;
- int vue_offset, vue_len, out_count;
-
- ILO_DEV_ASSERT(builder->dev, 6, 8);
-
- assert(cmd_len == 4);
-
- dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_SBE) | (cmd_len - 2);
-
- if (!fs) {
- dw[1] = 1 << GEN7_SBE_DW1_URB_READ_LEN__SHIFT;
- dw[2] = 0;
- dw[3] = 0;
- return;
- }
-
- routing = ilo_shader_get_kernel_routing(fs);
-
- vue_offset = routing->source_skip;
- assert(vue_offset % 2 == 0);
- vue_offset /= 2;
-
- vue_len = (routing->source_len + 1) / 2;
- if (!vue_len)
- vue_len = 1;
-
- out_count = ilo_shader_get_kernel_param(fs, ILO_KERNEL_INPUT_COUNT);
- assert(out_count <= 32);
-
- dw[1] = out_count << GEN7_SBE_DW1_ATTR_COUNT__SHIFT |
- vue_len << GEN7_SBE_DW1_URB_READ_LEN__SHIFT;
-
- if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) {
- dw[1] |= GEN8_SBE_DW1_USE_URB_READ_LEN |
- GEN8_SBE_DW1_USE_URB_READ_OFFSET |
- vue_offset << GEN8_SBE_DW1_URB_READ_OFFSET__SHIFT;
- } else {
- dw[1] |= vue_offset << GEN7_SBE_DW1_URB_READ_OFFSET__SHIFT;
- }
-
- if (routing->swizzle_enable)
- dw[1] |= GEN7_SBE_DW1_ATTR_SWIZZLE_ENABLE;
-
- switch (sprite_coord_mode) {
- case PIPE_SPRITE_COORD_UPPER_LEFT:
- dw[1] |= GEN7_SBE_DW1_POINT_SPRITE_TEXCOORD_UPPERLEFT;
- break;
- case PIPE_SPRITE_COORD_LOWER_LEFT:
- dw[1] |= GEN7_SBE_DW1_POINT_SPRITE_TEXCOORD_LOWERLEFT;
- break;
- }
-
- /*
- * From the Ivy Bridge PRM, volume 2 part 1, page 268:
- *
- * "This field (Point Sprite Texture Coordinate Enable) must be
- * programmed to 0 when non-point primitives are rendered."
- *
- * TODO We do not check that yet.
- */
- dw[2] = routing->point_sprite_enable;
-
- dw[3] = routing->const_interp_enable;
-}
-
-static inline void
-gen8_internal_3dstate_sbe_swiz(struct ilo_builder *builder,
- uint8_t cmd_len, uint32_t *dw,
- const struct ilo_shader_state *fs)
-{
- const struct ilo_kernel_routing *routing;
-
- ILO_DEV_ASSERT(builder->dev, 6, 8);
-
- assert(cmd_len == 11);
-
- dw[0] = GEN8_RENDER_CMD(3D, 3DSTATE_SBE_SWIZ) | (cmd_len - 2);
-
- if (!fs) {
- memset(&dw[1], 0, sizeof(*dw) * (cmd_len - 1));
- return;
- }
-
- routing = ilo_shader_get_kernel_routing(fs);
-
- STATIC_ASSERT(sizeof(routing->swizzles) >= sizeof(*dw) * 8);
- memcpy(&dw[1], routing->swizzles, sizeof(*dw) * 8);
-
- /* WrapShortest enables */
- dw[9] = 0;
- dw[10] = 0;
+ /* see raster_set_gen6_3DSTATE_CLIP() */
+ dw[1] = rs->clip[0];
+ dw[2] = rs->clip[1];
+ dw[3] = rs->clip[2];
}
static inline void
gen6_3DSTATE_SF(struct ilo_builder *builder,
- const struct ilo_rasterizer_state *rasterizer,
- const struct ilo_shader_state *fs,
- int sample_count)
+ const struct ilo_state_raster *rs,
+ const struct ilo_state_sbe *sbe)
{
const uint8_t cmd_len = 20;
- uint32_t gen8_3dstate_sbe[4], gen8_3dstate_sbe_swiz[11];
- uint32_t gen7_3dstate_sf[7];
- const struct ilo_rasterizer_sf *sf;
- int sprite_coord_mode;
uint32_t *dw;
ILO_DEV_ASSERT(builder->dev, 6, 6);
- sf = (rasterizer) ? &rasterizer->sf : NULL;
- sprite_coord_mode = (rasterizer) ? rasterizer->state.sprite_coord_mode : 0;
-
- gen8_internal_3dstate_sbe(builder, Elements(gen8_3dstate_sbe),
- gen8_3dstate_sbe, fs, sprite_coord_mode);
- gen8_internal_3dstate_sbe_swiz(builder, Elements(gen8_3dstate_sbe_swiz),
- gen8_3dstate_sbe_swiz, fs);
- gen7_internal_3dstate_sf(builder, Elements(gen7_3dstate_sf),
- gen7_3dstate_sf, sf, sample_count);
-
ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_SF) | (cmd_len - 2);
- dw[1] = gen8_3dstate_sbe[1];
- memcpy(&dw[2], &gen7_3dstate_sf[1], sizeof(*dw) * 6);
- memcpy(&dw[8], &gen8_3dstate_sbe_swiz[1], sizeof(*dw) * 8);
- dw[16] = gen8_3dstate_sbe[2];
- dw[17] = gen8_3dstate_sbe[3];
- dw[18] = gen8_3dstate_sbe_swiz[9];
- dw[19] = gen8_3dstate_sbe_swiz[10];
+ /* see sbe_set_gen8_3DSTATE_SBE() */
+ dw[1] = sbe->sbe[0];
+
+ /* see raster_set_gen7_3DSTATE_SF() */
+ dw[2] = rs->sf[0];
+ dw[3] = rs->sf[1];
+ dw[4] = rs->sf[2];
+ dw[5] = rs->raster[1];
+ dw[6] = rs->raster[2];
+ dw[7] = rs->raster[3];
+
+ /* see sbe_set_gen8_3DSTATE_SBE_SWIZ() */
+ memcpy(&dw[8], sbe->swiz, sizeof(*dw) * 8);
+
+ dw[16] = sbe->sbe[1];
+ dw[17] = sbe->sbe[2];
+ /* WrapShortest enables */
+ dw[18] = 0;
+ dw[19] = 0;
}
static inline void
gen7_3DSTATE_SF(struct ilo_builder *builder,
- const struct ilo_rasterizer_sf *sf,
- enum pipe_format zs_format,
- int sample_count)
+ const struct ilo_state_raster *rs)
{
- const uint8_t cmd_len = 7;
+ const uint8_t cmd_len = (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) ? 4 : 7;
uint32_t *dw;
- ILO_DEV_ASSERT(builder->dev, 7, 7.5);
-
- ilo_builder_batch_pointer(builder, cmd_len, &dw);
-
- gen7_internal_3dstate_sf(builder, cmd_len, dw, sf, sample_count);
-
- if (ilo_dev_gen(builder->dev) >= ILO_GEN(7)) {
- int hw_format;
-
- /* separate stencil */
- switch (zs_format) {
- case PIPE_FORMAT_Z16_UNORM:
- hw_format = GEN6_ZFORMAT_D16_UNORM;
- break;
- case PIPE_FORMAT_Z32_FLOAT:
- case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
- hw_format = GEN6_ZFORMAT_D32_FLOAT;
- break;
- case PIPE_FORMAT_Z24X8_UNORM:
- case PIPE_FORMAT_Z24_UNORM_S8_UINT:
- hw_format = GEN6_ZFORMAT_D24_UNORM_X8_UINT;
- break;
- default:
- /* FLOAT surface is assumed when there is no depth buffer */
- hw_format = GEN6_ZFORMAT_D32_FLOAT;
- break;
- }
-
- dw[1] |= hw_format << GEN7_SF_DW1_DEPTH_FORMAT__SHIFT;
- }
-}
-
-static inline void
-gen8_3DSTATE_SF(struct ilo_builder *builder,
- const struct ilo_rasterizer_sf *sf)
-{
- const uint8_t cmd_len = 4;
- uint32_t *dw;
-
- ILO_DEV_ASSERT(builder->dev, 8, 8);
+ ILO_DEV_ASSERT(builder->dev, 7, 8);
ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_SF) | (cmd_len - 2);
- /* see rasterizer_init_sf_gen8() */
- STATIC_ASSERT(Elements(sf->payload) >= 3);
- dw[1] = sf->payload[0];
- dw[2] = sf->payload[1];
- dw[3] = sf->payload[2];
+ /* see raster_set_gen7_3DSTATE_SF() or raster_set_gen8_3DSTATE_SF() */
+ dw[1] = rs->sf[0];
+ dw[2] = rs->sf[1];
+ dw[3] = rs->sf[2];
+ if (ilo_dev_gen(builder->dev) < ILO_GEN(8)) {
+ dw[4] = rs->raster[1];
+ dw[5] = rs->raster[2];
+ dw[6] = rs->raster[3];
+ }
}
static inline void
gen7_3DSTATE_SBE(struct ilo_builder *builder,
- const struct ilo_shader_state *fs,
- int sprite_coord_mode)
+ const struct ilo_state_sbe *sbe)
{
const uint8_t cmd_len = 14;
- uint32_t gen8_3dstate_sbe[4], gen8_3dstate_sbe_swiz[11];
uint32_t *dw;
ILO_DEV_ASSERT(builder->dev, 7, 7.5);
- gen8_internal_3dstate_sbe(builder, Elements(gen8_3dstate_sbe),
- gen8_3dstate_sbe, fs, sprite_coord_mode);
- gen8_internal_3dstate_sbe_swiz(builder, Elements(gen8_3dstate_sbe_swiz),
- gen8_3dstate_sbe_swiz, fs);
-
ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_SBE) | (cmd_len - 2);
- dw[1] = gen8_3dstate_sbe[1];
- memcpy(&dw[2], &gen8_3dstate_sbe_swiz[1], sizeof(*dw) * 8);
- dw[10] = gen8_3dstate_sbe[2];
- dw[11] = gen8_3dstate_sbe[3];
- dw[12] = gen8_3dstate_sbe_swiz[9];
- dw[13] = gen8_3dstate_sbe_swiz[10];
+ /* see sbe_set_gen8_3DSTATE_SBE() and sbe_set_gen8_3DSTATE_SBE_SWIZ() */
+ dw[1] = sbe->sbe[0];
+ memcpy(&dw[2], sbe->swiz, sizeof(*dw) * 8);
+ dw[10] = sbe->sbe[1];
+ dw[11] = sbe->sbe[2];
+
+ /* WrapShortest enables */
+ dw[12] = 0;
+ dw[13] = 0;
}
static inline void
gen8_3DSTATE_SBE(struct ilo_builder *builder,
- const struct ilo_shader_state *fs,
- int sprite_coord_mode)
+ const struct ilo_state_sbe *sbe)
{
const uint8_t cmd_len = 4;
uint32_t *dw;
@@ -366,12 +152,16 @@ gen8_3DSTATE_SBE(struct ilo_builder *builder,
ilo_builder_batch_pointer(builder, cmd_len, &dw);
- gen8_internal_3dstate_sbe(builder, cmd_len, dw, fs, sprite_coord_mode);
+ /* see sbe_set_gen8_3DSTATE_SBE() */
+ dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_SBE) | (cmd_len - 2);
+ dw[1] = sbe->sbe[0];
+ dw[2] = sbe->sbe[1];
+ dw[3] = sbe->sbe[2];
}
static inline void
gen8_3DSTATE_SBE_SWIZ(struct ilo_builder *builder,
- const struct ilo_shader_state *fs)
+ const struct ilo_state_sbe *sbe)
{
const uint8_t cmd_len = 11;
uint32_t *dw;
@@ -380,12 +170,17 @@ gen8_3DSTATE_SBE_SWIZ(struct ilo_builder *builder,
ilo_builder_batch_pointer(builder, cmd_len, &dw);
- gen8_internal_3dstate_sbe_swiz(builder, cmd_len, dw, fs);
+ dw[0] = GEN8_RENDER_CMD(3D, 3DSTATE_SBE_SWIZ) | (cmd_len - 2);
+ /* see sbe_set_gen8_3DSTATE_SBE_SWIZ() */
+ memcpy(&dw[1], sbe->swiz, sizeof(*dw) * 8);
+ /* WrapShortest enables */
+ dw[9] = 0;
+ dw[10] = 0;
}
static inline void
gen8_3DSTATE_RASTER(struct ilo_builder *builder,
- const struct ilo_rasterizer_sf *sf)
+ const struct ilo_state_raster *rs)
{
const uint8_t cmd_len = 5;
uint32_t *dw;
@@ -395,232 +190,108 @@ gen8_3DSTATE_RASTER(struct ilo_builder *builder,
ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN8_RENDER_CMD(3D, 3DSTATE_RASTER) | (cmd_len - 2);
- dw[1] = sf->dw_raster;
- dw[2] = sf->dw_depth_offset_const;
- dw[3] = sf->dw_depth_offset_scale;
- dw[4] = sf->dw_depth_offset_clamp;
+ /* see raster_set_gen8_3DSTATE_RASTER() */
+ dw[1] = rs->raster[0];
+ dw[2] = rs->raster[1];
+ dw[3] = rs->raster[2];
+ dw[4] = rs->raster[3];
}
static inline void
gen6_3DSTATE_WM(struct ilo_builder *builder,
- const struct ilo_shader_state *fs,
- const struct ilo_rasterizer_state *rasterizer,
- bool dual_blend, bool cc_may_kill)
+ const struct ilo_state_raster *rs,
+ const struct ilo_state_ps *ps,
+ uint32_t kernel_offset)
{
const uint8_t cmd_len = 9;
- const int num_samples = 1;
- const struct ilo_shader_cso *cso;
- uint32_t dw2, dw4, dw5, dw6, *dw;
+ uint32_t *dw;
ILO_DEV_ASSERT(builder->dev, 6, 6);
- cso = ilo_shader_get_kernel_cso(fs);
- dw2 = cso->payload[0];
- dw4 = cso->payload[1];
- dw5 = cso->payload[2];
- dw6 = cso->payload[3];
-
- /*
- * From the Sandy Bridge PRM, volume 2 part 1, page 248:
- *
- * "This bit (Statistics Enable) must be disabled if either of these
- * bits is set: Depth Buffer Clear , Hierarchical Depth Buffer Resolve
- * Enable or Depth Buffer Resolve Enable."
- */
- dw4 |= GEN6_WM_DW4_STATISTICS;
-
- if (cc_may_kill)
- dw5 |= GEN6_WM_DW5_PS_KILL_PIXEL | GEN6_WM_DW5_PS_DISPATCH_ENABLE;
-
- if (dual_blend)
- dw5 |= GEN6_WM_DW5_PS_DUAL_SOURCE_BLEND;
-
- dw5 |= rasterizer->wm.payload[0];
-
- dw6 |= rasterizer->wm.payload[1];
-
- if (num_samples > 1) {
- dw6 |= rasterizer->wm.dw_msaa_rast |
- rasterizer->wm.dw_msaa_disp;
- }
-
ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_WM) | (cmd_len - 2);
- dw[1] = ilo_shader_get_kernel_offset(fs);
- dw[2] = dw2;
- dw[3] = 0; /* scratch */
- dw[4] = dw4;
- dw[5] = dw5;
- dw[6] = dw6;
+ dw[1] = kernel_offset;
+ /* see raster_set_gen6_3dstate_wm() and ps_set_gen6_3dstate_wm() */
+ dw[2] = ps->ps[0];
+ dw[3] = ps->ps[1];
+ dw[4] = rs->wm[0] | ps->ps[2];
+ dw[5] = rs->wm[1] | ps->ps[3];
+ dw[6] = rs->wm[2] | ps->ps[4];
dw[7] = 0; /* kernel 1 */
dw[8] = 0; /* kernel 2 */
}
static inline void
-gen6_hiz_3DSTATE_WM(struct ilo_builder *builder, uint32_t hiz_op)
-{
- const uint8_t cmd_len = 9;
- const int max_threads = (builder->dev->gt == 2) ? 80 : 40;
- uint32_t *dw;
-
- ILO_DEV_ASSERT(builder->dev, 6, 6);
-
- ilo_builder_batch_pointer(builder, cmd_len, &dw);
-
- dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_WM) | (cmd_len - 2);
- dw[1] = 0;
- dw[2] = 0;
- dw[3] = 0;
- dw[4] = hiz_op;
- /* honor the valid range even if dispatching is disabled */
- dw[5] = (max_threads - 1) << GEN6_WM_DW5_MAX_THREADS__SHIFT;
- dw[6] = 0;
- dw[7] = 0;
- dw[8] = 0;
-}
-
-static inline void
gen7_3DSTATE_WM(struct ilo_builder *builder,
- const struct ilo_shader_state *fs,
- const struct ilo_rasterizer_state *rasterizer,
- bool cc_may_kill)
+ const struct ilo_state_raster *rs,
+ const struct ilo_state_ps *ps)
{
const uint8_t cmd_len = 3;
- const int num_samples = 1;
- const struct ilo_shader_cso *cso;
- uint32_t dw1, dw2, *dw;
+ uint32_t *dw;
ILO_DEV_ASSERT(builder->dev, 7, 7.5);
- /* see rasterizer_init_wm_gen7() */
- dw1 = rasterizer->wm.payload[0];
- dw2 = rasterizer->wm.payload[1];
-
- /* see fs_init_cso_gen7() */
- cso = ilo_shader_get_kernel_cso(fs);
- dw1 |= cso->payload[3];
-
- dw1 |= GEN7_WM_DW1_STATISTICS;
-
- if (cc_may_kill)
- dw1 |= GEN7_WM_DW1_PS_DISPATCH_ENABLE | GEN7_WM_DW1_PS_KILL_PIXEL;
-
- if (num_samples > 1) {
- dw1 |= rasterizer->wm.dw_msaa_rast;
- dw2 |= rasterizer->wm.dw_msaa_disp;
- }
-
ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_WM) | (cmd_len - 2);
- dw[1] = dw1;
- dw[2] = dw2;
+ /* see raster_set_gen8_3DSTATE_WM() and ps_set_gen7_3dstate_wm() */
+ dw[1] = rs->wm[0] | ps->ps[0];
+ dw[2] = ps->ps[1];
}
static inline void
gen8_3DSTATE_WM(struct ilo_builder *builder,
- const struct ilo_shader_state *fs,
- const struct ilo_rasterizer_state *rasterizer)
+ const struct ilo_state_raster *rs)
{
const uint8_t cmd_len = 2;
- const struct ilo_shader_cso *cso;
- uint32_t dw1, interps, *dw;
+ uint32_t *dw;
ILO_DEV_ASSERT(builder->dev, 8, 8);
- /* see rasterizer_get_wm_gen8() */
- dw1 = rasterizer->wm.payload[0];
- dw1 |= GEN7_WM_DW1_STATISTICS;
-
- /* see fs_init_cso_gen8() */
- cso = ilo_shader_get_kernel_cso(fs);
- interps = cso->payload[4];
-
- assert(!(dw1 & interps));
-
ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_WM) | (cmd_len - 2);
- dw[1] = dw1 | interps;
-}
-
-static inline void
-gen7_hiz_3DSTATE_WM(struct ilo_builder *builder, uint32_t hiz_op)
-{
- const uint8_t cmd_len = 3;
- uint32_t *dw;
-
- ILO_DEV_ASSERT(builder->dev, 7, 7.5);
-
- ilo_builder_batch_pointer(builder, cmd_len, &dw);
- dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_WM) | (cmd_len - 2);
- dw[1] = hiz_op;
- dw[2] = 0;
+ /* see raster_set_gen8_3DSTATE_WM() */
+ dw[1] = rs->wm[0];
}
static inline void
gen8_3DSTATE_WM_DEPTH_STENCIL(struct ilo_builder *builder,
- const struct ilo_dsa_state *dsa)
+ const struct ilo_state_cc *cc)
{
const uint8_t cmd_len = 3;
- uint32_t dw1, dw2, *dw;
+ uint32_t *dw;
ILO_DEV_ASSERT(builder->dev, 8, 8);
- dw1 = dsa->payload[0];
- dw2 = dsa->payload[1];
-
ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN8_RENDER_CMD(3D, 3DSTATE_WM_DEPTH_STENCIL) | (cmd_len - 2);
- dw[1] = dw1;
- dw[2] = dw2;
+ /* see cc_set_gen8_3DSTATE_WM_DEPTH_STENCIL() */
+ dw[1] = cc->ds[0];
+ dw[2] = cc->ds[1];
}
static inline void
-gen8_3DSTATE_WM_HZ_OP(struct ilo_builder *builder, uint32_t op,
- uint16_t width, uint16_t height, int sample_count)
+gen8_3DSTATE_WM_HZ_OP(struct ilo_builder *builder,
+ const struct ilo_state_raster *rs,
+ uint16_t width, uint16_t height)
{
const uint8_t cmd_len = 5;
- const uint32_t sample_mask = ((1 << sample_count) - 1) | 0x1;
- uint32_t dw1, *dw;
+ uint32_t *dw;
ILO_DEV_ASSERT(builder->dev, 8, 8);
- dw1 = op;
-
- switch (sample_count) {
- case 0:
- case 1:
- dw1 |= GEN8_WM_HZ_DW1_NUMSAMPLES_1;
- break;
- case 2:
- dw1 |= GEN8_WM_HZ_DW1_NUMSAMPLES_2;
- break;
- case 4:
- dw1 |= GEN8_WM_HZ_DW1_NUMSAMPLES_4;
- break;
- case 8:
- dw1 |= GEN8_WM_HZ_DW1_NUMSAMPLES_8;
- break;
- case 16:
- dw1 |= GEN8_WM_HZ_DW1_NUMSAMPLES_16;
- break;
- default:
- assert(!"unsupported sample count");
- dw1 |= GEN8_WM_HZ_DW1_NUMSAMPLES_1;
- break;
- }
-
ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN8_RENDER_CMD(3D, 3DSTATE_WM_HZ_OP) | (cmd_len - 2);
- dw[1] = dw1;
+ /* see raster_set_gen8_3dstate_wm_hz_op() */
+ dw[1] = rs->wm[1];
dw[2] = 0;
- /* exclusive? */
+ /* exclusive */
dw[3] = height << 16 | width;
- dw[4] = sample_mask;
+ dw[4] = rs->wm[2];
}
static inline void
@@ -656,100 +327,48 @@ gen8_3DSTATE_WM_CHROMAKEY(struct ilo_builder *builder)
static inline void
gen7_3DSTATE_PS(struct ilo_builder *builder,
- const struct ilo_shader_state *fs,
- bool dual_blend)
+ const struct ilo_state_ps *ps,
+ uint32_t kernel_offset)
{
const uint8_t cmd_len = 8;
- const struct ilo_shader_cso *cso;
- uint32_t dw2, dw4, dw5, *dw;
+ uint32_t *dw;
ILO_DEV_ASSERT(builder->dev, 7, 7.5);
- /* see fs_init_cso_gen7() */
- cso = ilo_shader_get_kernel_cso(fs);
- dw2 = cso->payload[0];
- dw4 = cso->payload[1];
- dw5 = cso->payload[2];
-
- if (dual_blend)
- dw4 |= GEN7_PS_DW4_DUAL_SOURCE_BLEND;
-
ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_PS) | (cmd_len - 2);
- dw[1] = ilo_shader_get_kernel_offset(fs);
- dw[2] = dw2;
- dw[3] = 0; /* scratch */
- dw[4] = dw4;
- dw[5] = dw5;
+ dw[1] = kernel_offset;
+ /* see ps_set_gen7_3DSTATE_PS() */
+ dw[2] = ps->ps[2];
+ dw[3] = ps->ps[3];
+ dw[4] = ps->ps[4];
+ dw[5] = ps->ps[5];
dw[6] = 0; /* kernel 1 */
dw[7] = 0; /* kernel 2 */
}
static inline void
-gen7_disable_3DSTATE_PS(struct ilo_builder *builder)
-{
- const uint8_t cmd_len = 8;
- int max_threads;
- uint32_t dw4, *dw;
-
- ILO_DEV_ASSERT(builder->dev, 7, 7.5);
-
- /* GPU hangs if none of the dispatch enable bits is set */
- dw4 = GEN6_PS_DISPATCH_8 << GEN7_PS_DW4_DISPATCH_MODE__SHIFT;
-
- /* see brwCreateContext() */
- switch (ilo_dev_gen(builder->dev)) {
- case ILO_GEN(7.5):
- max_threads = (builder->dev->gt == 3) ? 408 :
- (builder->dev->gt == 2) ? 204 : 102;
- dw4 |= (max_threads - 1) << GEN75_PS_DW4_MAX_THREADS__SHIFT;
- break;
- case ILO_GEN(7):
- default:
- max_threads = (builder->dev->gt == 2) ? 172 : 48;
- dw4 |= (max_threads - 1) << GEN7_PS_DW4_MAX_THREADS__SHIFT;
- break;
- }
-
- ilo_builder_batch_pointer(builder, cmd_len, &dw);
-
- dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_PS) | (cmd_len - 2);
- dw[1] = 0;
- dw[2] = 0;
- dw[3] = 0;
- dw[4] = dw4;
- dw[5] = 0;
- dw[6] = 0;
- dw[7] = 0;
-}
-
-static inline void
gen8_3DSTATE_PS(struct ilo_builder *builder,
- const struct ilo_shader_state *fs)
+ const struct ilo_state_ps *ps,
+ uint32_t kernel_offset)
{
const uint8_t cmd_len = 12;
- const struct ilo_shader_cso *cso;
- uint32_t dw3, dw6, dw7, *dw;
+ uint32_t *dw;
ILO_DEV_ASSERT(builder->dev, 8, 8);
- /* see fs_init_cso_gen8() */
- cso = ilo_shader_get_kernel_cso(fs);
- dw3 = cso->payload[0];
- dw6 = cso->payload[1];
- dw7 = cso->payload[2];
-
ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_PS) | (cmd_len - 2);
- dw[1] = ilo_shader_get_kernel_offset(fs);
+ dw[1] = kernel_offset;
dw[2] = 0;
- dw[3] = dw3;
- dw[4] = 0; /* scratch */
+ /* see ps_set_gen8_3DSTATE_PS() */
+ dw[3] = ps->ps[0];
+ dw[4] = ps->ps[1];
dw[5] = 0;
- dw[6] = dw6;
- dw[7] = dw7;
+ dw[6] = ps->ps[2];
+ dw[7] = ps->ps[3];
dw[8] = 0; /* kernel 1 */
dw[9] = 0;
dw[10] = 0; /* kernel 2 */
@@ -758,66 +377,34 @@ gen8_3DSTATE_PS(struct ilo_builder *builder,
static inline void
gen8_3DSTATE_PS_EXTRA(struct ilo_builder *builder,
- const struct ilo_shader_state *fs,
- bool cc_may_kill, bool per_sample)
+ const struct ilo_state_ps *ps)
{
const uint8_t cmd_len = 2;
- const struct ilo_shader_cso *cso;
- uint32_t dw1, *dw;
+ uint32_t *dw;
ILO_DEV_ASSERT(builder->dev, 8, 8);
- /* see fs_init_cso_gen8() */
- cso = ilo_shader_get_kernel_cso(fs);
- dw1 = cso->payload[3];
-
- if (cc_may_kill)
- dw1 |= GEN8_PSX_DW1_DISPATCH_ENABLE | GEN8_PSX_DW1_KILL_PIXEL;
- if (per_sample)
- dw1 |= GEN8_PSX_DW1_PER_SAMPLE;
-
ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN8_RENDER_CMD(3D, 3DSTATE_PS_EXTRA) | (cmd_len - 2);
- dw[1] = dw1;
+ /* see ps_set_gen8_3DSTATE_PS_EXTRA() */
+ dw[1] = ps->ps[4];
}
static inline void
gen8_3DSTATE_PS_BLEND(struct ilo_builder *builder,
- const struct ilo_blend_state *blend,
- const struct ilo_fb_state *fb,
- const struct ilo_dsa_state *dsa)
+ const struct ilo_state_cc *cc)
{
const uint8_t cmd_len = 2;
- uint32_t dw1, *dw;
+ uint32_t *dw;
ILO_DEV_ASSERT(builder->dev, 8, 8);
- dw1 = 0;
- if (blend->alpha_to_coverage && fb->num_samples > 1)
- dw1 |= GEN8_PS_BLEND_DW1_ALPHA_TO_COVERAGE;
-
- if (fb->state.nr_cbufs && fb->state.cbufs[0]) {
- const struct ilo_fb_blend_caps *caps = &fb->blend_caps[0];
-
- dw1 |= GEN8_PS_BLEND_DW1_WRITABLE_RT;
- if (caps->can_blend) {
- if (caps->dst_alpha_forced_one)
- dw1 |= blend->dw_ps_blend_dst_alpha_forced_one;
- else
- dw1 |= blend->dw_ps_blend;
- }
-
- if (caps->can_alpha_test)
- dw1 |= dsa->dw_ps_blend_alpha;
- } else {
- dw1 |= dsa->dw_ps_blend_alpha;
- }
-
ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN8_RENDER_CMD(3D, 3DSTATE_PS_BLEND) | (cmd_len - 2);
- dw[1] = dw1;
+ /* see cc_set_gen8_3DSTATE_PS_BLEND() */
+ dw[1] = cc->blend[0];
}
static inline void
@@ -862,101 +449,49 @@ gen7_3DSTATE_SAMPLER_STATE_POINTERS_PS(struct ilo_builder *builder,
static inline void
gen6_3DSTATE_MULTISAMPLE(struct ilo_builder *builder,
- int num_samples, const uint32_t *pattern,
- bool pixel_location_center)
+ const struct ilo_state_raster *rs,
+ const struct ilo_state_sample_pattern *pattern,
+ uint8_t sample_count)
{
const uint8_t cmd_len = (ilo_dev_gen(builder->dev) >= ILO_GEN(7)) ? 4 : 3;
- uint32_t dw1, dw2, dw3, *dw;
+ const uint32_t *packed = (const uint32_t *)
+ ilo_state_sample_pattern_get_packed_offsets(pattern,
+ builder->dev, sample_count);
+ uint32_t *dw;
ILO_DEV_ASSERT(builder->dev, 6, 7.5);
- dw1 = (pixel_location_center) ? GEN6_MULTISAMPLE_DW1_PIXLOC_CENTER :
- GEN6_MULTISAMPLE_DW1_PIXLOC_UL_CORNER;
-
- switch (num_samples) {
- case 0:
- case 1:
- dw1 |= GEN6_MULTISAMPLE_DW1_NUMSAMPLES_1;
- dw2 = 0;
- dw3 = 0;
- break;
- case 4:
- dw1 |= GEN6_MULTISAMPLE_DW1_NUMSAMPLES_4;
- dw2 = pattern[0];
- dw3 = 0;
- break;
- case 8:
- assert(ilo_dev_gen(builder->dev) >= ILO_GEN(7));
- dw1 |= GEN7_MULTISAMPLE_DW1_NUMSAMPLES_8;
- dw2 = pattern[0];
- dw3 = pattern[1];
- break;
- default:
- assert(!"unsupported sample count");
- dw1 |= GEN6_MULTISAMPLE_DW1_NUMSAMPLES_1;
- dw2 = 0;
- dw3 = 0;
- break;
- }
-
ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_MULTISAMPLE) | (cmd_len - 2);
- dw[1] = dw1;
- dw[2] = dw2;
+ /* see raster_set_gen8_3DSTATE_MULTISAMPLE() */
+ dw[1] = rs->sample[0];
+
+ /* see sample_pattern_set_gen8_3DSTATE_SAMPLE_PATTERN() */
+ dw[2] = (sample_count >= 4) ? packed[0] : 0;
if (ilo_dev_gen(builder->dev) >= ILO_GEN(7))
- dw[3] = dw3;
+ dw[3] = (sample_count >= 8) ? packed[1] : 0;
}
static inline void
gen8_3DSTATE_MULTISAMPLE(struct ilo_builder *builder,
- int num_samples,
- bool pixel_location_center)
+ const struct ilo_state_raster *rs)
{
const uint8_t cmd_len = 2;
- uint32_t dw1, *dw;
+ uint32_t *dw;
ILO_DEV_ASSERT(builder->dev, 8, 8);
- dw1 = (pixel_location_center) ? GEN6_MULTISAMPLE_DW1_PIXLOC_CENTER :
- GEN6_MULTISAMPLE_DW1_PIXLOC_UL_CORNER;
-
- switch (num_samples) {
- case 0:
- case 1:
- dw1 |= GEN6_MULTISAMPLE_DW1_NUMSAMPLES_1;
- break;
- case 2:
- dw1 |= GEN8_MULTISAMPLE_DW1_NUMSAMPLES_2;
- break;
- case 4:
- dw1 |= GEN6_MULTISAMPLE_DW1_NUMSAMPLES_4;
- break;
- case 8:
- dw1 |= GEN7_MULTISAMPLE_DW1_NUMSAMPLES_8;
- break;
- case 16:
- dw1 |= GEN8_MULTISAMPLE_DW1_NUMSAMPLES_16;
- break;
- default:
- assert(!"unsupported sample count");
- dw1 |= GEN6_MULTISAMPLE_DW1_NUMSAMPLES_1;
- break;
- }
-
ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN8_RENDER_CMD(3D, 3DSTATE_MULTISAMPLE) | (cmd_len - 2);
- dw[1] = dw1;
+ /* see raster_set_gen8_3DSTATE_MULTISAMPLE() */
+ dw[1] = rs->sample[0];
}
static inline void
gen8_3DSTATE_SAMPLE_PATTERN(struct ilo_builder *builder,
- const uint32_t *pattern_1x,
- const uint32_t *pattern_2x,
- const uint32_t *pattern_4x,
- const uint32_t *pattern_8x,
- const uint32_t *pattern_16x)
+ const struct ilo_state_sample_pattern *pattern)
{
const uint8_t cmd_len = 9;
uint32_t *dw;
@@ -966,61 +501,32 @@ gen8_3DSTATE_SAMPLE_PATTERN(struct ilo_builder *builder,
ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN8_RENDER_CMD(3D, 3DSTATE_SAMPLE_PATTERN) | (cmd_len - 2);
- dw[1] = pattern_16x[3];
- dw[2] = pattern_16x[2];
- dw[3] = pattern_16x[1];
- dw[4] = pattern_16x[0];
- dw[5] = pattern_8x[1];
- dw[6] = pattern_8x[0];
- dw[7] = pattern_4x[0];
- dw[8] = pattern_1x[0] << 16 |
- pattern_2x[0];
+ dw[1] = 0;
+ dw[2] = 0;
+ dw[3] = 0;
+ dw[4] = 0;
+ /* see sample_pattern_set_gen8_3DSTATE_SAMPLE_PATTERN() */
+ dw[5] = ((const uint32_t *) pattern->pattern_8x)[1];
+ dw[6] = ((const uint32_t *) pattern->pattern_8x)[0];
+ dw[7] = ((const uint32_t *) pattern->pattern_4x)[0];
+ dw[8] = pattern->pattern_1x[0] << 16 |
+ ((const uint16_t *) pattern->pattern_2x)[0];
}
static inline void
gen6_3DSTATE_SAMPLE_MASK(struct ilo_builder *builder,
- unsigned sample_mask)
+ const struct ilo_state_raster *rs)
{
const uint8_t cmd_len = 2;
- const unsigned valid_mask = 0xf;
uint32_t *dw;
- ILO_DEV_ASSERT(builder->dev, 6, 6);
-
- sample_mask &= valid_mask;
-
- ilo_builder_batch_pointer(builder, cmd_len, &dw);
-
- dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_SAMPLE_MASK) | (cmd_len - 2);
- dw[1] = sample_mask;
-}
-
-static inline void
-gen7_3DSTATE_SAMPLE_MASK(struct ilo_builder *builder,
- unsigned sample_mask,
- int num_samples)
-{
- const uint8_t cmd_len = 2;
- const unsigned valid_mask = ((1 << num_samples) - 1) | 0x1;
- uint32_t *dw;
-
- ILO_DEV_ASSERT(builder->dev, 7, 8);
-
- /*
- * From the Ivy Bridge PRM, volume 2 part 1, page 294:
- *
- * "If Number of Multisamples is NUMSAMPLES_1, bits 7:1 of this field
- * (Sample Mask) must be zero.
- *
- * If Number of Multisamples is NUMSAMPLES_4, bits 7:4 of this field
- * must be zero."
- */
- sample_mask &= valid_mask;
+ ILO_DEV_ASSERT(builder->dev, 6, 8);
ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_SAMPLE_MASK) | (cmd_len - 2);
- dw[1] = sample_mask;
+ /* see raster_set_gen6_3DSTATE_SAMPLE_MASK() */
+ dw[1] = rs->sample[1];
}
static inline void
@@ -1070,95 +576,75 @@ gen6_3DSTATE_DRAWING_RECTANGLE(struct ilo_builder *builder,
static inline void
gen6_3DSTATE_POLY_STIPPLE_OFFSET(struct ilo_builder *builder,
- int x_offset, int y_offset)
+ const struct ilo_state_poly_stipple *stipple)
{
const uint8_t cmd_len = 2;
uint32_t *dw;
ILO_DEV_ASSERT(builder->dev, 6, 8);
- assert(x_offset >= 0 && x_offset <= 31);
- assert(y_offset >= 0 && y_offset <= 31);
-
ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_POLY_STIPPLE_OFFSET) | (cmd_len - 2);
- dw[1] = x_offset << 8 | y_offset;
+ /* constant */
+ dw[1] = 0;
}
static inline void
gen6_3DSTATE_POLY_STIPPLE_PATTERN(struct ilo_builder *builder,
- const struct pipe_poly_stipple *pattern)
+ const struct ilo_state_poly_stipple *stipple)
{
const uint8_t cmd_len = 33;
uint32_t *dw;
- int i;
ILO_DEV_ASSERT(builder->dev, 6, 8);
ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_POLY_STIPPLE_PATTERN) | (cmd_len - 2);
- dw++;
-
- STATIC_ASSERT(Elements(pattern->stipple) == 32);
- for (i = 0; i < 32; i++)
- dw[i] = pattern->stipple[i];
+ /* see poly_stipple_set_gen6_3DSTATE_POLY_STIPPLE_PATTERN() */
+ memcpy(&dw[1], stipple->stipple, sizeof(stipple->stipple));
}
static inline void
gen6_3DSTATE_LINE_STIPPLE(struct ilo_builder *builder,
- unsigned pattern, unsigned factor)
+ const struct ilo_state_line_stipple *stipple)
{
const uint8_t cmd_len = 3;
- unsigned inverse;
uint32_t *dw;
ILO_DEV_ASSERT(builder->dev, 6, 8);
- assert((pattern & 0xffff) == pattern);
- assert(factor >= 1 && factor <= 256);
-
ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_LINE_STIPPLE) | (cmd_len - 2);
- dw[1] = pattern;
-
- if (ilo_dev_gen(builder->dev) >= ILO_GEN(7)) {
- /* in U1.16 */
- inverse = 65536 / factor;
-
- dw[2] = inverse << GEN7_LINE_STIPPLE_DW2_INVERSE_REPEAT_COUNT__SHIFT |
- factor;
- }
- else {
- /* in U1.13 */
- inverse = 8192 / factor;
-
- dw[2] = inverse << GEN6_LINE_STIPPLE_DW2_INVERSE_REPEAT_COUNT__SHIFT |
- factor;
- }
+ /* see line_stipple_set_gen6_3DSTATE_LINE_STIPPLE() */
+ dw[1] = stipple->stipple[0];
+ dw[2] = stipple->stipple[1];
}
static inline void
-gen6_3DSTATE_AA_LINE_PARAMETERS(struct ilo_builder *builder)
+gen6_3DSTATE_AA_LINE_PARAMETERS(struct ilo_builder *builder,
+ const struct ilo_state_raster *rs)
{
const uint8_t cmd_len = 3;
- const uint32_t dw[3] = {
- GEN6_RENDER_CMD(3D, 3DSTATE_AA_LINE_PARAMETERS) | (cmd_len - 2),
- 0 << GEN6_AA_LINE_DW1_BIAS__SHIFT | 0,
- 0 << GEN6_AA_LINE_DW2_CAP_BIAS__SHIFT | 0,
- };
+ uint32_t *dw;
ILO_DEV_ASSERT(builder->dev, 6, 8);
- ilo_builder_batch_write(builder, cmd_len, dw);
+ ilo_builder_batch_pointer(builder, cmd_len, &dw);
+
+ dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_AA_LINE_PARAMETERS) | (cmd_len - 2);
+ /* constant */
+ dw[1] = 0 << GEN6_AA_LINE_DW1_BIAS__SHIFT |
+ 0 << GEN6_AA_LINE_DW1_SLOPE__SHIFT;
+ dw[2] = 0 << GEN6_AA_LINE_DW2_CAP_BIAS__SHIFT |
+ 0 << GEN6_AA_LINE_DW2_CAP_SLOPE__SHIFT;
}
static inline void
gen6_3DSTATE_DEPTH_BUFFER(struct ilo_builder *builder,
- const struct ilo_zs_surface *zs,
- bool aligned_8x4)
+ const struct ilo_state_zs *zs)
{
const uint32_t cmd = (ilo_dev_gen(builder->dev) >= ILO_GEN(7)) ?
GEN7_RENDER_CMD(3D, 3DSTATE_DEPTH_BUFFER) :
@@ -1172,44 +658,49 @@ gen6_3DSTATE_DEPTH_BUFFER(struct ilo_builder *builder,
pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = cmd | (cmd_len - 2);
- dw[1] = zs->payload[0];
- dw[2] = 0;
- /* see ilo_gpe_init_zs_surface() */
+ /*
+ * see zs_set_gen6_3DSTATE_DEPTH_BUFFER() and
+ * zs_set_gen7_3DSTATE_DEPTH_BUFFER()
+ */
if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) {
+ dw[1] = zs->depth[0];
+ dw[2] = 0;
dw[3] = 0;
- dw[4] = (aligned_8x4) ? zs->dw_aligned_8x4 : zs->payload[2];
- dw[5] = zs->payload[3];
- dw[6] = zs->payload[4];
- dw[7] = zs->payload[5];
+ dw[4] = zs->depth[2];
+ dw[5] = zs->depth[3];
+ dw[6] = 0;
+ dw[7] = zs->depth[4];
dw[5] |= builder->mocs << GEN8_DEPTH_DW5_MOCS__SHIFT;
- if (zs->bo) {
- ilo_builder_batch_reloc64(builder, pos + 2, zs->bo,
- zs->payload[1], INTEL_RELOC_WRITE);
+ if (zs->depth_bo) {
+ ilo_builder_batch_reloc64(builder, pos + 2, zs->depth_bo,
+ zs->depth[1], (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE);
}
} else {
- dw[3] = (aligned_8x4) ? zs->dw_aligned_8x4 : zs->payload[2];
- dw[4] = zs->payload[3];
- dw[5] = zs->payload[4];
- dw[6] = zs->payload[5];
+ dw[1] = zs->depth[0];
+ dw[2] = 0;
+ dw[3] = zs->depth[2];
+ dw[4] = zs->depth[3];
+ dw[5] = 0;
+ dw[6] = zs->depth[4];
if (ilo_dev_gen(builder->dev) >= ILO_GEN(7))
dw[4] |= builder->mocs << GEN7_DEPTH_DW4_MOCS__SHIFT;
else
dw[6] |= builder->mocs << GEN6_DEPTH_DW6_MOCS__SHIFT;
- if (zs->bo) {
- ilo_builder_batch_reloc(builder, pos + 2, zs->bo,
- zs->payload[1], INTEL_RELOC_WRITE);
+ if (zs->depth_bo) {
+ ilo_builder_batch_reloc(builder, pos + 2, zs->depth_bo,
+ zs->depth[1], (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE);
}
}
}
static inline void
gen6_3DSTATE_STENCIL_BUFFER(struct ilo_builder *builder,
- const struct ilo_zs_surface *zs)
+ const struct ilo_state_zs *zs)
{
const uint32_t cmd = (ilo_dev_gen(builder->dev) >= ILO_GEN(7)) ?
GEN7_RENDER_CMD(3D, 3DSTATE_STENCIL_BUFFER) :
@@ -1223,33 +714,36 @@ gen6_3DSTATE_STENCIL_BUFFER(struct ilo_builder *builder,
pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = cmd | (cmd_len - 2);
- /* see ilo_gpe_init_zs_surface() */
- dw[1] = zs->payload[6];
- dw[2] = 0;
+ /* see zs_set_gen6_3DSTATE_STENCIL_BUFFER() */
if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) {
- dw[1] |= builder->mocs << GEN8_STENCIL_DW1_MOCS__SHIFT;
-
+ dw[1] = zs->stencil[0];
+ dw[2] = 0;
dw[3] = 0;
- dw[4] = zs->payload[8];
+ dw[4] = zs->stencil[2];
- if (zs->separate_s8_bo) {
- ilo_builder_batch_reloc64(builder, pos + 2,
- zs->separate_s8_bo, zs->payload[7], INTEL_RELOC_WRITE);
+ dw[1] |= builder->mocs << GEN8_STENCIL_DW1_MOCS__SHIFT;
+
+ if (zs->stencil_bo) {
+ ilo_builder_batch_reloc64(builder, pos + 2, zs->stencil_bo,
+ zs->stencil[1], (zs->s_readonly) ? 0 : INTEL_RELOC_WRITE);
}
} else {
+ dw[1] = zs->stencil[0];
+ dw[2] = 0;
+
dw[1] |= builder->mocs << GEN6_STENCIL_DW1_MOCS__SHIFT;
- if (zs->separate_s8_bo) {
- ilo_builder_batch_reloc(builder, pos + 2,
- zs->separate_s8_bo, zs->payload[7], INTEL_RELOC_WRITE);
+ if (zs->stencil_bo) {
+ ilo_builder_batch_reloc(builder, pos + 2, zs->stencil_bo,
+ zs->stencil[1], (zs->s_readonly) ? 0 : INTEL_RELOC_WRITE);
}
}
}
static inline void
gen6_3DSTATE_HIER_DEPTH_BUFFER(struct ilo_builder *builder,
- const struct ilo_zs_surface *zs)
+ const struct ilo_state_zs *zs)
{
const uint32_t cmd = (ilo_dev_gen(builder->dev) >= ILO_GEN(7)) ?
GEN7_RENDER_CMD(3D, 3DSTATE_HIER_DEPTH_BUFFER) :
@@ -1263,26 +757,29 @@ gen6_3DSTATE_HIER_DEPTH_BUFFER(struct ilo_builder *builder,
pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = cmd | (cmd_len - 2);
- /* see ilo_gpe_init_zs_surface() */
- dw[1] = zs->payload[9];
- dw[2] = 0;
+ /* see zs_set_gen6_3DSTATE_HIER_DEPTH_BUFFER() */
if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) {
- dw[1] |= builder->mocs << GEN8_HIZ_DW1_MOCS__SHIFT;
-
+ dw[1] = zs->hiz[0];
+ dw[2] = 0;
dw[3] = 0;
- dw[4] = zs->payload[11];
+ dw[4] = zs->hiz[2];
+
+ dw[1] |= builder->mocs << GEN8_HIZ_DW1_MOCS__SHIFT;
if (zs->hiz_bo) {
- ilo_builder_batch_reloc64(builder, pos + 2,
- zs->hiz_bo, zs->payload[10], INTEL_RELOC_WRITE);
+ ilo_builder_batch_reloc64(builder, pos + 2, zs->hiz_bo,
+ zs->hiz[1], (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE);
}
} else {
+ dw[1] = zs->hiz[0];
+ dw[2] = 0;
+
dw[1] |= builder->mocs << GEN6_HIZ_DW1_MOCS__SHIFT;
if (zs->hiz_bo) {
- ilo_builder_batch_reloc(builder, pos + 2,
- zs->hiz_bo, zs->payload[10], INTEL_RELOC_WRITE);
+ ilo_builder_batch_reloc(builder, pos + 2, zs->hiz_bo,
+ zs->hiz[1], (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE);
}
}
}
@@ -1440,34 +937,24 @@ gen7_3DSTATE_BLEND_STATE_POINTERS(struct ilo_builder *builder,
static inline uint32_t
gen6_CLIP_VIEWPORT(struct ilo_builder *builder,
- const struct ilo_viewport_cso *viewports,
- unsigned num_viewports)
+ const struct ilo_state_viewport *vp)
{
const int state_align = 32;
- const int state_len = 4 * num_viewports;
+ const int state_len = 4 * vp->count;
uint32_t state_offset, *dw;
- unsigned i;
+ int i;
ILO_DEV_ASSERT(builder->dev, 6, 6);
- /*
- * From the Sandy Bridge PRM, volume 2 part 1, page 193:
- *
- * "The viewport-related state is stored as an array of up to 16
- * elements..."
- */
- assert(num_viewports && num_viewports <= 16);
-
state_offset = ilo_builder_dynamic_pointer(builder,
ILO_BUILDER_ITEM_CLIP_VIEWPORT, state_align, state_len, &dw);
- for (i = 0; i < num_viewports; i++) {
- const struct ilo_viewport_cso *vp = &viewports[i];
-
- dw[0] = fui(vp->min_gbx);
- dw[1] = fui(vp->max_gbx);
- dw[2] = fui(vp->min_gby);
- dw[3] = fui(vp->max_gby);
+ for (i = 0; i < vp->count; i++) {
+ /* see viewport_matrix_set_gen7_SF_CLIP_VIEWPORT() */
+ dw[0] = vp->sf_clip[i][8];
+ dw[1] = vp->sf_clip[i][9];
+ dw[2] = vp->sf_clip[i][10];
+ dw[3] = vp->sf_clip[i][11];
dw += 4;
}
@@ -1477,38 +964,21 @@ gen6_CLIP_VIEWPORT(struct ilo_builder *builder,
static inline uint32_t
gen6_SF_VIEWPORT(struct ilo_builder *builder,
- const struct ilo_viewport_cso *viewports,
- unsigned num_viewports)
+ const struct ilo_state_viewport *vp)
{
const int state_align = 32;
- const int state_len = 8 * num_viewports;
+ const int state_len = 8 * vp->count;
uint32_t state_offset, *dw;
- unsigned i;
+ int i;
ILO_DEV_ASSERT(builder->dev, 6, 6);
- /*
- * From the Sandy Bridge PRM, volume 2 part 1, page 262:
- *
- * "The viewport-specific state used by the SF unit (SF_VIEWPORT) is
- * stored as an array of up to 16 elements..."
- */
- assert(num_viewports && num_viewports <= 16);
-
state_offset = ilo_builder_dynamic_pointer(builder,
ILO_BUILDER_ITEM_SF_VIEWPORT, state_align, state_len, &dw);
- for (i = 0; i < num_viewports; i++) {
- const struct ilo_viewport_cso *vp = &viewports[i];
-
- dw[0] = fui(vp->m00);
- dw[1] = fui(vp->m11);
- dw[2] = fui(vp->m22);
- dw[3] = fui(vp->m30);
- dw[4] = fui(vp->m31);
- dw[5] = fui(vp->m32);
- dw[6] = 0;
- dw[7] = 0;
+ for (i = 0; i < vp->count; i++) {
+ /* see viewport_matrix_set_gen7_SF_CLIP_VIEWPORT() */
+ memcpy(dw, vp->sf_clip[i], sizeof(*dw) * 8);
dw += 8;
}
@@ -1518,298 +988,103 @@ gen6_SF_VIEWPORT(struct ilo_builder *builder,
static inline uint32_t
gen7_SF_CLIP_VIEWPORT(struct ilo_builder *builder,
- const struct ilo_viewport_cso *viewports,
- unsigned num_viewports)
+ const struct ilo_state_viewport *vp)
{
const int state_align = 64;
- const int state_len = 16 * num_viewports;
- uint32_t state_offset, *dw;
- unsigned i;
+ const int state_len = 16 * vp->count;
ILO_DEV_ASSERT(builder->dev, 7, 8);
- /*
- * From the Ivy Bridge PRM, volume 2 part 1, page 270:
- *
- * "The viewport-specific state used by both the SF and CL units
- * (SF_CLIP_VIEWPORT) is stored as an array of up to 16 elements, each
- * of which contains the DWords described below. The start of each
- * element is spaced 16 DWords apart. The location of first element of
- * the array, as specified by both Pointer to SF_VIEWPORT and Pointer
- * to CLIP_VIEWPORT, is aligned to a 64-byte boundary."
- */
- assert(num_viewports && num_viewports <= 16);
-
- state_offset = ilo_builder_dynamic_pointer(builder,
- ILO_BUILDER_ITEM_SF_VIEWPORT, state_align, state_len, &dw);
-
- for (i = 0; i < num_viewports; i++) {
- const struct ilo_viewport_cso *vp = &viewports[i];
-
- dw[0] = fui(vp->m00);
- dw[1] = fui(vp->m11);
- dw[2] = fui(vp->m22);
- dw[3] = fui(vp->m30);
- dw[4] = fui(vp->m31);
- dw[5] = fui(vp->m32);
- dw[6] = 0;
- dw[7] = 0;
-
- dw[8] = fui(vp->min_gbx);
- dw[9] = fui(vp->max_gbx);
- dw[10] = fui(vp->min_gby);
- dw[11] = fui(vp->max_gby);
-
- if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) {
- dw[12] = fui(vp->min_x);
- dw[13] = fui(vp->max_x - 1.0f);
- dw[14] = fui(vp->min_y);
- dw[15] = fui(vp->max_y - 1.0f);
- } else {
- dw[12] = 0;
- dw[13] = 0;
- dw[14] = 0;
- dw[15] = 0;
- }
-
- dw += 16;
- }
-
- return state_offset;
+ /* see viewport_matrix_set_gen7_SF_CLIP_VIEWPORT() */
+ return ilo_builder_dynamic_write(builder, ILO_BUILDER_ITEM_SF_VIEWPORT,
+ state_align, state_len, (const uint32_t *) vp->sf_clip);
}
static inline uint32_t
gen6_CC_VIEWPORT(struct ilo_builder *builder,
- const struct ilo_viewport_cso *viewports,
- unsigned num_viewports)
+ const struct ilo_state_viewport *vp)
{
const int state_align = 32;
- const int state_len = 2 * num_viewports;
- uint32_t state_offset, *dw;
- unsigned i;
+ const int state_len = 2 * vp->count;
ILO_DEV_ASSERT(builder->dev, 6, 8);
- /*
- * From the Sandy Bridge PRM, volume 2 part 1, page 385:
- *
- * "The viewport state is stored as an array of up to 16 elements..."
- */
- assert(num_viewports && num_viewports <= 16);
-
- state_offset = ilo_builder_dynamic_pointer(builder,
- ILO_BUILDER_ITEM_CC_VIEWPORT, state_align, state_len, &dw);
-
- for (i = 0; i < num_viewports; i++) {
- const struct ilo_viewport_cso *vp = &viewports[i];
-
- dw[0] = fui(vp->min_z);
- dw[1] = fui(vp->max_z);
-
- dw += 2;
- }
-
- return state_offset;
+ /* see viewport_matrix_set_gen6_CC_VIEWPORT() */
+ return ilo_builder_dynamic_write(builder, ILO_BUILDER_ITEM_CC_VIEWPORT,
+ state_align, state_len, (const uint32_t *) vp->cc);
}
static inline uint32_t
gen6_SCISSOR_RECT(struct ilo_builder *builder,
- const struct ilo_scissor_state *scissor,
- unsigned num_viewports)
+ const struct ilo_state_viewport *vp)
{
const int state_align = 32;
- const int state_len = 2 * num_viewports;
+ const int state_len = 2 * vp->count;
ILO_DEV_ASSERT(builder->dev, 6, 8);
- /*
- * From the Sandy Bridge PRM, volume 2 part 1, page 263:
- *
- * "The viewport-specific state used by the SF unit (SCISSOR_RECT) is
- * stored as an array of up to 16 elements..."
- */
- assert(num_viewports && num_viewports <= 16);
- assert(Elements(scissor->payload) >= state_len);
-
+ /* see viewport_scissor_set_gen6_SCISSOR_RECT() */
return ilo_builder_dynamic_write(builder, ILO_BUILDER_ITEM_SCISSOR_RECT,
- state_align, state_len, scissor->payload);
+ state_align, state_len, (const uint32_t *) vp->scissor);
}
static inline uint32_t
gen6_COLOR_CALC_STATE(struct ilo_builder *builder,
- const struct pipe_stencil_ref *stencil_ref,
- ubyte alpha_ref,
- const struct pipe_blend_color *blend_color)
+ const struct ilo_state_cc *cc)
{
const int state_align = 64;
const int state_len = 6;
- uint32_t state_offset, *dw;
ILO_DEV_ASSERT(builder->dev, 6, 8);
- state_offset = ilo_builder_dynamic_pointer(builder,
- ILO_BUILDER_ITEM_COLOR_CALC, state_align, state_len, &dw);
-
- dw[0] = stencil_ref->ref_value[0] << 24 |
- stencil_ref->ref_value[1] << 16 |
- GEN6_CC_DW0_ALPHATEST_UNORM8;
- dw[1] = alpha_ref;
- dw[2] = fui(blend_color->color[0]);
- dw[3] = fui(blend_color->color[1]);
- dw[4] = fui(blend_color->color[2]);
- dw[5] = fui(blend_color->color[3]);
-
- return state_offset;
+ /* see cc_params_set_gen6_COLOR_CALC_STATE() */
+ return ilo_builder_dynamic_write(builder, ILO_BUILDER_ITEM_COLOR_CALC,
+ state_align, state_len, cc->cc);
}
static inline uint32_t
gen6_DEPTH_STENCIL_STATE(struct ilo_builder *builder,
- const struct ilo_dsa_state *dsa)
+ const struct ilo_state_cc *cc)
{
const int state_align = 64;
const int state_len = 3;
ILO_DEV_ASSERT(builder->dev, 6, 7.5);
- STATIC_ASSERT(Elements(dsa->payload) >= state_len);
-
+ /* see cc_set_gen6_DEPTH_STENCIL_STATE() */
return ilo_builder_dynamic_write(builder, ILO_BUILDER_ITEM_DEPTH_STENCIL,
- state_align, state_len, dsa->payload);
+ state_align, state_len, cc->ds);
}
static inline uint32_t
gen6_BLEND_STATE(struct ilo_builder *builder,
- const struct ilo_blend_state *blend,
- const struct ilo_fb_state *fb,
- const struct ilo_dsa_state *dsa)
+ const struct ilo_state_cc *cc)
{
const int state_align = 64;
- int state_len;
- uint32_t state_offset, *dw;
- unsigned num_targets, i;
+ const int state_len = 2 * cc->blend_state_count;
ILO_DEV_ASSERT(builder->dev, 6, 7.5);
- /*
- * From the Sandy Bridge PRM, volume 2 part 1, page 376:
- *
- * "The blend state is stored as an array of up to 8 elements..."
- */
- num_targets = fb->state.nr_cbufs;
- assert(num_targets <= 8);
-
- if (!num_targets) {
- if (!dsa->dw_blend_alpha)
- return 0;
- /* to be able to reference alpha func */
- num_targets = 1;
- }
-
- state_len = 2 * num_targets;
-
- state_offset = ilo_builder_dynamic_pointer(builder,
- ILO_BUILDER_ITEM_BLEND, state_align, state_len, &dw);
-
- for (i = 0; i < num_targets; i++) {
- const struct ilo_blend_cso *cso = &blend->cso[i];
-
- dw[0] = cso->payload[0];
- dw[1] = cso->payload[1] | blend->dw_shared;
-
- if (i < fb->state.nr_cbufs && fb->state.cbufs[i]) {
- const struct ilo_fb_blend_caps *caps = &fb->blend_caps[i];
-
- if (caps->can_blend) {
- if (caps->dst_alpha_forced_one)
- dw[0] |= cso->dw_blend_dst_alpha_forced_one;
- else
- dw[0] |= cso->dw_blend;
- }
-
- if (caps->can_logicop)
- dw[1] |= blend->dw_logicop;
-
- if (caps->can_alpha_test)
- dw[1] |= dsa->dw_blend_alpha;
- } else {
- dw[1] |= GEN6_RT_DW1_WRITE_DISABLE_A |
- GEN6_RT_DW1_WRITE_DISABLE_R |
- GEN6_RT_DW1_WRITE_DISABLE_G |
- GEN6_RT_DW1_WRITE_DISABLE_B |
- dsa->dw_blend_alpha;
- }
+ if (!state_len)
+ return 0;
- /*
- * From the Sandy Bridge PRM, volume 2 part 1, page 356:
- *
- * "When NumSamples = 1, AlphaToCoverage and AlphaToCoverage
- * Dither both must be disabled."
- *
- * There is no such limitation on GEN7, or for AlphaToOne. But GL
- * requires that anyway.
- */
- if (fb->num_samples > 1)
- dw[1] |= blend->dw_alpha_mod;
-
- dw += 2;
- }
-
- return state_offset;
+ /* see cc_set_gen6_BLEND_STATE() */
+ return ilo_builder_dynamic_write(builder, ILO_BUILDER_ITEM_BLEND,
+ state_align, state_len, cc->blend);
}
static inline uint32_t
gen8_BLEND_STATE(struct ilo_builder *builder,
- const struct ilo_blend_state *blend,
- const struct ilo_fb_state *fb,
- const struct ilo_dsa_state *dsa)
+ const struct ilo_state_cc *cc)
{
const int state_align = 64;
- const int state_len = 1 + 2 * fb->state.nr_cbufs;
- uint32_t state_offset, *dw;
- unsigned i;
+ const int state_len = 1 + 2 * cc->blend_state_count;
ILO_DEV_ASSERT(builder->dev, 8, 8);
- assert(fb->state.nr_cbufs <= 8);
-
- state_offset = ilo_builder_dynamic_pointer(builder,
- ILO_BUILDER_ITEM_BLEND, state_align, state_len, &dw);
-
- dw[0] = blend->dw_shared;
- if (fb->num_samples > 1)
- dw[0] |= blend->dw_alpha_mod;
- if (!fb->state.nr_cbufs || fb->blend_caps[0].can_alpha_test)
- dw[0] |= dsa->dw_blend_alpha;
- dw++;
-
- for (i = 0; i < fb->state.nr_cbufs; i++) {
- const struct ilo_fb_blend_caps *caps = &fb->blend_caps[i];
- const struct ilo_blend_cso *cso = &blend->cso[i];
-
- dw[0] = cso->payload[0];
- dw[1] = cso->payload[1];
-
- if (fb->state.cbufs[i]) {
- if (caps->can_blend) {
- if (caps->dst_alpha_forced_one)
- dw[0] |= cso->dw_blend_dst_alpha_forced_one;
- else
- dw[0] |= cso->dw_blend;
- }
-
- if (caps->can_logicop)
- dw[1] |= blend->dw_logicop;
- } else {
- dw[0] |= GEN8_RT_DW0_WRITE_DISABLE_A |
- GEN8_RT_DW0_WRITE_DISABLE_R |
- GEN8_RT_DW0_WRITE_DISABLE_G |
- GEN8_RT_DW0_WRITE_DISABLE_B;
- }
-
- dw += 2;
- }
-
- return state_offset;
+ /* see cc_set_gen8_BLEND_STATE() */
+ return ilo_builder_dynamic_write(builder, ILO_BUILDER_ITEM_BLEND,
+ state_align, state_len, &cc->blend[1]);
}
#endif /* ILO_BUILDER_3D_BOTTOM_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_builder_3d_top.h b/src/gallium/drivers/ilo/core/ilo_builder_3d_top.h
index 05dbce7c905..8d30095e6f6 100644
--- a/src/gallium/drivers/ilo/core/ilo_builder_3d_top.h
+++ b/src/gallium/drivers/ilo/core/ilo_builder_3d_top.h
@@ -29,303 +29,167 @@
#define ILO_BUILDER_3D_TOP_H
#include "genhw/genhw.h"
-#include "../ilo_resource.h"
-#include "../ilo_shader.h"
#include "intel_winsys.h"
#include "ilo_core.h"
#include "ilo_dev.h"
-#include "ilo_state_3d.h"
+#include "ilo_state_sampler.h"
+#include "ilo_state_shader.h"
+#include "ilo_state_sol.h"
+#include "ilo_state_surface.h"
+#include "ilo_state_urb.h"
+#include "ilo_state_vf.h"
#include "ilo_builder.h"
static inline void
gen6_3DSTATE_URB(struct ilo_builder *builder,
- int vs_total_size, int gs_total_size,
- int vs_entry_size, int gs_entry_size)
+ const struct ilo_state_urb *urb)
{
const uint8_t cmd_len = 3;
- const int row_size = 128; /* 1024 bits */
- int vs_alloc_size, gs_alloc_size;
- int vs_num_entries, gs_num_entries;
uint32_t *dw;
- ILO_DEV_ASSERT(builder->dev, 6, 6);
-
- /* in 1024-bit URB rows */
- vs_alloc_size = (vs_entry_size + row_size - 1) / row_size;
- gs_alloc_size = (gs_entry_size + row_size - 1) / row_size;
-
- /* the valid range is [1, 5] */
- if (!vs_alloc_size)
- vs_alloc_size = 1;
- if (!gs_alloc_size)
- gs_alloc_size = 1;
- assert(vs_alloc_size <= 5 && gs_alloc_size <= 5);
-
- /* the valid range is [24, 256] in multiples of 4 */
- vs_num_entries = (vs_total_size / row_size / vs_alloc_size) & ~3;
- if (vs_num_entries > 256)
- vs_num_entries = 256;
- assert(vs_num_entries >= 24);
-
- /* the valid range is [0, 256] in multiples of 4 */
- gs_num_entries = (gs_total_size / row_size / gs_alloc_size) & ~3;
- if (gs_num_entries > 256)
- gs_num_entries = 256;
-
ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_URB) | (cmd_len - 2);
- dw[1] = (vs_alloc_size - 1) << GEN6_URB_DW1_VS_ENTRY_SIZE__SHIFT |
- vs_num_entries << GEN6_URB_DW1_VS_ENTRY_COUNT__SHIFT;
- dw[2] = gs_num_entries << GEN6_URB_DW2_GS_ENTRY_COUNT__SHIFT |
- (gs_alloc_size - 1) << GEN6_URB_DW2_GS_ENTRY_SIZE__SHIFT;
+ /* see urb_set_gen6_3DSTATE_URB() */
+ dw[1] = urb->urb[0];
+ dw[2] = urb->urb[1];
}
static inline void
-gen7_3dstate_push_constant_alloc(struct ilo_builder *builder,
- int subop, int offset, int size)
+gen7_3DSTATE_PUSH_CONSTANT_ALLOC_VS(struct ilo_builder *builder,
+ const struct ilo_state_urb *urb)
{
- const uint32_t cmd = GEN6_RENDER_TYPE_RENDER |
- GEN6_RENDER_SUBTYPE_3D |
- subop;
const uint8_t cmd_len = 2;
- const int slice_count = ((ilo_dev_gen(builder->dev) == ILO_GEN(7.5) &&
- builder->dev->gt == 3) ||
- ilo_dev_gen(builder->dev) >= ILO_GEN(8)) ? 2 : 1;
uint32_t *dw;
- int end;
-
- ILO_DEV_ASSERT(builder->dev, 7, 8);
-
- /* VS, HS, DS, GS, and PS variants */
- assert(subop >= GEN7_RENDER_OPCODE_3DSTATE_PUSH_CONSTANT_ALLOC_VS &&
- subop <= GEN7_RENDER_OPCODE_3DSTATE_PUSH_CONSTANT_ALLOC_PS);
-
- /*
- * From the Ivy Bridge PRM, volume 2 part 1, page 68:
- *
- * "(A table that says the maximum size of each constant buffer is
- * 16KB")
- *
- * From the Ivy Bridge PRM, volume 2 part 1, page 115:
- *
- * "The sum of the Constant Buffer Offset and the Constant Buffer Size
- * may not exceed the maximum value of the Constant Buffer Size."
- *
- * Thus, the valid range of buffer end is [0KB, 16KB].
- */
- end = (offset + size) / 1024;
- if (end > 16 * slice_count) {
- assert(!"invalid constant buffer end");
- end = 16 * slice_count;
- }
-
- /* the valid range of buffer offset is [0KB, 15KB] */
- offset = (offset + 1023) / 1024;
- if (offset > 15 * slice_count) {
- assert(!"invalid constant buffer offset");
- offset = 15 * slice_count;
- }
-
- if (offset > end) {
- assert(!size);
- offset = end;
- }
-
- /* the valid range of buffer size is [0KB, 15KB] */
- size = end - offset;
- if (size > 15 * slice_count) {
- assert(!"invalid constant buffer size");
- size = 15 * slice_count;
- }
-
- assert(offset % slice_count == 0 && size % slice_count == 0);
ilo_builder_batch_pointer(builder, cmd_len, &dw);
- dw[0] = cmd | (cmd_len - 2);
- dw[1] = offset << GEN7_PCB_ALLOC_DW1_OFFSET__SHIFT |
- size;
-}
-
-static inline void
-gen7_3DSTATE_PUSH_CONSTANT_ALLOC_VS(struct ilo_builder *builder,
- int offset, int size)
-{
- gen7_3dstate_push_constant_alloc(builder,
- GEN7_RENDER_OPCODE_3DSTATE_PUSH_CONSTANT_ALLOC_VS, offset, size);
+ dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_PUSH_CONSTANT_ALLOC_VS) |
+ (cmd_len - 2);
+ /* see urb_set_gen7_3dstate_push_constant_alloc() */
+ dw[1] = urb->pcb[0];
}
static inline void
gen7_3DSTATE_PUSH_CONSTANT_ALLOC_HS(struct ilo_builder *builder,
- int offset, int size)
+ const struct ilo_state_urb *urb)
{
- gen7_3dstate_push_constant_alloc(builder,
- GEN7_RENDER_OPCODE_3DSTATE_PUSH_CONSTANT_ALLOC_HS, offset, size);
+ const uint8_t cmd_len = 2;
+ uint32_t *dw;
+
+ ilo_builder_batch_pointer(builder, cmd_len, &dw);
+
+ dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_PUSH_CONSTANT_ALLOC_HS) |
+ (cmd_len - 2);
+ /* see urb_set_gen7_3dstate_push_constant_alloc() */
+ dw[1] = urb->pcb[1];
}
static inline void
gen7_3DSTATE_PUSH_CONSTANT_ALLOC_DS(struct ilo_builder *builder,
- int offset, int size)
+ const struct ilo_state_urb *urb)
{
- gen7_3dstate_push_constant_alloc(builder,
- GEN7_RENDER_OPCODE_3DSTATE_PUSH_CONSTANT_ALLOC_DS, offset, size);
+ const uint8_t cmd_len = 2;
+ uint32_t *dw;
+
+ ilo_builder_batch_pointer(builder, cmd_len, &dw);
+
+ dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_PUSH_CONSTANT_ALLOC_DS) |
+ (cmd_len - 2);
+ /* see urb_set_gen7_3dstate_push_constant_alloc() */
+ dw[1] = urb->pcb[2];
}
static inline void
gen7_3DSTATE_PUSH_CONSTANT_ALLOC_GS(struct ilo_builder *builder,
- int offset, int size)
+ const struct ilo_state_urb *urb)
{
- gen7_3dstate_push_constant_alloc(builder,
- GEN7_RENDER_OPCODE_3DSTATE_PUSH_CONSTANT_ALLOC_GS, offset, size);
-}
+ const uint8_t cmd_len = 2;
+ uint32_t *dw;
-static inline void
-gen7_3DSTATE_PUSH_CONSTANT_ALLOC_PS(struct ilo_builder *builder,
- int offset, int size)
-{
- gen7_3dstate_push_constant_alloc(builder,
- GEN7_RENDER_OPCODE_3DSTATE_PUSH_CONSTANT_ALLOC_PS, offset, size);
+ ilo_builder_batch_pointer(builder, cmd_len, &dw);
+
+ dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_PUSH_CONSTANT_ALLOC_GS) |
+ (cmd_len - 2);
+ /* see urb_set_gen7_3dstate_push_constant_alloc() */
+ dw[1] = urb->pcb[3];
}
static inline void
-gen7_3dstate_urb(struct ilo_builder *builder,
- int subop, int offset, int size,
- int entry_size)
+gen7_3DSTATE_PUSH_CONSTANT_ALLOC_PS(struct ilo_builder *builder,
+ const struct ilo_state_urb *urb)
{
- const uint32_t cmd = GEN6_RENDER_TYPE_RENDER |
- GEN6_RENDER_SUBTYPE_3D |
- subop;
const uint8_t cmd_len = 2;
- const int row_size = 64; /* 512 bits */
- int alloc_size, num_entries, min_entries, max_entries;
uint32_t *dw;
- ILO_DEV_ASSERT(builder->dev, 7, 8);
-
- /* VS, HS, DS, and GS variants */
- assert(subop >= GEN7_RENDER_OPCODE_3DSTATE_URB_VS &&
- subop <= GEN7_RENDER_OPCODE_3DSTATE_URB_GS);
-
- /* in multiples of 8KB */
- assert(offset % 8192 == 0);
- offset /= 8192;
-
- /* in multiple of 512-bit rows */
- alloc_size = (entry_size + row_size - 1) / row_size;
- if (!alloc_size)
- alloc_size = 1;
-
- /*
- * From the Ivy Bridge PRM, volume 2 part 1, page 34:
- *
- * "VS URB Entry Allocation Size equal to 4(5 512-bit URB rows) may
- * cause performance to decrease due to banking in the URB. Element
- * sizes of 16 to 20 should be programmed with six 512-bit URB rows."
- */
- if (subop == GEN7_RENDER_OPCODE_3DSTATE_URB_VS && alloc_size == 5)
- alloc_size = 6;
-
- /* in multiples of 8 */
- num_entries = (size / row_size / alloc_size) & ~7;
-
- switch (subop) {
- case GEN7_RENDER_OPCODE_3DSTATE_URB_VS:
- switch (ilo_dev_gen(builder->dev)) {
- case ILO_GEN(8):
- max_entries = 2560;
- min_entries = 64;
- break;
- case ILO_GEN(7.5):
- max_entries = (builder->dev->gt >= 2) ? 1664 : 640;
- min_entries = (builder->dev->gt >= 2) ? 64 : 32;
- break;
- case ILO_GEN(7):
- default:
- max_entries = (builder->dev->gt == 2) ? 704 : 512;
- min_entries = 32;
- break;
- }
-
- assert(num_entries >= min_entries);
- if (num_entries > max_entries)
- num_entries = max_entries;
- break;
- case GEN7_RENDER_OPCODE_3DSTATE_URB_HS:
- max_entries = (builder->dev->gt == 2) ? 64 : 32;
- if (num_entries > max_entries)
- num_entries = max_entries;
- break;
- case GEN7_RENDER_OPCODE_3DSTATE_URB_DS:
- if (num_entries)
- assert(num_entries >= 138);
- break;
- case GEN7_RENDER_OPCODE_3DSTATE_URB_GS:
- switch (ilo_dev_gen(builder->dev)) {
- case ILO_GEN(8):
- max_entries = 960;
- break;
- case ILO_GEN(7.5):
- max_entries = (builder->dev->gt >= 2) ? 640 : 256;
- break;
- case ILO_GEN(7):
- default:
- max_entries = (builder->dev->gt == 2) ? 320 : 192;
- break;
- }
-
- if (num_entries > max_entries)
- num_entries = max_entries;
- break;
- default:
- break;
- }
-
ilo_builder_batch_pointer(builder, cmd_len, &dw);
- dw[0] = cmd | (cmd_len - 2);
- dw[1] = offset << GEN7_URB_DW1_OFFSET__SHIFT |
- (alloc_size - 1) << GEN7_URB_DW1_ENTRY_SIZE__SHIFT |
- num_entries;
+ dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_PUSH_CONSTANT_ALLOC_PS) |
+ (cmd_len - 2);
+ /* see urb_set_gen7_3dstate_push_constant_alloc() */
+ dw[1] = urb->pcb[4];
}
static inline void
gen7_3DSTATE_URB_VS(struct ilo_builder *builder,
- int offset, int size, int entry_size)
+ const struct ilo_state_urb *urb)
{
- gen7_3dstate_urb(builder, GEN7_RENDER_OPCODE_3DSTATE_URB_VS,
- offset, size, entry_size);
+ const uint8_t cmd_len = 2;
+ uint32_t *dw;
+
+ ilo_builder_batch_pointer(builder, cmd_len, &dw);
+
+ dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_URB_VS) | (cmd_len - 2);
+ /* see urb_set_gen7_3dstate_push_constant_alloc() */
+ dw[1] = urb->urb[0];
}
static inline void
gen7_3DSTATE_URB_HS(struct ilo_builder *builder,
- int offset, int size, int entry_size)
+ const struct ilo_state_urb *urb)
{
- gen7_3dstate_urb(builder, GEN7_RENDER_OPCODE_3DSTATE_URB_HS,
- offset, size, entry_size);
+ const uint8_t cmd_len = 2;
+ uint32_t *dw;
+
+ ilo_builder_batch_pointer(builder, cmd_len, &dw);
+
+ dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_URB_HS) | (cmd_len - 2);
+ /* see urb_set_gen7_3dstate_push_constant_alloc() */
+ dw[1] = urb->urb[1];
}
static inline void
gen7_3DSTATE_URB_DS(struct ilo_builder *builder,
- int offset, int size, int entry_size)
+ const struct ilo_state_urb *urb)
{
- gen7_3dstate_urb(builder, GEN7_RENDER_OPCODE_3DSTATE_URB_DS,
- offset, size, entry_size);
+ const uint8_t cmd_len = 2;
+ uint32_t *dw;
+
+ ilo_builder_batch_pointer(builder, cmd_len, &dw);
+
+ dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_URB_DS) | (cmd_len - 2);
+ /* see urb_set_gen7_3dstate_push_constant_alloc() */
+ dw[1] = urb->urb[2];
}
static inline void
gen7_3DSTATE_URB_GS(struct ilo_builder *builder,
- int offset, int size, int entry_size)
+ const struct ilo_state_urb *urb)
{
- gen7_3dstate_urb(builder, GEN7_RENDER_OPCODE_3DSTATE_URB_GS,
- offset, size, entry_size);
+ const uint8_t cmd_len = 2;
+ uint32_t *dw;
+
+ ilo_builder_batch_pointer(builder, cmd_len, &dw);
+
+ dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_URB_GS) | (cmd_len - 2);
+ /* see urb_set_gen7_3dstate_push_constant_alloc() */
+ dw[1] = urb->urb[3];
}
static inline void
gen75_3DSTATE_VF(struct ilo_builder *builder,
- bool enable_cut_index,
- uint32_t cut_index)
+ const struct ilo_state_vf *vf)
{
const uint8_t cmd_len = 2;
uint32_t *dw;
@@ -334,11 +198,10 @@ gen75_3DSTATE_VF(struct ilo_builder *builder,
ilo_builder_batch_pointer(builder, cmd_len, &dw);
- dw[0] = GEN75_RENDER_CMD(3D, 3DSTATE_VF) | (cmd_len - 2);
- if (enable_cut_index)
- dw[0] |= GEN75_VF_DW0_CUT_INDEX_ENABLE;
-
- dw[1] = cut_index;
+ /* see vf_params_set_gen75_3DSTATE_VF() */
+ dw[0] = GEN75_RENDER_CMD(3D, 3DSTATE_VF) | (cmd_len - 2) |
+ vf->cut[0];
+ dw[1] = vf->cut[1];
}
static inline void
@@ -354,40 +217,11 @@ gen6_3DSTATE_VF_STATISTICS(struct ilo_builder *builder,
ilo_builder_batch_write(builder, cmd_len, &dw0);
}
-/**
- * Translate a pipe primitive type to the matching hardware primitive type.
- */
-static inline int
-gen6_3d_translate_pipe_prim(unsigned prim)
-{
- static const int prim_mapping[ILO_PRIM_MAX] = {
- [PIPE_PRIM_POINTS] = GEN6_3DPRIM_POINTLIST,
- [PIPE_PRIM_LINES] = GEN6_3DPRIM_LINELIST,
- [PIPE_PRIM_LINE_LOOP] = GEN6_3DPRIM_LINELOOP,
- [PIPE_PRIM_LINE_STRIP] = GEN6_3DPRIM_LINESTRIP,
- [PIPE_PRIM_TRIANGLES] = GEN6_3DPRIM_TRILIST,
- [PIPE_PRIM_TRIANGLE_STRIP] = GEN6_3DPRIM_TRISTRIP,
- [PIPE_PRIM_TRIANGLE_FAN] = GEN6_3DPRIM_TRIFAN,
- [PIPE_PRIM_QUADS] = GEN6_3DPRIM_QUADLIST,
- [PIPE_PRIM_QUAD_STRIP] = GEN6_3DPRIM_QUADSTRIP,
- [PIPE_PRIM_POLYGON] = GEN6_3DPRIM_POLYGON,
- [PIPE_PRIM_LINES_ADJACENCY] = GEN6_3DPRIM_LINELIST_ADJ,
- [PIPE_PRIM_LINE_STRIP_ADJACENCY] = GEN6_3DPRIM_LINESTRIP_ADJ,
- [PIPE_PRIM_TRIANGLES_ADJACENCY] = GEN6_3DPRIM_TRILIST_ADJ,
- [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = GEN6_3DPRIM_TRISTRIP_ADJ,
- [ILO_PRIM_RECTANGLES] = GEN6_3DPRIM_RECTLIST,
- };
-
- assert(prim_mapping[prim]);
-
- return prim_mapping[prim];
-}
-
static inline void
-gen8_3DSTATE_VF_TOPOLOGY(struct ilo_builder *builder, unsigned pipe_prim)
+gen8_3DSTATE_VF_TOPOLOGY(struct ilo_builder *builder,
+ enum gen_3dprim_type topology)
{
const uint8_t cmd_len = 2;
- const int prim = gen6_3d_translate_pipe_prim(pipe_prim);
uint32_t *dw;
ILO_DEV_ASSERT(builder->dev, 8, 8);
@@ -395,12 +229,13 @@ gen8_3DSTATE_VF_TOPOLOGY(struct ilo_builder *builder, unsigned pipe_prim)
ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN8_RENDER_CMD(3D, 3DSTATE_VF_TOPOLOGY) | (cmd_len - 2);
- dw[1] = prim;
+ dw[1] = topology << GEN8_TOPOLOGY_DW1_TYPE__SHIFT;
}
static inline void
gen8_3DSTATE_VF_INSTANCING(struct ilo_builder *builder,
- int vb_index, uint32_t step_rate)
+ const struct ilo_state_vf *vf,
+ uint32_t attr)
{
const uint8_t cmd_len = 3;
uint32_t *dw;
@@ -410,16 +245,20 @@ gen8_3DSTATE_VF_INSTANCING(struct ilo_builder *builder,
ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN8_RENDER_CMD(3D, 3DSTATE_VF_INSTANCING) | (cmd_len - 2);
- dw[1] = vb_index;
- if (step_rate)
- dw[1] |= GEN8_INSTANCING_DW1_ENABLE;
- dw[2] = step_rate;
+ dw[1] = attr << GEN8_INSTANCING_DW1_VE_INDEX__SHIFT;
+ dw[2] = 0;
+ /* see vf_set_gen8_3DSTATE_VF_INSTANCING() */
+ if (attr >= vf->internal_ve_count) {
+ attr -= vf->internal_ve_count;
+
+ dw[1] |= vf->user_instancing[attr][0];
+ dw[2] |= vf->user_instancing[attr][1];
+ }
}
static inline void
gen8_3DSTATE_VF_SGVS(struct ilo_builder *builder,
- bool vid_enable, int vid_ve, int vid_comp,
- bool iid_enable, int iid_ve, int iid_comp)
+ const struct ilo_state_vf *vf)
{
const uint8_t cmd_len = 2;
uint32_t *dw;
@@ -429,29 +268,19 @@ gen8_3DSTATE_VF_SGVS(struct ilo_builder *builder,
ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN8_RENDER_CMD(3D, 3DSTATE_VF_SGVS) | (cmd_len - 2);
- dw[1] = 0;
-
- if (iid_enable) {
- dw[1] |= GEN8_SGVS_DW1_IID_ENABLE |
- vid_comp << GEN8_SGVS_DW1_IID_VE_COMP__SHIFT |
- vid_ve << GEN8_SGVS_DW1_IID_VE_INDEX__SHIFT;
- }
-
- if (vid_enable) {
- dw[1] |= GEN8_SGVS_DW1_VID_ENABLE |
- vid_comp << GEN8_SGVS_DW1_VID_VE_COMP__SHIFT |
- vid_ve << GEN8_SGVS_DW1_VID_VE_INDEX__SHIFT;
- }
+ /* see vf_params_set_gen8_3DSTATE_VF_SGVS() */
+ dw[1] = vf->sgvs[0];
}
static inline void
gen6_3DSTATE_VERTEX_BUFFERS(struct ilo_builder *builder,
- const struct ilo_ve_state *ve,
- const struct ilo_vb_state *vb)
+ const struct ilo_state_vf *vf,
+ const struct ilo_state_vertex_buffer *vb,
+ unsigned vb_count)
{
uint8_t cmd_len;
uint32_t *dw;
- unsigned pos, hw_idx;
+ unsigned pos, i;
ILO_DEV_ASSERT(builder->dev, 6, 8);
@@ -460,67 +289,52 @@ gen6_3DSTATE_VERTEX_BUFFERS(struct ilo_builder *builder,
*
* "From 1 to 33 VBs can be specified..."
*/
- assert(ve->vb_count <= 33);
+ assert(vb_count <= 33);
- if (!ve->vb_count)
+ if (!vb_count)
return;
- cmd_len = 1 + 4 * ve->vb_count;
+ cmd_len = 1 + 4 * vb_count;
pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_VERTEX_BUFFERS) | (cmd_len - 2);
dw++;
pos++;
- for (hw_idx = 0; hw_idx < ve->vb_count; hw_idx++) {
- const unsigned instance_divisor = ve->instance_divisors[hw_idx];
- const unsigned pipe_idx = ve->vb_mapping[hw_idx];
- const struct pipe_vertex_buffer *cso = &vb->states[pipe_idx];
+ for (i = 0; i < vb_count; i++) {
+ const struct ilo_state_vertex_buffer *b = &vb[i];
- dw[0] = hw_idx << GEN6_VB_DW0_INDEX__SHIFT;
+ /* see vertex_buffer_set_gen8_vertex_buffer_state() */
+ dw[0] = b->vb[0] |
+ i << GEN6_VB_DW0_INDEX__SHIFT;
if (ilo_dev_gen(builder->dev) >= ILO_GEN(8))
dw[0] |= builder->mocs << GEN8_VB_DW0_MOCS__SHIFT;
else
dw[0] |= builder->mocs << GEN6_VB_DW0_MOCS__SHIFT;
- if (ilo_dev_gen(builder->dev) >= ILO_GEN(7))
- dw[0] |= GEN7_VB_DW0_ADDR_MODIFIED;
-
- if (instance_divisor)
- dw[0] |= GEN6_VB_DW0_ACCESS_INSTANCEDATA;
- else
- dw[0] |= GEN6_VB_DW0_ACCESS_VERTEXDATA;
-
- /* use null vb if there is no buffer or the stride is out of range */
- if (!cso->buffer || cso->stride > 2048) {
- dw[0] |= GEN6_VB_DW0_IS_NULL;
- dw[1] = 0;
- dw[2] = 0;
- dw[3] = (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) ?
- 0 : instance_divisor;
-
- continue;
- }
-
- dw[0] |= cso->stride << GEN6_VB_DW0_PITCH__SHIFT;
+ dw[1] = 0;
+ dw[2] = 0;
+ dw[3] = 0;
if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) {
- const struct ilo_buffer *buf = ilo_buffer(cso->buffer);
- const uint32_t start_offset = cso->buffer_offset;
+ if (b->need_bo)
+ ilo_builder_batch_reloc64(builder, pos + 1, b->bo, b->vb[1], 0);
- ilo_builder_batch_reloc64(builder, pos + 1,
- buf->bo, start_offset, 0);
- dw[3] = buf->bo_size;
+ dw[3] |= b->vb[2];
} else {
- const struct ilo_buffer *buf = ilo_buffer(cso->buffer);
- const uint32_t start_offset = cso->buffer_offset;
- const uint32_t end_offset = buf->bo_size - 1;
+ const int8_t elem = vf->vb_to_first_elem[i];
- dw[3] = instance_divisor;
+ /* see vf_set_gen6_vertex_buffer_state() */
+ if (elem >= 0) {
+ dw[0] |= vf->user_instancing[elem][0];
+ dw[3] |= vf->user_instancing[elem][1];
+ }
- ilo_builder_batch_reloc(builder, pos + 1, buf->bo, start_offset, 0);
- ilo_builder_batch_reloc(builder, pos + 2, buf->bo, end_offset, 0);
+ if (b->need_bo) {
+ ilo_builder_batch_reloc(builder, pos + 1, b->bo, b->vb[1], 0);
+ ilo_builder_batch_reloc(builder, pos + 2, b->bo, b->vb[2], 0);
+ }
}
dw += 4;
@@ -563,248 +377,189 @@ gen6_user_3DSTATE_VERTEX_BUFFERS(struct ilo_builder *builder,
static inline void
gen6_3DSTATE_VERTEX_ELEMENTS(struct ilo_builder *builder,
- const struct ilo_ve_state *ve)
+ const struct ilo_state_vf *vf)
{
uint8_t cmd_len;
uint32_t *dw;
- unsigned i;
ILO_DEV_ASSERT(builder->dev, 6, 8);
- /*
- * From the Sandy Bridge PRM, volume 2 part 1, page 92:
- *
- * "At least one VERTEX_ELEMENT_STATE structure must be included."
- *
- * From the Sandy Bridge PRM, volume 2 part 1, page 93:
- *
- * "Up to 34 (DevSNB+) vertex elements are supported."
- */
- assert(ve->count + ve->prepend_nosrc_cso >= 1);
- assert(ve->count + ve->prepend_nosrc_cso <= 34);
-
- STATIC_ASSERT(Elements(ve->cso[0].payload) == 2);
+ cmd_len = 1 + 2 * (vf->internal_ve_count + vf->user_ve_count);
- cmd_len = 1 + 2 * (ve->count + ve->prepend_nosrc_cso);
ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_VERTEX_ELEMENTS) | (cmd_len - 2);
dw++;
- if (ve->prepend_nosrc_cso) {
- memcpy(dw, ve->nosrc_cso.payload, sizeof(ve->nosrc_cso.payload));
- dw += 2;
- }
-
- for (i = 0; i < ve->count - ve->last_cso_edgeflag; i++) {
- memcpy(dw, ve->cso[i].payload, sizeof(ve->cso[i].payload));
- dw += 2;
+ /*
+ * see vf_params_set_gen6_internal_ve() and
+ * vf_set_gen6_3DSTATE_VERTEX_ELEMENTS()
+ */
+ if (vf->internal_ve_count) {
+ memcpy(dw, vf->internal_ve,
+ sizeof(vf->internal_ve[0]) * vf->internal_ve_count);
+ dw += 2 * vf->internal_ve_count;
}
- if (ve->last_cso_edgeflag)
- memcpy(dw, ve->edgeflag_cso.payload, sizeof(ve->edgeflag_cso.payload));
+ memcpy(dw, vf->user_ve, sizeof(vf->user_ve[0]) * vf->user_ve_count);
}
static inline void
gen6_3DSTATE_INDEX_BUFFER(struct ilo_builder *builder,
- const struct ilo_ib_state *ib,
- bool enable_cut_index)
+ const struct ilo_state_vf *vf,
+ const struct ilo_state_index_buffer *ib)
{
const uint8_t cmd_len = 3;
- struct ilo_buffer *buf = ilo_buffer(ib->hw_resource);
- uint32_t start_offset, end_offset;
- int format;
- uint32_t *dw;
+ uint32_t dw0, *dw;
unsigned pos;
ILO_DEV_ASSERT(builder->dev, 6, 7.5);
- if (!buf)
- return;
-
- /* this is moved to the new 3DSTATE_VF */
- if (ilo_dev_gen(builder->dev) >= ILO_GEN(7.5))
- assert(!enable_cut_index);
-
- switch (ib->hw_index_size) {
- case 4:
- format = GEN6_IB_DW0_FORMAT_DWORD;
- break;
- case 2:
- format = GEN6_IB_DW0_FORMAT_WORD;
- break;
- case 1:
- format = GEN6_IB_DW0_FORMAT_BYTE;
- break;
- default:
- assert(!"unknown index size");
- format = GEN6_IB_DW0_FORMAT_BYTE;
- break;
- }
+ dw0 = GEN6_RENDER_CMD(3D, 3DSTATE_INDEX_BUFFER) | (cmd_len - 2) |
+ builder->mocs << GEN6_IB_DW0_MOCS__SHIFT;
/*
- * set start_offset to 0 here and adjust pipe_draw_info::start with
- * ib->draw_start_offset in 3DPRIMITIVE
+ * see index_buffer_set_gen8_3DSTATE_INDEX_BUFFER() and
+ * vf_params_set_gen6_3dstate_index_buffer()
*/
- start_offset = 0;
- end_offset = buf->bo_size;
-
- /* end_offset must also be aligned and is inclusive */
- end_offset -= (end_offset % ib->hw_index_size);
- end_offset--;
+ dw0 |= ib->ib[0];
+ if (ilo_dev_gen(builder->dev) <= ILO_GEN(7))
+ dw0 |= vf->cut[0];
pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
- dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_INDEX_BUFFER) | (cmd_len - 2) |
- builder->mocs << GEN6_IB_DW0_MOCS__SHIFT |
- format;
- if (enable_cut_index)
- dw[0] |= GEN6_IB_DW0_CUT_INDEX_ENABLE;
-
- ilo_builder_batch_reloc(builder, pos + 1, buf->bo, start_offset, 0);
- ilo_builder_batch_reloc(builder, pos + 2, buf->bo, end_offset, 0);
+ dw[0] = dw0;
+ if (ib->need_bo) {
+ ilo_builder_batch_reloc(builder, pos + 1, ib->bo, ib->ib[1], 0);
+ ilo_builder_batch_reloc(builder, pos + 2, ib->bo, ib->ib[2], 0);
+ } else {
+ dw[1] = 0;
+ dw[2] = 0;
+ }
}
static inline void
gen8_3DSTATE_INDEX_BUFFER(struct ilo_builder *builder,
- const struct ilo_ib_state *ib)
+ const struct ilo_state_vf *vf,
+ const struct ilo_state_index_buffer *ib)
{
const uint8_t cmd_len = 5;
- struct ilo_buffer *buf = ilo_buffer(ib->hw_resource);
- int format;
uint32_t *dw;
unsigned pos;
ILO_DEV_ASSERT(builder->dev, 8, 8);
- if (!buf)
- return;
-
- switch (ib->hw_index_size) {
- case 4:
- format = GEN8_IB_DW1_FORMAT_DWORD;
- break;
- case 2:
- format = GEN8_IB_DW1_FORMAT_WORD;
- break;
- case 1:
- format = GEN8_IB_DW1_FORMAT_BYTE;
- break;
- default:
- assert(!"unknown index size");
- format = GEN8_IB_DW1_FORMAT_BYTE;
- break;
- }
-
pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_INDEX_BUFFER) | (cmd_len - 2);
- dw[1] = format |
+ /* see index_buffer_set_gen8_3DSTATE_INDEX_BUFFER() */
+ dw[1] = ib->ib[0] |
builder->mocs << GEN8_IB_DW1_MOCS__SHIFT;
- dw[4] = buf->bo_size;
- /* ignore ib->offset here in favor of adjusting 3DPRIMITIVE */
- ilo_builder_batch_reloc64(builder, pos + 2, buf->bo, 0, 0);
+ if (ib->need_bo) {
+ ilo_builder_batch_reloc64(builder, pos + 2, ib->bo, ib->ib[1], 0);
+ } else {
+ dw[2] = 0;
+ dw[3] = 0;
+ }
+
+ dw[4] = ib->ib[2];
}
static inline void
gen6_3DSTATE_VS(struct ilo_builder *builder,
- const struct ilo_shader_state *vs)
+ const struct ilo_state_vs *vs,
+ uint32_t kernel_offset)
{
const uint8_t cmd_len = 6;
- const struct ilo_shader_cso *cso;
- uint32_t dw2, dw4, dw5, *dw;
+ uint32_t *dw;
ILO_DEV_ASSERT(builder->dev, 6, 7.5);
- cso = ilo_shader_get_kernel_cso(vs);
- dw2 = cso->payload[0];
- dw4 = cso->payload[1];
- dw5 = cso->payload[2];
-
ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_VS) | (cmd_len - 2);
- dw[1] = ilo_shader_get_kernel_offset(vs);
- dw[2] = dw2;
- dw[3] = 0; /* scratch */
- dw[4] = dw4;
- dw[5] = dw5;
+ dw[1] = kernel_offset;
+ /* see vs_set_gen6_3DSTATE_VS() */
+ dw[2] = vs->vs[0];
+ dw[3] = vs->vs[1];
+ dw[4] = vs->vs[2];
+ dw[5] = vs->vs[3];
}
static inline void
gen8_3DSTATE_VS(struct ilo_builder *builder,
- const struct ilo_shader_state *vs,
- uint32_t clip_plane_enable)
+ const struct ilo_state_vs *vs,
+ uint32_t kernel_offset)
{
const uint8_t cmd_len = 9;
- const struct ilo_shader_cso *cso;
- uint32_t dw3, dw6, dw7, dw8, *dw;
+ uint32_t *dw;
ILO_DEV_ASSERT(builder->dev, 8, 8);
- cso = ilo_shader_get_kernel_cso(vs);
- dw3 = cso->payload[0];
- dw6 = cso->payload[1];
- dw7 = cso->payload[2];
- dw8 = clip_plane_enable << GEN8_VS_DW8_UCP_CLIP_ENABLES__SHIFT;
-
ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_VS) | (cmd_len - 2);
- dw[1] = ilo_shader_get_kernel_offset(vs);
+ dw[1] = kernel_offset;
dw[2] = 0;
- dw[3] = dw3;
- dw[4] = 0; /* scratch */
+ /* see vs_set_gen6_3DSTATE_VS() */
+ dw[3] = vs->vs[0];
+ dw[4] = vs->vs[1];
dw[5] = 0;
- dw[6] = dw6;
- dw[7] = dw7;
- dw[8] = dw8;
+ dw[6] = vs->vs[2];
+ dw[7] = vs->vs[3];
+ dw[8] = vs->vs[4];
}
static inline void
-gen6_disable_3DSTATE_VS(struct ilo_builder *builder)
+gen7_3DSTATE_HS(struct ilo_builder *builder,
+ const struct ilo_state_hs *hs,
+ uint32_t kernel_offset)
{
- const uint8_t cmd_len = 6;
+ const uint8_t cmd_len = 7;
uint32_t *dw;
- ILO_DEV_ASSERT(builder->dev, 6, 7.5);
+ ILO_DEV_ASSERT(builder->dev, 7, 7.5);
ilo_builder_batch_pointer(builder, cmd_len, &dw);
- dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_VS) | (cmd_len - 2);
- dw[1] = 0;
- dw[2] = 0;
- dw[3] = 0;
- dw[4] = 0;
- dw[5] = 0;
+ dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_HS) | (cmd_len - 2);
+ /* see hs_set_gen7_3DSTATE_HS() */
+ dw[1] = hs->hs[0];
+ dw[2] = hs->hs[1];
+ dw[3] = kernel_offset;
+ dw[4] = hs->hs[2];
+ dw[5] = hs->hs[3];
+ dw[6] = 0;
}
static inline void
-gen7_disable_3DSTATE_HS(struct ilo_builder *builder)
+gen8_3DSTATE_HS(struct ilo_builder *builder,
+ const struct ilo_state_hs *hs,
+ uint32_t kernel_offset)
{
- const uint8_t cmd_len = (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) ? 9 : 7;
+ const uint8_t cmd_len = 9;
uint32_t *dw;
- ILO_DEV_ASSERT(builder->dev, 7, 8);
+ ILO_DEV_ASSERT(builder->dev, 8, 8);
ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_HS) | (cmd_len - 2);
- dw[1] = 0;
- dw[2] = 0;
- dw[3] = 0;
+ /* see hs_set_gen7_3DSTATE_HS() */
+ dw[1] = hs->hs[0];
+ dw[2] = hs->hs[1];
+ dw[3] = kernel_offset;
dw[4] = 0;
- dw[5] = 0;
+ dw[5] = hs->hs[2];
dw[6] = 0;
- if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) {
- dw[7] = 0;
- dw[8] = 0;
- }
+ dw[7] = hs->hs[3];
+ dw[8] = 0;
}
static inline void
-gen7_3DSTATE_TE(struct ilo_builder *builder)
+gen7_3DSTATE_TE(struct ilo_builder *builder,
+ const struct ilo_state_ds *ds)
{
const uint8_t cmd_len = 4;
uint32_t *dw;
@@ -814,108 +569,61 @@ gen7_3DSTATE_TE(struct ilo_builder *builder)
ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_TE) | (cmd_len - 2);
- dw[1] = 0;
- dw[2] = 0;
- dw[3] = 0;
+ /* see ds_set_gen7_3DSTATE_TE() */
+ dw[1] = ds->te[0];
+ dw[2] = ds->te[1];
+ dw[3] = ds->te[2];
}
static inline void
-gen7_disable_3DSTATE_DS(struct ilo_builder *builder)
+gen7_3DSTATE_DS(struct ilo_builder *builder,
+ const struct ilo_state_ds *ds,
+ uint32_t kernel_offset)
{
- const uint8_t cmd_len = (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) ? 9 : 6;
+ const uint8_t cmd_len = 6;
uint32_t *dw;
- ILO_DEV_ASSERT(builder->dev, 7, 8);
+ ILO_DEV_ASSERT(builder->dev, 7, 7.5);
ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_DS) | (cmd_len - 2);
- dw[1] = 0;
- dw[2] = 0;
- dw[3] = 0;
- dw[4] = 0;
- dw[5] = 0;
- if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) {
- dw[6] = 0;
- dw[7] = 0;
- dw[8] = 0;
- }
-}
-
-static inline void
-gen6_3DSTATE_GS(struct ilo_builder *builder,
- const struct ilo_shader_state *gs)
-{
- const uint8_t cmd_len = 7;
- const struct ilo_shader_cso *cso;
- uint32_t dw2, dw4, dw5, dw6, *dw;
-
- ILO_DEV_ASSERT(builder->dev, 6, 6);
-
- cso = ilo_shader_get_kernel_cso(gs);
- dw2 = cso->payload[0];
- dw4 = cso->payload[1];
- dw5 = cso->payload[2];
- dw6 = cso->payload[3];
-
- ilo_builder_batch_pointer(builder, cmd_len, &dw);
-
- dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_GS) | (cmd_len - 2);
- dw[1] = ilo_shader_get_kernel_offset(gs);
- dw[2] = dw2;
- dw[3] = 0; /* scratch */
- dw[4] = dw4;
- dw[5] = dw5;
- dw[6] = dw6;
+ /* see ds_set_gen7_3DSTATE_DS() */
+ dw[1] = kernel_offset;
+ dw[2] = ds->ds[0];
+ dw[3] = ds->ds[1];
+ dw[4] = ds->ds[2];
+ dw[5] = ds->ds[3];
}
static inline void
-gen6_so_3DSTATE_GS(struct ilo_builder *builder,
- const struct ilo_shader_state *vs,
- int verts_per_prim)
+gen8_3DSTATE_DS(struct ilo_builder *builder,
+ const struct ilo_state_ds *ds,
+ uint32_t kernel_offset)
{
- const uint8_t cmd_len = 7;
- struct ilo_shader_cso cso;
- enum ilo_kernel_param param;
- uint32_t dw2, dw4, dw5, dw6, *dw;
-
- ILO_DEV_ASSERT(builder->dev, 6, 6);
-
- assert(ilo_shader_get_kernel_param(vs, ILO_KERNEL_VS_GEN6_SO));
-
- switch (verts_per_prim) {
- case 1:
- param = ILO_KERNEL_VS_GEN6_SO_POINT_OFFSET;
- break;
- case 2:
- param = ILO_KERNEL_VS_GEN6_SO_LINE_OFFSET;
- break;
- default:
- param = ILO_KERNEL_VS_GEN6_SO_TRI_OFFSET;
- break;
- }
+ const uint8_t cmd_len = 9;
+ uint32_t *dw;
- /* cannot use VS's CSO */
- ilo_gpe_init_gs_cso(builder->dev, vs, &cso);
- dw2 = cso.payload[0];
- dw4 = cso.payload[1];
- dw5 = cso.payload[2];
- dw6 = cso.payload[3];
+ ILO_DEV_ASSERT(builder->dev, 8, 8);
ilo_builder_batch_pointer(builder, cmd_len, &dw);
- dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_GS) | (cmd_len - 2);
- dw[1] = ilo_shader_get_kernel_offset(vs) +
- ilo_shader_get_kernel_param(vs, param);
- dw[2] = dw2;
- dw[3] = 0;
- dw[4] = dw4;
- dw[5] = dw5;
- dw[6] = dw6;
+ dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_DS) | (cmd_len - 2);
+ /* see ds_set_gen7_3DSTATE_DS() */
+ dw[1] = kernel_offset;
+ dw[2] = 0;
+ dw[3] = ds->ds[0];
+ dw[4] = ds->ds[1];
+ dw[5] = 0;
+ dw[6] = ds->ds[2];
+ dw[7] = ds->ds[3];
+ dw[8] = ds->ds[4];
}
static inline void
-gen6_disable_3DSTATE_GS(struct ilo_builder *builder)
+gen6_3DSTATE_GS(struct ilo_builder *builder,
+ const struct ilo_state_gs *gs,
+ uint32_t kernel_offset)
{
const uint8_t cmd_len = 7;
uint32_t *dw;
@@ -925,13 +633,13 @@ gen6_disable_3DSTATE_GS(struct ilo_builder *builder)
ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_GS) | (cmd_len - 2);
- dw[1] = 0;
- dw[2] = 0;
- dw[3] = 0;
- /* honor the valid range of URB read length */
- dw[4] = 1 << GEN6_GS_DW4_URB_READ_LEN__SHIFT;
- dw[5] = GEN6_GS_DW5_STATISTICS;
- dw[6] = 0;
+ dw[1] = kernel_offset;
+ /* see gs_set_gen6_3DSTATE_GS() */
+ dw[2] = gs->gs[0];
+ dw[3] = gs->gs[1];
+ dw[4] = gs->gs[2];
+ dw[5] = gs->gs[3];
+ dw[6] = gs->gs[4];
}
static inline void
@@ -960,183 +668,90 @@ gen6_3DSTATE_GS_SVB_INDEX(struct ilo_builder *builder,
static inline void
gen7_3DSTATE_GS(struct ilo_builder *builder,
- const struct ilo_shader_state *gs)
+ const struct ilo_state_gs *gs,
+ uint32_t kernel_offset)
{
const uint8_t cmd_len = 7;
- const struct ilo_shader_cso *cso;
- uint32_t dw2, dw4, dw5, *dw;
+ uint32_t *dw;
ILO_DEV_ASSERT(builder->dev, 7, 7.5);
- cso = ilo_shader_get_kernel_cso(gs);
- dw2 = cso->payload[0];
- dw4 = cso->payload[1];
- dw5 = cso->payload[2];
-
ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_GS) | (cmd_len - 2);
- dw[1] = ilo_shader_get_kernel_offset(gs);
- dw[2] = dw2;
- dw[3] = 0; /* scratch */
- dw[4] = dw4;
- dw[5] = dw5;
+ dw[1] = kernel_offset;
+ /* see gs_set_gen7_3DSTATE_GS() */
+ dw[2] = gs->gs[0];
+ dw[3] = gs->gs[1];
+ dw[4] = gs->gs[2];
+ dw[5] = gs->gs[3];
dw[6] = 0;
}
static inline void
-gen7_disable_3DSTATE_GS(struct ilo_builder *builder)
+gen8_3DSTATE_GS(struct ilo_builder *builder,
+ const struct ilo_state_gs *gs,
+ uint32_t kernel_offset)
{
- const uint8_t cmd_len = (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) ? 10 : 7;
+ const uint8_t cmd_len = 10;
uint32_t *dw;
- ILO_DEV_ASSERT(builder->dev, 7, 8);
+ ILO_DEV_ASSERT(builder->dev, 8, 8);
ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_GS) | (cmd_len - 2);
- dw[1] = 0;
+ dw[1] = kernel_offset;
dw[2] = 0;
- dw[3] = 0;
- dw[4] = 0;
-
- if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) {
- dw[7] = GEN8_GS_DW7_STATISTICS;
- dw[8] = 0;
- dw[9] = 0;
- } else {
- dw[5] = GEN7_GS_DW5_STATISTICS;
- dw[6] = 0;
- }
+ /* see gs_set_gen7_3DSTATE_GS() */
+ dw[3] = gs->gs[0];
+ dw[4] = gs->gs[1];
+ dw[5] = 0;
+ dw[6] = gs->gs[2];
+ dw[7] = gs->gs[3];
+ dw[8] = 0;
+ dw[9] = gs->gs[4];
}
static inline void
gen7_3DSTATE_STREAMOUT(struct ilo_builder *builder,
- int render_stream,
- bool render_disable,
- int vertex_attrib_count,
- const int *buf_strides)
+ const struct ilo_state_sol *sol)
{
const uint8_t cmd_len = (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) ? 5 : 3;
uint32_t *dw;
- int buf_mask;
ILO_DEV_ASSERT(builder->dev, 7, 8);
ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_STREAMOUT) | (cmd_len - 2);
-
- dw[1] = render_stream << GEN7_SO_DW1_RENDER_STREAM_SELECT__SHIFT;
- if (render_disable)
- dw[1] |= GEN7_SO_DW1_RENDER_DISABLE;
-
- if (buf_strides) {
- buf_mask = ((bool) buf_strides[3]) << 3 |
- ((bool) buf_strides[2]) << 2 |
- ((bool) buf_strides[1]) << 1 |
- ((bool) buf_strides[0]);
- if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) {
- dw[3] = buf_strides[1] << 16 | buf_strides[0];
- dw[4] = buf_strides[3] << 16 | buf_strides[1];
- }
- } else {
- buf_mask = 0;
- }
-
- if (buf_mask) {
- int read_len;
-
- dw[1] |= GEN7_SO_DW1_SO_ENABLE |
- GEN7_SO_DW1_STATISTICS;
- /* API_OPENGL */
- if (true)
- dw[1] |= GEN7_SO_DW1_REORDER_TRAILING;
- if (ilo_dev_gen(builder->dev) < ILO_GEN(8))
- dw[1] |= buf_mask << GEN7_SO_DW1_BUFFER_ENABLES__SHIFT;
-
- read_len = (vertex_attrib_count + 1) / 2;
- if (!read_len)
- read_len = 1;
-
- dw[2] = 0 << GEN7_SO_DW2_STREAM3_READ_OFFSET__SHIFT |
- (read_len - 1) << GEN7_SO_DW2_STREAM3_READ_LEN__SHIFT |
- 0 << GEN7_SO_DW2_STREAM2_READ_OFFSET__SHIFT |
- (read_len - 1) << GEN7_SO_DW2_STREAM2_READ_LEN__SHIFT |
- 0 << GEN7_SO_DW2_STREAM1_READ_OFFSET__SHIFT |
- (read_len - 1) << GEN7_SO_DW2_STREAM1_READ_LEN__SHIFT |
- 0 << GEN7_SO_DW2_STREAM0_READ_OFFSET__SHIFT |
- (read_len - 1) << GEN7_SO_DW2_STREAM0_READ_LEN__SHIFT;
- } else {
- dw[2] = 0;
+ /* see sol_set_gen7_3DSTATE_STREAMOUT() */
+ dw[1] = sol->streamout[0];
+ dw[2] = sol->streamout[1];
+ if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) {
+ dw[3] = sol->strides[1] << GEN8_SO_DW3_BUFFER1_PITCH__SHIFT |
+ sol->strides[0] << GEN8_SO_DW3_BUFFER0_PITCH__SHIFT;
+ dw[4] = sol->strides[3] << GEN8_SO_DW4_BUFFER3_PITCH__SHIFT |
+ sol->strides[2] << GEN8_SO_DW4_BUFFER2_PITCH__SHIFT;
}
}
static inline void
gen7_3DSTATE_SO_DECL_LIST(struct ilo_builder *builder,
- const struct pipe_stream_output_info *so_info)
+ const struct ilo_state_sol *sol)
{
/*
* Note that "DWord Length" has 9 bits for this command and the type of
* cmd_len cannot be uint8_t.
*/
uint16_t cmd_len;
- struct {
- int buf_selects;
- int decl_count;
- uint16_t decls[128];
- } streams[4];
- unsigned buf_offsets[PIPE_MAX_SO_BUFFERS];
- int hw_decl_count, i;
+ int cmd_decl_count;
uint32_t *dw;
ILO_DEV_ASSERT(builder->dev, 7, 8);
- memset(streams, 0, sizeof(streams));
- memset(buf_offsets, 0, sizeof(buf_offsets));
-
- for (i = 0; i < so_info->num_outputs; i++) {
- unsigned decl, st, buf, reg, mask;
-
- st = so_info->output[i].stream;
- buf = so_info->output[i].output_buffer;
-
- /* pad with holes */
- while (buf_offsets[buf] < so_info->output[i].dst_offset) {
- int num_dwords;
-
- num_dwords = so_info->output[i].dst_offset - buf_offsets[buf];
- if (num_dwords > 4)
- num_dwords = 4;
-
- decl = buf << GEN7_SO_DECL_OUTPUT_SLOT__SHIFT |
- GEN7_SO_DECL_HOLE_FLAG |
- ((1 << num_dwords) - 1) << GEN7_SO_DECL_COMPONENT_MASK__SHIFT;
-
- assert(streams[st].decl_count < Elements(streams[st].decls));
- streams[st].decls[streams[st].decl_count++] = decl;
- buf_offsets[buf] += num_dwords;
- }
- assert(buf_offsets[buf] == so_info->output[i].dst_offset);
-
- reg = so_info->output[i].register_index;
- mask = ((1 << so_info->output[i].num_components) - 1) <<
- so_info->output[i].start_component;
-
- decl = buf << GEN7_SO_DECL_OUTPUT_SLOT__SHIFT |
- reg << GEN7_SO_DECL_REG_INDEX__SHIFT |
- mask << GEN7_SO_DECL_COMPONENT_MASK__SHIFT;
-
- assert(streams[st].decl_count < Elements(streams[st].decls));
-
- streams[st].buf_selects |= 1 << buf;
- streams[st].decls[streams[st].decl_count++] = decl;
- buf_offsets[buf] += so_info->output[i].num_components;
- }
-
if (ilo_dev_gen(builder->dev) >= ILO_GEN(7.5)) {
- hw_decl_count = MAX4(streams[0].decl_count, streams[1].decl_count,
- streams[2].decl_count, streams[3].decl_count);
+ cmd_decl_count = sol->decl_count;
} else {
/*
* From the Ivy Bridge PRM, volume 2 part 1, page 201:
@@ -1145,100 +760,97 @@ gen7_3DSTATE_SO_DECL_LIST(struct ilo_builder *builder,
* whenever this command is issued. The "Num Entries [n]" fields
* still contain the actual numbers of valid decls."
*/
- hw_decl_count = 128;
+ cmd_decl_count = 128;
}
- cmd_len = 3 + 2 * hw_decl_count;
+ cmd_len = 3 + 2 * cmd_decl_count;
ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_SO_DECL_LIST) | (cmd_len - 2);
- dw[1] = streams[3].buf_selects << GEN7_SO_DECL_DW1_STREAM3_BUFFER_SELECTS__SHIFT |
- streams[2].buf_selects << GEN7_SO_DECL_DW1_STREAM2_BUFFER_SELECTS__SHIFT |
- streams[1].buf_selects << GEN7_SO_DECL_DW1_STREAM1_BUFFER_SELECTS__SHIFT |
- streams[0].buf_selects << GEN7_SO_DECL_DW1_STREAM0_BUFFER_SELECTS__SHIFT;
- dw[2] = streams[3].decl_count << GEN7_SO_DECL_DW2_STREAM3_ENTRY_COUNT__SHIFT |
- streams[2].decl_count << GEN7_SO_DECL_DW2_STREAM2_ENTRY_COUNT__SHIFT |
- streams[1].decl_count << GEN7_SO_DECL_DW2_STREAM1_ENTRY_COUNT__SHIFT |
- streams[0].decl_count << GEN7_SO_DECL_DW2_STREAM0_ENTRY_COUNT__SHIFT;
- dw += 3;
-
- for (i = 0; i < hw_decl_count; i++) {
- dw[0] = streams[1].decls[i] << 16 | streams[0].decls[i];
- dw[1] = streams[3].decls[i] << 16 | streams[2].decls[i];
- dw += 2;
+ /* see sol_set_gen7_3DSTATE_SO_DECL_LIST() */
+ dw[1] = sol->so_decl[0];
+ dw[2] = sol->so_decl[1];
+ memcpy(&dw[3], sol->decl, sizeof(sol->decl[0]) * sol->decl_count);
+
+ if (sol->decl_count < cmd_decl_count) {
+      memset(&dw[3 + 2 * sol->decl_count], 0, sizeof(sol->decl[0]) *
+            (cmd_decl_count - sol->decl_count));
}
}
static inline void
-gen7_3DSTATE_SO_BUFFER(struct ilo_builder *builder, int index, int stride,
- const struct pipe_stream_output_target *so_target)
+gen7_3DSTATE_SO_BUFFER(struct ilo_builder *builder,
+ const struct ilo_state_sol *sol,
+ const struct ilo_state_sol_buffer *sb,
+ uint8_t buffer)
{
- const uint8_t cmd_len = (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) ? 8 : 4;
- struct ilo_buffer *buf;
- int start, end;
+ const uint8_t cmd_len = 4;
uint32_t *dw;
unsigned pos;
- ILO_DEV_ASSERT(builder->dev, 7, 8);
-
- buf = ilo_buffer(so_target->buffer);
-
- /* DWord-aligned */
- assert(stride % 4 == 0);
- assert(so_target->buffer_offset % 4 == 0);
+ ILO_DEV_ASSERT(builder->dev, 7, 7.5);
- stride &= ~3;
- start = so_target->buffer_offset & ~3;
- end = (start + so_target->buffer_size) & ~3;
+ assert(buffer < ILO_STATE_SOL_MAX_BUFFER_COUNT);
pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_SO_BUFFER) | (cmd_len - 2);
- dw[1] = index << GEN7_SO_BUF_DW1_INDEX__SHIFT |
- stride;
-
- if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) {
- dw[1] |= builder->mocs << GEN8_SO_BUF_DW1_MOCS__SHIFT;
-
- dw[4] = end - start;
- dw[5] = 0;
- dw[6] = 0;
- dw[7] = 0;
-
- ilo_builder_batch_reloc64(builder, pos + 2,
- buf->bo, start, INTEL_RELOC_WRITE);
+ /* see sol_buffer_set_gen7_3dstate_so_buffer() */
+ dw[1] = buffer << GEN7_SO_BUF_DW1_INDEX__SHIFT |
+ builder->mocs << GEN7_SO_BUF_DW1_MOCS__SHIFT |
+ sol->strides[buffer] << GEN7_SO_BUF_DW1_PITCH__SHIFT;
+
+ if (sb->need_bo) {
+ ilo_builder_batch_reloc(builder, pos + 2, sb->bo,
+ sb->so_buf[0], INTEL_RELOC_WRITE);
+ ilo_builder_batch_reloc(builder, pos + 3, sb->bo,
+ sb->so_buf[1], INTEL_RELOC_WRITE);
} else {
- dw[1] |= builder->mocs << GEN7_SO_BUF_DW1_MOCS__SHIFT;
-
- ilo_builder_batch_reloc(builder, pos + 2,
- buf->bo, start, INTEL_RELOC_WRITE);
- ilo_builder_batch_reloc(builder, pos + 3,
- buf->bo, end, INTEL_RELOC_WRITE);
+ dw[2] = 0;
+ dw[3] = 0;
}
}
static inline void
-gen7_disable_3DSTATE_SO_BUFFER(struct ilo_builder *builder, int index)
+gen8_3DSTATE_SO_BUFFER(struct ilo_builder *builder,
+ const struct ilo_state_sol *sol,
+ const struct ilo_state_sol_buffer *sb,
+ uint8_t buffer)
{
- const uint8_t cmd_len = (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) ? 8 : 4;
+ const uint8_t cmd_len = 8;
uint32_t *dw;
+ unsigned pos;
- ILO_DEV_ASSERT(builder->dev, 7, 8);
+ ILO_DEV_ASSERT(builder->dev, 8, 8);
- ilo_builder_batch_pointer(builder, cmd_len, &dw);
+ pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_SO_BUFFER) | (cmd_len - 2);
- dw[1] = index << GEN7_SO_BUF_DW1_INDEX__SHIFT;
- dw[2] = 0;
- dw[3] = 0;
+ /* see sol_buffer_set_gen8_3dstate_so_buffer() */
+ dw[1] = sb->so_buf[0] |
+ buffer << GEN7_SO_BUF_DW1_INDEX__SHIFT |
+ builder->mocs << GEN8_SO_BUF_DW1_MOCS__SHIFT;
+
+ if (sb->need_bo) {
+ ilo_builder_batch_reloc64(builder, pos + 2, sb->bo,
+ sb->so_buf[1], INTEL_RELOC_WRITE);
+ } else {
+ dw[2] = 0;
+ dw[3] = 0;
+ }
- if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) {
- dw[4] = 0;
+ dw[4] = sb->so_buf[2];
+
+ if (sb->need_write_offset_bo) {
+ ilo_builder_batch_reloc64(builder, pos + 5, sb->write_offset_bo,
+ sizeof(uint32_t) * buffer, INTEL_RELOC_WRITE);
+ } else {
dw[5] = 0;
dw[6] = 0;
- dw[7] = 0;
}
+
+ dw[7] = sb->so_buf[3];
}
static inline void
@@ -1627,8 +1239,7 @@ gen6_BINDING_TABLE_STATE(struct ilo_builder *builder,
static inline uint32_t
gen6_SURFACE_STATE(struct ilo_builder *builder,
- const struct ilo_view_surface *surf,
- bool for_render)
+ const struct ilo_state_surface *surf)
{
int state_align, state_len;
uint32_t state_offset, *dw;
@@ -1641,7 +1252,7 @@ gen6_SURFACE_STATE(struct ilo_builder *builder,
state_offset = ilo_builder_surface_pointer(builder,
ILO_BUILDER_ITEM_SURFACE, state_align, state_len, &dw);
- memcpy(dw, surf->payload, state_len << 2);
+ memcpy(dw, surf->surface, state_len << 2);
if (surf->bo) {
const uint32_t mocs = (surf->scanout) ?
@@ -1650,7 +1261,7 @@ gen6_SURFACE_STATE(struct ilo_builder *builder,
dw[1] |= mocs << GEN8_SURFACE_DW1_MOCS__SHIFT;
ilo_builder_surface_reloc64(builder, state_offset, 8, surf->bo,
- surf->payload[8], (for_render) ? INTEL_RELOC_WRITE : 0);
+ surf->surface[8], (surf->readonly) ? 0 : INTEL_RELOC_WRITE);
}
} else {
state_align = 32;
@@ -1658,7 +1269,7 @@ gen6_SURFACE_STATE(struct ilo_builder *builder,
state_offset = ilo_builder_surface_pointer(builder,
ILO_BUILDER_ITEM_SURFACE, state_align, state_len, &dw);
- memcpy(dw, surf->payload, state_len << 2);
+ memcpy(dw, surf->surface, state_len << 2);
if (surf->bo) {
/*
@@ -1668,7 +1279,7 @@ gen6_SURFACE_STATE(struct ilo_builder *builder,
dw[5] |= builder->mocs << GEN6_SURFACE_DW5_MOCS__SHIFT;
ilo_builder_surface_reloc(builder, state_offset, 1, surf->bo,
- surf->payload[1], (for_render) ? INTEL_RELOC_WRITE : 0);
+ surf->surface[1], (surf->readonly) ? 0 : INTEL_RELOC_WRITE);
}
}
@@ -1676,55 +1287,13 @@ gen6_SURFACE_STATE(struct ilo_builder *builder,
}
static inline uint32_t
-gen6_so_SURFACE_STATE(struct ilo_builder *builder,
- const struct pipe_stream_output_target *so,
- const struct pipe_stream_output_info *so_info,
- int so_index)
-{
- struct ilo_buffer *buf = ilo_buffer(so->buffer);
- unsigned bo_offset, struct_size;
- enum pipe_format elem_format;
- struct ilo_view_surface surf;
-
- ILO_DEV_ASSERT(builder->dev, 6, 6);
-
- bo_offset = so->buffer_offset + so_info->output[so_index].dst_offset * 4;
- struct_size = so_info->stride[so_info->output[so_index].output_buffer] * 4;
-
- switch (so_info->output[so_index].num_components) {
- case 1:
- elem_format = PIPE_FORMAT_R32_FLOAT;
- break;
- case 2:
- elem_format = PIPE_FORMAT_R32G32_FLOAT;
- break;
- case 3:
- elem_format = PIPE_FORMAT_R32G32B32_FLOAT;
- break;
- case 4:
- elem_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
- break;
- default:
- assert(!"unexpected SO components length");
- elem_format = PIPE_FORMAT_R32_FLOAT;
- break;
- }
-
- ilo_gpe_init_view_surface_for_buffer(builder->dev, buf, bo_offset,
- so->buffer_size, struct_size, elem_format, false, true, &surf);
-
- return gen6_SURFACE_STATE(builder, &surf, false);
-}
-
-static inline uint32_t
gen6_SAMPLER_STATE(struct ilo_builder *builder,
- const struct ilo_sampler_cso * const *samplers,
- const struct pipe_sampler_view * const *views,
+ const struct ilo_state_sampler *samplers,
const uint32_t *sampler_border_colors,
- int num_samplers)
+ int sampler_count)
{
const int state_align = 32;
- const int state_len = 4 * num_samplers;
+ const int state_len = 4 * sampler_count;
uint32_t state_offset, *dw;
int i;
@@ -1735,9 +1304,9 @@ gen6_SAMPLER_STATE(struct ilo_builder *builder,
*
* "The sampler state is stored as an array of up to 16 elements..."
*/
- assert(num_samplers <= 16);
+ assert(sampler_count <= 16);
- if (!num_samplers)
+ if (!sampler_count)
return 0;
/*
@@ -1749,86 +1318,19 @@ gen6_SAMPLER_STATE(struct ilo_builder *builder,
*
* It also applies to other shader stages.
*/
- ilo_builder_dynamic_pad_top(builder, 4 * (4 - (num_samplers % 4)));
+ ilo_builder_dynamic_pad_top(builder, 4 * (4 - (sampler_count % 4)));
state_offset = ilo_builder_dynamic_pointer(builder,
ILO_BUILDER_ITEM_SAMPLER, state_align, state_len, &dw);
- for (i = 0; i < num_samplers; i++) {
- const struct ilo_sampler_cso *sampler = samplers[i];
- const struct pipe_sampler_view *view = views[i];
- const uint32_t border_color = sampler_border_colors[i];
- uint32_t dw_filter, dw_wrap;
-
- /* there may be holes */
- if (!sampler || !view) {
- /* disabled sampler */
- dw[0] = 1 << 31;
- dw[1] = 0;
- dw[2] = 0;
- dw[3] = 0;
- dw += 4;
-
- continue;
- }
-
- /* determine filter and wrap modes */
- switch (view->texture->target) {
- case PIPE_TEXTURE_1D:
- dw_filter = (sampler->anisotropic) ?
- sampler->dw_filter_aniso : sampler->dw_filter;
- dw_wrap = sampler->dw_wrap_1d;
- break;
- case PIPE_TEXTURE_3D:
- /*
- * From the Sandy Bridge PRM, volume 4 part 1, page 103:
- *
- * "Only MAPFILTER_NEAREST and MAPFILTER_LINEAR are supported for
- * surfaces of type SURFTYPE_3D."
- */
- dw_filter = sampler->dw_filter;
- dw_wrap = sampler->dw_wrap;
- break;
- case PIPE_TEXTURE_CUBE:
- dw_filter = (sampler->anisotropic) ?
- sampler->dw_filter_aniso : sampler->dw_filter;
- dw_wrap = sampler->dw_wrap_cube;
- break;
- default:
- dw_filter = (sampler->anisotropic) ?
- sampler->dw_filter_aniso : sampler->dw_filter;
- dw_wrap = sampler->dw_wrap;
- break;
- }
+ for (i = 0; i < sampler_count; i++) {
+ /* see sampler_set_gen6_SAMPLER_STATE() */
+ dw[0] = samplers[i].sampler[0];
+ dw[1] = samplers[i].sampler[1];
+ dw[3] = samplers[i].sampler[2];
- dw[0] = sampler->payload[0];
- dw[1] = sampler->payload[1];
- assert(!(border_color & 0x1f));
- dw[2] = border_color;
- dw[3] = sampler->payload[2];
-
- dw[0] |= dw_filter;
-
- if (ilo_dev_gen(builder->dev) >= ILO_GEN(7)) {
- dw[3] |= dw_wrap;
- }
- else {
- /*
- * From the Sandy Bridge PRM, volume 4 part 1, page 21:
- *
- * "[DevSNB] Errata: Incorrect behavior is observed in cases
- * where the min and mag mode filters are different and
- * SurfMinLOD is nonzero. The determination of MagMode uses the
- * following equation instead of the one in the above
- * pseudocode: MagMode = (LOD + SurfMinLOD - Base <= 0)"
- *
- * As a way to work around that, we set Base to
- * view->u.tex.first_level.
- */
- dw[0] |= view->u.tex.first_level << 22;
-
- dw[1] |= dw_wrap;
- }
+ assert(!(sampler_border_colors[i] & 0x1f));
+ dw[2] = sampler_border_colors[i];
dw += 4;
}
@@ -1838,7 +1340,7 @@ gen6_SAMPLER_STATE(struct ilo_builder *builder,
static inline uint32_t
gen6_SAMPLER_BORDER_COLOR_STATE(struct ilo_builder *builder,
- const struct ilo_sampler_cso *sampler)
+ const struct ilo_state_sampler_border *border)
{
const int state_align =
(ilo_dev_gen(builder->dev) >= ILO_GEN(8)) ? 64 : 32;
@@ -1846,11 +1348,12 @@ gen6_SAMPLER_BORDER_COLOR_STATE(struct ilo_builder *builder,
ILO_DEV_ASSERT(builder->dev, 6, 8);
- assert(Elements(sampler->payload) >= 3 + state_len);
-
- /* see ilo_gpe_init_sampler_cso() */
+ /*
+ * see border_set_gen6_SAMPLER_BORDER_COLOR_STATE() and
+ * border_set_gen7_SAMPLER_BORDER_COLOR_STATE()
+ */
return ilo_builder_dynamic_write(builder, ILO_BUILDER_ITEM_BLOB,
- state_align, state_len, &sampler->payload[3]);
+ state_align, state_len, border->color);
}
static inline uint32_t
diff --git a/src/gallium/drivers/ilo/core/ilo_builder_decode.c b/src/gallium/drivers/ilo/core/ilo_builder_decode.c
index cedaab1559d..c5a98c91204 100644
--- a/src/gallium/drivers/ilo/core/ilo_builder_decode.c
+++ b/src/gallium/drivers/ilo/core/ilo_builder_decode.c
@@ -319,7 +319,7 @@ writer_decode_color_calc(const struct ilo_builder *builder,
"stencil ref %d, bf stencil ref %d\n",
GEN_EXTRACT(dw, GEN6_CC_DW0_ALPHATEST) ? "FLOAT32" : "UNORM8",
(bool) (dw & GEN6_CC_DW0_ROUND_DISABLE_DISABLE),
- GEN_EXTRACT(dw, GEN6_CC_DW0_STENCIL0_REF),
+ GEN_EXTRACT(dw, GEN6_CC_DW0_STENCIL_REF),
GEN_EXTRACT(dw, GEN6_CC_DW0_STENCIL1_REF));
writer_dw(builder, which, item->offset, 1, "CC\n");
@@ -347,13 +347,13 @@ writer_decode_depth_stencil(const struct ilo_builder *builder,
dw = writer_dw(builder, which, item->offset, 0, "D_S");
ilo_printf("stencil %sable, func %d, write %sable\n",
(dw & GEN6_ZS_DW0_STENCIL_TEST_ENABLE) ? "en" : "dis",
- GEN_EXTRACT(dw, GEN6_ZS_DW0_STENCIL0_FUNC),
+ GEN_EXTRACT(dw, GEN6_ZS_DW0_STENCIL_FUNC),
(dw & GEN6_ZS_DW0_STENCIL_WRITE_ENABLE) ? "en" : "dis");
dw = writer_dw(builder, which, item->offset, 1, "D_S");
ilo_printf("stencil test mask 0x%x, write mask 0x%x\n",
- GEN_EXTRACT(dw, GEN6_ZS_DW1_STENCIL0_VALUEMASK),
- GEN_EXTRACT(dw, GEN6_ZS_DW1_STENCIL0_WRITEMASK));
+ GEN_EXTRACT(dw, GEN6_ZS_DW1_STENCIL_TEST_MASK),
+ GEN_EXTRACT(dw, GEN6_ZS_DW1_STENCIL_WRITE_MASK));
dw = writer_dw(builder, which, item->offset, 2, "D_S");
ilo_printf("depth test %sable, func %d, write %sable\n",
diff --git a/src/gallium/drivers/ilo/core/ilo_builder_media.h b/src/gallium/drivers/ilo/core/ilo_builder_media.h
index 7fbe6d41635..7197104a23e 100644
--- a/src/gallium/drivers/ilo/core/ilo_builder_media.h
+++ b/src/gallium/drivers/ilo/core/ilo_builder_media.h
@@ -29,57 +29,30 @@
#define ILO_BUILDER_MEDIA_H
#include "genhw/genhw.h"
-#include "../ilo_shader.h"
#include "intel_winsys.h"
#include "ilo_core.h"
#include "ilo_dev.h"
+#include "ilo_state_compute.h"
#include "ilo_builder.h"
-struct gen6_idrt_data {
- const struct ilo_shader_state *cs;
-
- uint32_t sampler_offset;
- uint32_t binding_table_offset;
-
- unsigned curbe_size;
- unsigned thread_group_size;
-};
-
static inline void
gen6_MEDIA_VFE_STATE(struct ilo_builder *builder,
- unsigned curbe_alloc, bool use_slm)
+ const struct ilo_state_compute *compute)
{
const uint8_t cmd_len = 8;
- const unsigned idrt_alloc =
- ((ilo_dev_gen(builder->dev) >= ILO_GEN(7.5)) ? 64 : 32) * 32;
- int max_threads;
uint32_t *dw;
- ILO_DEV_ASSERT(builder->dev, 7, 7.5);
-
- max_threads = builder->dev->thread_count;
-
- curbe_alloc = align(curbe_alloc, 32);
- assert(idrt_alloc + curbe_alloc <= builder->dev->urb_size / (use_slm + 1));
+ ILO_DEV_ASSERT(builder->dev, 6, 7.5);
ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN6_RENDER_CMD(MEDIA, MEDIA_VFE_STATE) | (cmd_len - 2);
- dw[1] = 0; /* scratch */
-
- dw[2] = (max_threads - 1) << GEN6_VFE_DW2_MAX_THREADS__SHIFT |
- 0 << GEN6_VFE_DW2_URB_ENTRY_COUNT__SHIFT |
- GEN6_VFE_DW2_RESET_GATEWAY_TIMER |
- GEN6_VFE_DW2_BYPASS_GATEWAY_CONTROL;
- if (ilo_dev_gen(builder->dev) >= ILO_GEN(7))
- dw[2] |= GEN7_VFE_DW2_GPGPU_MODE;
-
+ /* see compute_set_gen6_MEDIA_VFE_STATE() */
+ dw[1] = compute->vfe[0];
+ dw[2] = compute->vfe[1];
dw[3] = 0;
-
- dw[4] = 0 << GEN6_VFE_DW4_URB_ENTRY_SIZE__SHIFT |
- (curbe_alloc / 32);
-
+ dw[4] = compute->vfe[2];
dw[5] = 0;
dw[6] = 0;
dw[7] = 0;
@@ -194,8 +167,10 @@ gen7_GPGPU_WALKER(struct ilo_builder *builder,
static inline uint32_t
gen6_INTERFACE_DESCRIPTOR_DATA(struct ilo_builder *builder,
- const struct gen6_idrt_data *data,
- int idrt_count)
+ const struct ilo_state_compute *compute,
+ const uint32_t *kernel_offsets,
+ const uint32_t *sampler_offsets,
+ const uint32_t *binding_table_offsets)
{
/*
* From the Sandy Bridge PRM, volume 2 part 2, page 34:
@@ -211,61 +186,26 @@ gen6_INTERFACE_DESCRIPTOR_DATA(struct ilo_builder *builder,
* aligned address of the Interface Descriptor data."
*/
const int state_align = 32;
- const int state_len = (32 / 4) * idrt_count;
+ const int state_len = (32 / 4) * compute->idrt_count;
uint32_t state_offset, *dw;
int i;
- ILO_DEV_ASSERT(builder->dev, 7, 7.5);
+ ILO_DEV_ASSERT(builder->dev, 6, 7.5);
state_offset = ilo_builder_dynamic_pointer(builder,
ILO_BUILDER_ITEM_INTERFACE_DESCRIPTOR, state_align, state_len, &dw);
- for (i = 0; i < idrt_count; i++) {
- const struct gen6_idrt_data *idrt = &data[i];
- const struct ilo_shader_state *cs = idrt->cs;
- unsigned sampler_count, bt_size, slm_size;
-
- sampler_count =
- ilo_shader_get_kernel_param(cs, ILO_KERNEL_SAMPLER_COUNT);
- assert(sampler_count <= 16);
- sampler_count = (sampler_count + 3) / 4;
-
- bt_size =
- ilo_shader_get_kernel_param(cs, ILO_KERNEL_SURFACE_TOTAL_COUNT);
- if (bt_size > 31)
- bt_size = 31;
-
- slm_size = ilo_shader_get_kernel_param(cs, ILO_KERNEL_CS_LOCAL_SIZE);
-
- assert(idrt->curbe_size / 32 <= 63);
-
- dw[0] = ilo_shader_get_kernel_offset(idrt->cs);
+ for (i = 0; i < compute->idrt_count; i++) {
+ /* see compute_set_gen6_INTERFACE_DESCRIPTOR_DATA() */
+ dw[0] = compute->idrt[i][0] + kernel_offsets[i];
dw[1] = 0;
- dw[2] = idrt->sampler_offset |
- sampler_count << GEN6_IDRT_DW2_SAMPLER_COUNT__SHIFT;
- dw[3] = idrt->binding_table_offset |
- bt_size << GEN6_IDRT_DW3_BINDING_TABLE_SIZE__SHIFT;
-
- dw[4] = (idrt->curbe_size / 32) << GEN6_IDRT_DW4_CURBE_READ_LEN__SHIFT |
- 0 << GEN6_IDRT_DW4_CURBE_READ_OFFSET__SHIFT;
-
- if (ilo_dev_gen(builder->dev) >= ILO_GEN(7)) {
- dw[5] = GEN7_IDRT_DW5_ROUNDING_MODE_RTNE;
-
- if (slm_size) {
- assert(slm_size <= 64 * 1024);
- slm_size = util_next_power_of_two((slm_size + 4095) / 4096);
-
- dw[5] |= GEN7_IDRT_DW5_BARRIER_ENABLE |
- slm_size << GEN7_IDRT_DW5_SLM_SIZE__SHIFT |
- idrt->thread_group_size <<
- GEN7_IDRT_DW5_THREAD_GROUP_SIZE__SHIFT;
- }
- } else {
- dw[5] = 0;
- }
-
- dw[6] = 0;
+ dw[2] = compute->idrt[i][1] |
+ sampler_offsets[i];
+ dw[3] = compute->idrt[i][2] |
+ binding_table_offsets[i];
+ dw[4] = compute->idrt[i][3];
+ dw[5] = compute->idrt[i][4];
+ dw[6] = compute->idrt[i][5];
dw[7] = 0;
dw += 8;
diff --git a/src/gallium/drivers/ilo/core/ilo_core.h b/src/gallium/drivers/ilo/core/ilo_core.h
index 3587d3930f3..0a7f7d9d3fe 100644
--- a/src/gallium/drivers/ilo/core/ilo_core.h
+++ b/src/gallium/drivers/ilo/core/ilo_core.h
@@ -40,7 +40,4 @@
#include "util/u_memory.h"
#include "util/u_pointer.h"
-#define ILO_PRIM_RECTANGLES PIPE_PRIM_MAX
-#define ILO_PRIM_MAX (PIPE_PRIM_MAX + 1)
-
#endif /* ILO_CORE_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_debug.h b/src/gallium/drivers/ilo/core/ilo_debug.h
index d9c460498ff..9833233d796 100644
--- a/src/gallium/drivers/ilo/core/ilo_debug.h
+++ b/src/gallium/drivers/ilo/core/ilo_debug.h
@@ -100,4 +100,21 @@ ilo_warn(const char *format, ...)
#endif
}
+static inline bool
+ilo_is_zeroed(const void *ptr, size_t size)
+{
+#ifdef DEBUG
+ size_t i;
+
+ for (i = 0; i < size; i++) {
+ if (*((const char *) ptr) != 0)
+ return false;
+ }
+
+ return true;
+#else
+ return true;
+#endif
+}
+
#endif /* ILO_DEBUG_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_dev.c b/src/gallium/drivers/ilo/core/ilo_dev.c
index 7a774fa1591..925322abba4 100644
--- a/src/gallium/drivers/ilo/core/ilo_dev.c
+++ b/src/gallium/drivers/ilo/core/ilo_dev.c
@@ -32,14 +32,15 @@
#include "ilo_dev.h"
/**
- * Initialize the \p dev from \p winsys. \p winsys is considered owned by \p
- * dev and will be destroyed in \p ilo_dev_cleanup().
+ * Initialize the \p dev from \p winsys.
*/
bool
ilo_dev_init(struct ilo_dev *dev, struct intel_winsys *winsys)
{
const struct intel_winsys_info *info;
+ assert(ilo_is_zeroed(dev, sizeof(*dev)));
+
info = intel_winsys_get_info(winsys);
dev->winsys = winsys;
@@ -178,9 +179,3 @@ ilo_dev_init(struct ilo_dev *dev, struct intel_winsys *winsys)
return true;
}
-
-void
-ilo_dev_cleanup(struct ilo_dev *dev)
-{
- intel_winsys_destroy(dev->winsys);
-}
diff --git a/src/gallium/drivers/ilo/core/ilo_dev.h b/src/gallium/drivers/ilo/core/ilo_dev.h
index 4eb5d59dc86..a9f9b176e16 100644
--- a/src/gallium/drivers/ilo/core/ilo_dev.h
+++ b/src/gallium/drivers/ilo/core/ilo_dev.h
@@ -63,9 +63,6 @@ struct ilo_dev {
bool
ilo_dev_init(struct ilo_dev *dev, struct intel_winsys *winsys);
-void
-ilo_dev_cleanup(struct ilo_dev *dev);
-
static inline int
ilo_dev_gen(const struct ilo_dev *dev)
{
diff --git a/src/gallium/drivers/ilo/core/ilo_fence.h b/src/gallium/drivers/ilo/core/ilo_fence.h
deleted file mode 100644
index 00d555aa95b..00000000000
--- a/src/gallium/drivers/ilo/core/ilo_fence.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Mesa 3-D graphics library
- *
- * Copyright (C) 2012-2013 LunarG, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- * Chia-I Wu <[email protected]>
- */
-
-#ifndef ILO_FENCE_H
-#define ILO_FENCE_H
-
-#include "intel_winsys.h"
-
-#include "ilo_core.h"
-#include "ilo_dev.h"
-
-struct ilo_fence {
- struct intel_bo *seq_bo;
-};
-
-static inline void
-ilo_fence_init(struct ilo_fence *fence, const struct ilo_dev *dev)
-{
- /* no-op */
-}
-
-static inline void
-ilo_fence_cleanup(struct ilo_fence *fence)
-{
- intel_bo_unref(fence->seq_bo);
-}
-
-/**
- * Set the sequence bo for waiting. The fence is considered signaled when
- * there is no sequence bo.
- */
-static inline void
-ilo_fence_set_seq_bo(struct ilo_fence *fence, struct intel_bo *seq_bo)
-{
- intel_bo_unref(fence->seq_bo);
- fence->seq_bo = intel_bo_ref(seq_bo);
-}
-
-/**
- * Wait for the fence to be signaled or until \p timeout nanoseconds has
- * passed. It will wait indefinitely when \p timeout is negative.
- */
-static inline bool
-ilo_fence_wait(struct ilo_fence *fence, int64_t timeout)
-{
- return (!fence->seq_bo || intel_bo_wait(fence->seq_bo, timeout) == 0);
-}
-
-#endif /* ILO_FENCE_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_format.c b/src/gallium/drivers/ilo/core/ilo_format.c
deleted file mode 100644
index 280e499d54a..00000000000
--- a/src/gallium/drivers/ilo/core/ilo_format.c
+++ /dev/null
@@ -1,755 +0,0 @@
-/*
- * Mesa 3-D graphics library
- *
- * Copyright (C) 2012-2013 LunarG, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- * Chia-I Wu <[email protected]>
- */
-
-#include "genhw/genhw.h"
-#include "ilo_format.h"
-
-struct ilo_vf_cap {
- int vertex_element;
-};
-
-struct ilo_sol_cap {
- int buffer;
-};
-
-struct ilo_sampler_cap {
- int sampling;
- int filtering;
- int shadow_map;
- int chroma_key;
-};
-
-struct ilo_dp_cap {
- int rt_write;
- int rt_write_blending;
- int typed_write;
- int media_color_processing;
-};
-
-/*
- * This table is based on:
- *
- * - the Sandy Bridge PRM, volume 4 part 1, page 88-97
- * - the Ivy Bridge PRM, volume 2 part 1, page 97-99
- * - the Haswell PRM, volume 7, page 467-470
- */
-static const struct ilo_vf_cap ilo_vf_caps[] = {
-#define CAP(vertex_element) { ILO_GEN(vertex_element) }
- [GEN6_FORMAT_R32G32B32A32_FLOAT] = CAP( 1),
- [GEN6_FORMAT_R32G32B32A32_SINT] = CAP( 1),
- [GEN6_FORMAT_R32G32B32A32_UINT] = CAP( 1),
- [GEN6_FORMAT_R32G32B32A32_UNORM] = CAP( 1),
- [GEN6_FORMAT_R32G32B32A32_SNORM] = CAP( 1),
- [GEN6_FORMAT_R64G64_FLOAT] = CAP( 1),
- [GEN6_FORMAT_R32G32B32A32_SSCALED] = CAP( 1),
- [GEN6_FORMAT_R32G32B32A32_USCALED] = CAP( 1),
- [GEN6_FORMAT_R32G32B32A32_SFIXED] = CAP(7.5),
- [GEN6_FORMAT_R32G32B32_FLOAT] = CAP( 1),
- [GEN6_FORMAT_R32G32B32_SINT] = CAP( 1),
- [GEN6_FORMAT_R32G32B32_UINT] = CAP( 1),
- [GEN6_FORMAT_R32G32B32_UNORM] = CAP( 1),
- [GEN6_FORMAT_R32G32B32_SNORM] = CAP( 1),
- [GEN6_FORMAT_R32G32B32_SSCALED] = CAP( 1),
- [GEN6_FORMAT_R32G32B32_USCALED] = CAP( 1),
- [GEN6_FORMAT_R32G32B32_SFIXED] = CAP(7.5),
- [GEN6_FORMAT_R16G16B16A16_UNORM] = CAP( 1),
- [GEN6_FORMAT_R16G16B16A16_SNORM] = CAP( 1),
- [GEN6_FORMAT_R16G16B16A16_SINT] = CAP( 1),
- [GEN6_FORMAT_R16G16B16A16_UINT] = CAP( 1),
- [GEN6_FORMAT_R16G16B16A16_FLOAT] = CAP( 1),
- [GEN6_FORMAT_R32G32_FLOAT] = CAP( 1),
- [GEN6_FORMAT_R32G32_SINT] = CAP( 1),
- [GEN6_FORMAT_R32G32_UINT] = CAP( 1),
- [GEN6_FORMAT_R32G32_UNORM] = CAP( 1),
- [GEN6_FORMAT_R32G32_SNORM] = CAP( 1),
- [GEN6_FORMAT_R64_FLOAT] = CAP( 1),
- [GEN6_FORMAT_R16G16B16A16_SSCALED] = CAP( 1),
- [GEN6_FORMAT_R16G16B16A16_USCALED] = CAP( 1),
- [GEN6_FORMAT_R32G32_SSCALED] = CAP( 1),
- [GEN6_FORMAT_R32G32_USCALED] = CAP( 1),
- [GEN6_FORMAT_R32G32_SFIXED] = CAP(7.5),
- [GEN6_FORMAT_B8G8R8A8_UNORM] = CAP( 1),
- [GEN6_FORMAT_R10G10B10A2_UNORM] = CAP( 1),
- [GEN6_FORMAT_R10G10B10A2_UINT] = CAP( 1),
- [GEN6_FORMAT_R10G10B10_SNORM_A2_UNORM] = CAP( 1),
- [GEN6_FORMAT_R8G8B8A8_UNORM] = CAP( 1),
- [GEN6_FORMAT_R8G8B8A8_SNORM] = CAP( 1),
- [GEN6_FORMAT_R8G8B8A8_SINT] = CAP( 1),
- [GEN6_FORMAT_R8G8B8A8_UINT] = CAP( 1),
- [GEN6_FORMAT_R16G16_UNORM] = CAP( 1),
- [GEN6_FORMAT_R16G16_SNORM] = CAP( 1),
- [GEN6_FORMAT_R16G16_SINT] = CAP( 1),
- [GEN6_FORMAT_R16G16_UINT] = CAP( 1),
- [GEN6_FORMAT_R16G16_FLOAT] = CAP( 1),
- [GEN6_FORMAT_B10G10R10A2_UNORM] = CAP(7.5),
- [GEN6_FORMAT_R11G11B10_FLOAT] = CAP( 1),
- [GEN6_FORMAT_R32_SINT] = CAP( 1),
- [GEN6_FORMAT_R32_UINT] = CAP( 1),
- [GEN6_FORMAT_R32_FLOAT] = CAP( 1),
- [GEN6_FORMAT_R32_UNORM] = CAP( 1),
- [GEN6_FORMAT_R32_SNORM] = CAP( 1),
- [GEN6_FORMAT_R10G10B10X2_USCALED] = CAP( 1),
- [GEN6_FORMAT_R8G8B8A8_SSCALED] = CAP( 1),
- [GEN6_FORMAT_R8G8B8A8_USCALED] = CAP( 1),
- [GEN6_FORMAT_R16G16_SSCALED] = CAP( 1),
- [GEN6_FORMAT_R16G16_USCALED] = CAP( 1),
- [GEN6_FORMAT_R32_SSCALED] = CAP( 1),
- [GEN6_FORMAT_R32_USCALED] = CAP( 1),
- [GEN6_FORMAT_R8G8_UNORM] = CAP( 1),
- [GEN6_FORMAT_R8G8_SNORM] = CAP( 1),
- [GEN6_FORMAT_R8G8_SINT] = CAP( 1),
- [GEN6_FORMAT_R8G8_UINT] = CAP( 1),
- [GEN6_FORMAT_R16_UNORM] = CAP( 1),
- [GEN6_FORMAT_R16_SNORM] = CAP( 1),
- [GEN6_FORMAT_R16_SINT] = CAP( 1),
- [GEN6_FORMAT_R16_UINT] = CAP( 1),
- [GEN6_FORMAT_R16_FLOAT] = CAP( 1),
- [GEN6_FORMAT_R8G8_SSCALED] = CAP( 1),
- [GEN6_FORMAT_R8G8_USCALED] = CAP( 1),
- [GEN6_FORMAT_R16_SSCALED] = CAP( 1),
- [GEN6_FORMAT_R16_USCALED] = CAP( 1),
- [GEN6_FORMAT_R8_UNORM] = CAP( 1),
- [GEN6_FORMAT_R8_SNORM] = CAP( 1),
- [GEN6_FORMAT_R8_SINT] = CAP( 1),
- [GEN6_FORMAT_R8_UINT] = CAP( 1),
- [GEN6_FORMAT_R8_SSCALED] = CAP( 1),
- [GEN6_FORMAT_R8_USCALED] = CAP( 1),
- [GEN6_FORMAT_R8G8B8_UNORM] = CAP( 1),
- [GEN6_FORMAT_R8G8B8_SNORM] = CAP( 1),
- [GEN6_FORMAT_R8G8B8_SSCALED] = CAP( 1),
- [GEN6_FORMAT_R8G8B8_USCALED] = CAP( 1),
- [GEN6_FORMAT_R64G64B64A64_FLOAT] = CAP( 1),
- [GEN6_FORMAT_R64G64B64_FLOAT] = CAP( 1),
- [GEN6_FORMAT_R16G16B16_FLOAT] = CAP( 6),
- [GEN6_FORMAT_R16G16B16_UNORM] = CAP( 1),
- [GEN6_FORMAT_R16G16B16_SNORM] = CAP( 1),
- [GEN6_FORMAT_R16G16B16_SSCALED] = CAP( 1),
- [GEN6_FORMAT_R16G16B16_USCALED] = CAP( 1),
- [GEN6_FORMAT_R16G16B16_UINT] = CAP(7.5),
- [GEN6_FORMAT_R16G16B16_SINT] = CAP(7.5),
- [GEN6_FORMAT_R32_SFIXED] = CAP(7.5),
- [GEN6_FORMAT_R10G10B10A2_SNORM] = CAP(7.5),
- [GEN6_FORMAT_R10G10B10A2_USCALED] = CAP(7.5),
- [GEN6_FORMAT_R10G10B10A2_SSCALED] = CAP(7.5),
- [GEN6_FORMAT_R10G10B10A2_SINT] = CAP(7.5),
- [GEN6_FORMAT_B10G10R10A2_SNORM] = CAP(7.5),
- [GEN6_FORMAT_B10G10R10A2_USCALED] = CAP(7.5),
- [GEN6_FORMAT_B10G10R10A2_SSCALED] = CAP(7.5),
- [GEN6_FORMAT_B10G10R10A2_UINT] = CAP(7.5),
- [GEN6_FORMAT_B10G10R10A2_SINT] = CAP(7.5),
- [GEN6_FORMAT_R8G8B8_UINT] = CAP(7.5),
- [GEN6_FORMAT_R8G8B8_SINT] = CAP(7.5),
-#undef CAP
-};
-
-/*
- * This table is based on:
- *
- * - the Sandy Bridge PRM, volume 4 part 1, page 88-97
- * - the Ivy Bridge PRM, volume 2 part 1, page 195
- * - the Haswell PRM, volume 7, page 535
- */
-static const struct ilo_sol_cap ilo_sol_caps[] = {
-#define CAP(buffer) { ILO_GEN(buffer) }
- [GEN6_FORMAT_R32G32B32A32_FLOAT] = CAP( 1),
- [GEN6_FORMAT_R32G32B32A32_SINT] = CAP( 1),
- [GEN6_FORMAT_R32G32B32A32_UINT] = CAP( 1),
- [GEN6_FORMAT_R32G32B32_FLOAT] = CAP( 1),
- [GEN6_FORMAT_R32G32B32_SINT] = CAP( 1),
- [GEN6_FORMAT_R32G32B32_UINT] = CAP( 1),
- [GEN6_FORMAT_R32G32_FLOAT] = CAP( 1),
- [GEN6_FORMAT_R32G32_SINT] = CAP( 1),
- [GEN6_FORMAT_R32G32_UINT] = CAP( 1),
- [GEN6_FORMAT_R32_SINT] = CAP( 1),
- [GEN6_FORMAT_R32_UINT] = CAP( 1),
- [GEN6_FORMAT_R32_FLOAT] = CAP( 1),
-#undef CAP
-};
-
-/*
- * This table is based on:
- *
- * - the Sandy Bridge PRM, volume 4 part 1, page 88-97
- * - the Ivy Bridge PRM, volume 4 part 1, page 84-87
- */
-static const struct ilo_sampler_cap ilo_sampler_caps[] = {
-#define CAP(sampling, filtering, shadow_map, chroma_key) \
- { ILO_GEN(sampling), ILO_GEN(filtering), ILO_GEN(shadow_map), ILO_GEN(chroma_key) }
- [GEN6_FORMAT_R32G32B32A32_FLOAT] = CAP( 1, 5, 0, 0),
- [GEN6_FORMAT_R32G32B32A32_SINT] = CAP( 1, 0, 0, 0),
- [GEN6_FORMAT_R32G32B32A32_UINT] = CAP( 1, 0, 0, 0),
- [GEN6_FORMAT_R32G32B32X32_FLOAT] = CAP( 1, 5, 0, 0),
- [GEN6_FORMAT_R32G32B32_FLOAT] = CAP( 1, 5, 0, 0),
- [GEN6_FORMAT_R32G32B32_SINT] = CAP( 1, 0, 0, 0),
- [GEN6_FORMAT_R32G32B32_UINT] = CAP( 1, 0, 0, 0),
- [GEN6_FORMAT_R16G16B16A16_UNORM] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_R16G16B16A16_SNORM] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_R16G16B16A16_SINT] = CAP( 1, 0, 0, 0),
- [GEN6_FORMAT_R16G16B16A16_UINT] = CAP( 1, 0, 0, 0),
- [GEN6_FORMAT_R16G16B16A16_FLOAT] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_R32G32_FLOAT] = CAP( 1, 5, 0, 0),
- [GEN6_FORMAT_R32G32_SINT] = CAP( 1, 0, 0, 0),
- [GEN6_FORMAT_R32G32_UINT] = CAP( 1, 0, 0, 0),
- [GEN6_FORMAT_R32_FLOAT_X8X24_TYPELESS] = CAP( 1, 5, 1, 0),
- [GEN6_FORMAT_X32_TYPELESS_G8X24_UINT] = CAP( 1, 0, 0, 0),
- [GEN6_FORMAT_L32A32_FLOAT] = CAP( 1, 5, 0, 0),
- [GEN6_FORMAT_R16G16B16X16_UNORM] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_R16G16B16X16_FLOAT] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_A32X32_FLOAT] = CAP( 1, 5, 0, 0),
- [GEN6_FORMAT_L32X32_FLOAT] = CAP( 1, 5, 0, 0),
- [GEN6_FORMAT_I32X32_FLOAT] = CAP( 1, 5, 0, 0),
- [GEN6_FORMAT_B8G8R8A8_UNORM] = CAP( 1, 1, 0, 1),
- [GEN6_FORMAT_B8G8R8A8_UNORM_SRGB] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_R10G10B10A2_UNORM] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_R10G10B10A2_UNORM_SRGB] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_R10G10B10A2_UINT] = CAP( 1, 0, 0, 0),
- [GEN6_FORMAT_R10G10B10_SNORM_A2_UNORM] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_R8G8B8A8_UNORM] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_R8G8B8A8_UNORM_SRGB] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_R8G8B8A8_SNORM] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_R8G8B8A8_SINT] = CAP( 1, 0, 0, 0),
- [GEN6_FORMAT_R8G8B8A8_UINT] = CAP( 1, 0, 0, 0),
- [GEN6_FORMAT_R16G16_UNORM] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_R16G16_SNORM] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_R16G16_SINT] = CAP( 1, 0, 0, 0),
- [GEN6_FORMAT_R16G16_UINT] = CAP( 1, 0, 0, 0),
- [GEN6_FORMAT_R16G16_FLOAT] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_B10G10R10A2_UNORM] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_B10G10R10A2_UNORM_SRGB] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_R11G11B10_FLOAT] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_R32_SINT] = CAP( 1, 0, 0, 0),
- [GEN6_FORMAT_R32_UINT] = CAP( 1, 0, 0, 0),
- [GEN6_FORMAT_R32_FLOAT] = CAP( 1, 5, 1, 0),
- [GEN6_FORMAT_R24_UNORM_X8_TYPELESS] = CAP( 1, 5, 1, 0),
- [GEN6_FORMAT_X24_TYPELESS_G8_UINT] = CAP( 1, 0, 0, 0),
- [GEN6_FORMAT_L16A16_UNORM] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_I24X8_UNORM] = CAP( 1, 5, 1, 0),
- [GEN6_FORMAT_L24X8_UNORM] = CAP( 1, 5, 1, 0),
- [GEN6_FORMAT_A24X8_UNORM] = CAP( 1, 5, 1, 0),
- [GEN6_FORMAT_I32_FLOAT] = CAP( 1, 5, 1, 0),
- [GEN6_FORMAT_L32_FLOAT] = CAP( 1, 5, 1, 0),
- [GEN6_FORMAT_A32_FLOAT] = CAP( 1, 5, 1, 0),
- [GEN6_FORMAT_B8G8R8X8_UNORM] = CAP( 1, 1, 0, 1),
- [GEN6_FORMAT_B8G8R8X8_UNORM_SRGB] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_R8G8B8X8_UNORM] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_R8G8B8X8_UNORM_SRGB] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_R9G9B9E5_SHAREDEXP] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_B10G10R10X2_UNORM] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_L16A16_FLOAT] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_B5G6R5_UNORM] = CAP( 1, 1, 0, 1),
- [GEN6_FORMAT_B5G6R5_UNORM_SRGB] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_B5G5R5A1_UNORM] = CAP( 1, 1, 0, 1),
- [GEN6_FORMAT_B5G5R5A1_UNORM_SRGB] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_B4G4R4A4_UNORM] = CAP( 1, 1, 0, 1),
- [GEN6_FORMAT_B4G4R4A4_UNORM_SRGB] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_R8G8_UNORM] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_R8G8_SNORM] = CAP( 1, 1, 0, 1),
- [GEN6_FORMAT_R8G8_SINT] = CAP( 1, 0, 0, 0),
- [GEN6_FORMAT_R8G8_UINT] = CAP( 1, 0, 0, 0),
- [GEN6_FORMAT_R16_UNORM] = CAP( 1, 1, 1, 0),
- [GEN6_FORMAT_R16_SNORM] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_R16_SINT] = CAP( 1, 0, 0, 0),
- [GEN6_FORMAT_R16_UINT] = CAP( 1, 0, 0, 0),
- [GEN6_FORMAT_R16_FLOAT] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_A8P8_UNORM_PALETTE0] = CAP( 5, 5, 0, 0),
- [GEN6_FORMAT_A8P8_UNORM_PALETTE1] = CAP( 5, 5, 0, 0),
- [GEN6_FORMAT_I16_UNORM] = CAP( 1, 1, 1, 0),
- [GEN6_FORMAT_L16_UNORM] = CAP( 1, 1, 1, 0),
- [GEN6_FORMAT_A16_UNORM] = CAP( 1, 1, 1, 0),
- [GEN6_FORMAT_L8A8_UNORM] = CAP( 1, 1, 0, 1),
- [GEN6_FORMAT_I16_FLOAT] = CAP( 1, 1, 1, 0),
- [GEN6_FORMAT_L16_FLOAT] = CAP( 1, 1, 1, 0),
- [GEN6_FORMAT_A16_FLOAT] = CAP( 1, 1, 1, 0),
- [GEN6_FORMAT_L8A8_UNORM_SRGB] = CAP(4.5, 4.5, 0, 0),
- [GEN6_FORMAT_R5G5_SNORM_B6_UNORM] = CAP( 1, 1, 0, 1),
- [GEN6_FORMAT_P8A8_UNORM_PALETTE0] = CAP( 5, 5, 0, 0),
- [GEN6_FORMAT_P8A8_UNORM_PALETTE1] = CAP( 5, 5, 0, 0),
- [GEN6_FORMAT_R8_UNORM] = CAP( 1, 1, 0, 4.5),
- [GEN6_FORMAT_R8_SNORM] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_R8_SINT] = CAP( 1, 0, 0, 0),
- [GEN6_FORMAT_R8_UINT] = CAP( 1, 0, 0, 0),
- [GEN6_FORMAT_A8_UNORM] = CAP( 1, 1, 0, 1),
- [GEN6_FORMAT_I8_UNORM] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_L8_UNORM] = CAP( 1, 1, 0, 1),
- [GEN6_FORMAT_P4A4_UNORM_PALETTE0] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_A4P4_UNORM_PALETTE0] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_P8_UNORM_PALETTE0] = CAP(4.5, 4.5, 0, 0),
- [GEN6_FORMAT_L8_UNORM_SRGB] = CAP(4.5, 4.5, 0, 0),
- [GEN6_FORMAT_P8_UNORM_PALETTE1] = CAP(4.5, 4.5, 0, 0),
- [GEN6_FORMAT_P4A4_UNORM_PALETTE1] = CAP(4.5, 4.5, 0, 0),
- [GEN6_FORMAT_A4P4_UNORM_PALETTE1] = CAP(4.5, 4.5, 0, 0),
- [GEN6_FORMAT_DXT1_RGB_SRGB] = CAP(4.5, 4.5, 0, 0),
- [GEN6_FORMAT_R1_UNORM] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_YCRCB_NORMAL] = CAP( 1, 1, 0, 1),
- [GEN6_FORMAT_YCRCB_SWAPUVY] = CAP( 1, 1, 0, 1),
- [GEN6_FORMAT_P2_UNORM_PALETTE0] = CAP(4.5, 4.5, 0, 0),
- [GEN6_FORMAT_P2_UNORM_PALETTE1] = CAP(4.5, 4.5, 0, 0),
- [GEN6_FORMAT_BC1_UNORM] = CAP( 1, 1, 0, 1),
- [GEN6_FORMAT_BC2_UNORM] = CAP( 1, 1, 0, 1),
- [GEN6_FORMAT_BC3_UNORM] = CAP( 1, 1, 0, 1),
- [GEN6_FORMAT_BC4_UNORM] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_BC5_UNORM] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_BC1_UNORM_SRGB] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_BC2_UNORM_SRGB] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_BC3_UNORM_SRGB] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_MONO8] = CAP( 1, 0, 0, 0),
- [GEN6_FORMAT_YCRCB_SWAPUV] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_YCRCB_SWAPY] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_DXT1_RGB] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_FXT1] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_BC4_SNORM] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_BC5_SNORM] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_R16G16B16_FLOAT] = CAP( 5, 5, 0, 0),
- [GEN6_FORMAT_BC6H_SF16] = CAP( 7, 7, 0, 0),
- [GEN6_FORMAT_BC7_UNORM] = CAP( 7, 7, 0, 0),
- [GEN6_FORMAT_BC7_UNORM_SRGB] = CAP( 7, 7, 0, 0),
- [GEN6_FORMAT_BC6H_UF16] = CAP( 7, 7, 0, 0),
-#undef CAP
-};
-
-/*
- * This table is based on:
- *
- * - the Sandy Bridge PRM, volume 4 part 1, page 88-97
- * - the Ivy Bridge PRM, volume 4 part 1, page 172, 252-253, and 277-278
- * - the Haswell PRM, volume 7, page 262-264
- */
-static const struct ilo_dp_cap ilo_dp_caps[] = {
-#define CAP(rt_write, rt_write_blending, typed_write, media_color_processing) \
- { ILO_GEN(rt_write), ILO_GEN(rt_write_blending), ILO_GEN(typed_write), ILO_GEN(media_color_processing) }
- [GEN6_FORMAT_R32G32B32A32_FLOAT] = CAP( 1, 1, 7, 0),
- [GEN6_FORMAT_R32G32B32A32_SINT] = CAP( 1, 0, 7, 0),
- [GEN6_FORMAT_R32G32B32A32_UINT] = CAP( 1, 0, 7, 0),
- [GEN6_FORMAT_R16G16B16A16_UNORM] = CAP( 1, 4.5, 7, 6),
- [GEN6_FORMAT_R16G16B16A16_SNORM] = CAP( 1, 6, 7, 0),
- [GEN6_FORMAT_R16G16B16A16_SINT] = CAP( 1, 0, 7, 0),
- [GEN6_FORMAT_R16G16B16A16_UINT] = CAP( 1, 0, 7, 0),
- [GEN6_FORMAT_R16G16B16A16_FLOAT] = CAP( 1, 1, 7, 0),
- [GEN6_FORMAT_R32G32_FLOAT] = CAP( 1, 1, 7, 0),
- [GEN6_FORMAT_R32G32_SINT] = CAP( 1, 0, 7, 0),
- [GEN6_FORMAT_R32G32_UINT] = CAP( 1, 0, 7, 0),
- [GEN6_FORMAT_B8G8R8A8_UNORM] = CAP( 1, 1, 7, 6),
- [GEN6_FORMAT_B8G8R8A8_UNORM_SRGB] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_R10G10B10A2_UNORM] = CAP( 1, 1, 7, 6),
- [GEN6_FORMAT_R10G10B10A2_UNORM_SRGB] = CAP( 0, 0, 0, 6),
- [GEN6_FORMAT_R10G10B10A2_UINT] = CAP( 1, 0, 7, 0),
- [GEN6_FORMAT_R8G8B8A8_UNORM] = CAP( 1, 1, 7, 6),
- [GEN6_FORMAT_R8G8B8A8_UNORM_SRGB] = CAP( 1, 1, 0, 6),
- [GEN6_FORMAT_R8G8B8A8_SNORM] = CAP( 1, 6, 7, 0),
- [GEN6_FORMAT_R8G8B8A8_SINT] = CAP( 1, 0, 7, 0),
- [GEN6_FORMAT_R8G8B8A8_UINT] = CAP( 1, 0, 7, 0),
- [GEN6_FORMAT_R16G16_UNORM] = CAP( 1, 4.5, 7, 0),
- [GEN6_FORMAT_R16G16_SNORM] = CAP( 1, 6, 7, 0),
- [GEN6_FORMAT_R16G16_SINT] = CAP( 1, 0, 7, 0),
- [GEN6_FORMAT_R16G16_UINT] = CAP( 1, 0, 7, 0),
- [GEN6_FORMAT_R16G16_FLOAT] = CAP( 1, 1, 7, 0),
- [GEN6_FORMAT_B10G10R10A2_UNORM] = CAP( 1, 1, 7, 6),
- [GEN6_FORMAT_B10G10R10A2_UNORM_SRGB] = CAP( 1, 1, 0, 6),
- [GEN6_FORMAT_R11G11B10_FLOAT] = CAP( 1, 1, 7, 0),
- [GEN6_FORMAT_R32_SINT] = CAP( 1, 0, 7, 0),
- [GEN6_FORMAT_R32_UINT] = CAP( 1, 0, 7, 0),
- [GEN6_FORMAT_R32_FLOAT] = CAP( 1, 1, 7, 0),
- [GEN6_FORMAT_B8G8R8X8_UNORM] = CAP( 0, 0, 0, 6),
- [GEN6_FORMAT_B5G6R5_UNORM] = CAP( 1, 1, 7, 0),
- [GEN6_FORMAT_B5G6R5_UNORM_SRGB] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_B5G5R5A1_UNORM] = CAP( 1, 1, 7, 0),
- [GEN6_FORMAT_B5G5R5A1_UNORM_SRGB] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_B4G4R4A4_UNORM] = CAP( 1, 1, 7, 0),
- [GEN6_FORMAT_B4G4R4A4_UNORM_SRGB] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_R8G8_UNORM] = CAP( 1, 1, 7, 0),
- [GEN6_FORMAT_R8G8_SNORM] = CAP( 1, 6, 7, 0),
- [GEN6_FORMAT_R8G8_SINT] = CAP( 1, 0, 7, 0),
- [GEN6_FORMAT_R8G8_UINT] = CAP( 1, 0, 7, 0),
- [GEN6_FORMAT_R16_UNORM] = CAP( 1, 4.5, 7, 7),
- [GEN6_FORMAT_R16_SNORM] = CAP( 1, 6, 7, 0),
- [GEN6_FORMAT_R16_SINT] = CAP( 1, 0, 7, 0),
- [GEN6_FORMAT_R16_UINT] = CAP( 1, 0, 7, 0),
- [GEN6_FORMAT_R16_FLOAT] = CAP( 1, 1, 7, 0),
- [GEN6_FORMAT_B5G5R5X1_UNORM] = CAP( 1, 1, 7, 0),
- [GEN6_FORMAT_B5G5R5X1_UNORM_SRGB] = CAP( 1, 1, 0, 0),
- [GEN6_FORMAT_R8_UNORM] = CAP( 1, 1, 7, 0),
- [GEN6_FORMAT_R8_SNORM] = CAP( 1, 6, 7, 0),
- [GEN6_FORMAT_R8_SINT] = CAP( 1, 0, 7, 0),
- [GEN6_FORMAT_R8_UINT] = CAP( 1, 0, 7, 0),
- [GEN6_FORMAT_A8_UNORM] = CAP( 1, 1, 7, 0),
- [GEN6_FORMAT_YCRCB_NORMAL] = CAP( 1, 0, 0, 6),
- [GEN6_FORMAT_YCRCB_SWAPUVY] = CAP( 1, 0, 0, 6),
- [GEN6_FORMAT_YCRCB_SWAPUV] = CAP( 1, 0, 0, 6),
- [GEN6_FORMAT_YCRCB_SWAPY] = CAP( 1, 0, 0, 6),
-#undef CAP
-};
-
-bool
-ilo_format_support_vb(const struct ilo_dev *dev,
- enum pipe_format format)
-{
- const int idx = ilo_format_translate(dev, format, PIPE_BIND_VERTEX_BUFFER);
- const struct ilo_vf_cap *cap = (idx >= 0 && idx < Elements(ilo_vf_caps)) ?
- &ilo_vf_caps[idx] : NULL;
-
- return (cap && cap->vertex_element &&
- ilo_dev_gen(dev) >= cap->vertex_element);
-}
-
-bool
-ilo_format_support_sol(const struct ilo_dev *dev,
- enum pipe_format format)
-{
- const int idx = ilo_format_translate(dev, format, PIPE_BIND_STREAM_OUTPUT);
- const struct ilo_sol_cap *cap = (idx >= 0 && idx < Elements(ilo_sol_caps)) ?
- &ilo_sol_caps[idx] : NULL;
-
- return (cap && cap->buffer && ilo_dev_gen(dev) >= cap->buffer);
-}
-
-bool
-ilo_format_support_sampler(const struct ilo_dev *dev,
- enum pipe_format format)
-{
- const int idx = ilo_format_translate(dev, format, PIPE_BIND_SAMPLER_VIEW);
- const struct ilo_sampler_cap *cap = (idx >= 0 &&
- idx < Elements(ilo_sampler_caps)) ? &ilo_sampler_caps[idx] : NULL;
-
- if (!cap || !cap->sampling)
- return false;
-
- assert(!cap->filtering || cap->filtering >= cap->sampling);
-
- if (util_format_is_pure_integer(format))
- return (ilo_dev_gen(dev) >= cap->sampling);
- else if (cap->filtering)
- return (ilo_dev_gen(dev) >= cap->filtering);
- else
- return false;
-}
-
-bool
-ilo_format_support_rt(const struct ilo_dev *dev,
- enum pipe_format format)
-{
- const int idx = ilo_format_translate(dev, format, PIPE_BIND_RENDER_TARGET);
- const struct ilo_dp_cap *cap = (idx >= 0 && idx < Elements(ilo_dp_caps)) ?
- &ilo_dp_caps[idx] : NULL;
-
- if (!cap || !cap->rt_write)
- return false;
-
- assert(!cap->rt_write_blending || cap->rt_write_blending >= cap->rt_write);
-
- if (util_format_is_pure_integer(format))
- return (ilo_dev_gen(dev) >= cap->rt_write);
- else if (cap->rt_write_blending)
- return (ilo_dev_gen(dev) >= cap->rt_write_blending);
- else
- return false;
-}
-
-bool
-ilo_format_support_zs(const struct ilo_dev *dev,
- enum pipe_format format)
-{
- switch (format) {
- case PIPE_FORMAT_Z16_UNORM:
- case PIPE_FORMAT_Z24X8_UNORM:
- case PIPE_FORMAT_Z32_FLOAT:
- case PIPE_FORMAT_Z24_UNORM_S8_UINT:
- case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
- return true;
- case PIPE_FORMAT_S8_UINT:
- /* TODO separate stencil */
- default:
- return false;
- }
-}
-
-/**
- * Translate a color (non-depth/stencil) pipe format to the matching hardware
- * format. Return -1 on errors.
- */
-int
-ilo_format_translate_color(const struct ilo_dev *dev,
- enum pipe_format format)
-{
- static const int format_mapping[PIPE_FORMAT_COUNT] = {
- [PIPE_FORMAT_NONE] = 0,
- [PIPE_FORMAT_B8G8R8A8_UNORM] = GEN6_FORMAT_B8G8R8A8_UNORM,
- [PIPE_FORMAT_B8G8R8X8_UNORM] = GEN6_FORMAT_B8G8R8X8_UNORM,
- [PIPE_FORMAT_A8R8G8B8_UNORM] = 0,
- [PIPE_FORMAT_X8R8G8B8_UNORM] = 0,
- [PIPE_FORMAT_B5G5R5A1_UNORM] = GEN6_FORMAT_B5G5R5A1_UNORM,
- [PIPE_FORMAT_B4G4R4A4_UNORM] = GEN6_FORMAT_B4G4R4A4_UNORM,
- [PIPE_FORMAT_B5G6R5_UNORM] = GEN6_FORMAT_B5G6R5_UNORM,
- [PIPE_FORMAT_R10G10B10A2_UNORM] = GEN6_FORMAT_R10G10B10A2_UNORM,
- [PIPE_FORMAT_L8_UNORM] = GEN6_FORMAT_L8_UNORM,
- [PIPE_FORMAT_A8_UNORM] = GEN6_FORMAT_A8_UNORM,
- [PIPE_FORMAT_I8_UNORM] = GEN6_FORMAT_I8_UNORM,
- [PIPE_FORMAT_L8A8_UNORM] = GEN6_FORMAT_L8A8_UNORM,
- [PIPE_FORMAT_L16_UNORM] = GEN6_FORMAT_L16_UNORM,
- [PIPE_FORMAT_UYVY] = GEN6_FORMAT_YCRCB_SWAPUVY,
- [PIPE_FORMAT_YUYV] = GEN6_FORMAT_YCRCB_NORMAL,
- [PIPE_FORMAT_Z16_UNORM] = 0,
- [PIPE_FORMAT_Z32_UNORM] = 0,
- [PIPE_FORMAT_Z32_FLOAT] = 0,
- [PIPE_FORMAT_Z24_UNORM_S8_UINT] = 0,
- [PIPE_FORMAT_S8_UINT_Z24_UNORM] = 0,
- [PIPE_FORMAT_Z24X8_UNORM] = 0,
- [PIPE_FORMAT_X8Z24_UNORM] = 0,
- [PIPE_FORMAT_S8_UINT] = 0,
- [PIPE_FORMAT_R64_FLOAT] = GEN6_FORMAT_R64_FLOAT,
- [PIPE_FORMAT_R64G64_FLOAT] = GEN6_FORMAT_R64G64_FLOAT,
- [PIPE_FORMAT_R64G64B64_FLOAT] = GEN6_FORMAT_R64G64B64_FLOAT,
- [PIPE_FORMAT_R64G64B64A64_FLOAT] = GEN6_FORMAT_R64G64B64A64_FLOAT,
- [PIPE_FORMAT_R32_FLOAT] = GEN6_FORMAT_R32_FLOAT,
- [PIPE_FORMAT_R32G32_FLOAT] = GEN6_FORMAT_R32G32_FLOAT,
- [PIPE_FORMAT_R32G32B32_FLOAT] = GEN6_FORMAT_R32G32B32_FLOAT,
- [PIPE_FORMAT_R32G32B32A32_FLOAT] = GEN6_FORMAT_R32G32B32A32_FLOAT,
- [PIPE_FORMAT_R32_UNORM] = GEN6_FORMAT_R32_UNORM,
- [PIPE_FORMAT_R32G32_UNORM] = GEN6_FORMAT_R32G32_UNORM,
- [PIPE_FORMAT_R32G32B32_UNORM] = GEN6_FORMAT_R32G32B32_UNORM,
- [PIPE_FORMAT_R32G32B32A32_UNORM] = GEN6_FORMAT_R32G32B32A32_UNORM,
- [PIPE_FORMAT_R32_USCALED] = GEN6_FORMAT_R32_USCALED,
- [PIPE_FORMAT_R32G32_USCALED] = GEN6_FORMAT_R32G32_USCALED,
- [PIPE_FORMAT_R32G32B32_USCALED] = GEN6_FORMAT_R32G32B32_USCALED,
- [PIPE_FORMAT_R32G32B32A32_USCALED] = GEN6_FORMAT_R32G32B32A32_USCALED,
- [PIPE_FORMAT_R32_SNORM] = GEN6_FORMAT_R32_SNORM,
- [PIPE_FORMAT_R32G32_SNORM] = GEN6_FORMAT_R32G32_SNORM,
- [PIPE_FORMAT_R32G32B32_SNORM] = GEN6_FORMAT_R32G32B32_SNORM,
- [PIPE_FORMAT_R32G32B32A32_SNORM] = GEN6_FORMAT_R32G32B32A32_SNORM,
- [PIPE_FORMAT_R32_SSCALED] = GEN6_FORMAT_R32_SSCALED,
- [PIPE_FORMAT_R32G32_SSCALED] = GEN6_FORMAT_R32G32_SSCALED,
- [PIPE_FORMAT_R32G32B32_SSCALED] = GEN6_FORMAT_R32G32B32_SSCALED,
- [PIPE_FORMAT_R32G32B32A32_SSCALED] = GEN6_FORMAT_R32G32B32A32_SSCALED,
- [PIPE_FORMAT_R16_UNORM] = GEN6_FORMAT_R16_UNORM,
- [PIPE_FORMAT_R16G16_UNORM] = GEN6_FORMAT_R16G16_UNORM,
- [PIPE_FORMAT_R16G16B16_UNORM] = GEN6_FORMAT_R16G16B16_UNORM,
- [PIPE_FORMAT_R16G16B16A16_UNORM] = GEN6_FORMAT_R16G16B16A16_UNORM,
- [PIPE_FORMAT_R16_USCALED] = GEN6_FORMAT_R16_USCALED,
- [PIPE_FORMAT_R16G16_USCALED] = GEN6_FORMAT_R16G16_USCALED,
- [PIPE_FORMAT_R16G16B16_USCALED] = GEN6_FORMAT_R16G16B16_USCALED,
- [PIPE_FORMAT_R16G16B16A16_USCALED] = GEN6_FORMAT_R16G16B16A16_USCALED,
- [PIPE_FORMAT_R16_SNORM] = GEN6_FORMAT_R16_SNORM,
- [PIPE_FORMAT_R16G16_SNORM] = GEN6_FORMAT_R16G16_SNORM,
- [PIPE_FORMAT_R16G16B16_SNORM] = GEN6_FORMAT_R16G16B16_SNORM,
- [PIPE_FORMAT_R16G16B16A16_SNORM] = GEN6_FORMAT_R16G16B16A16_SNORM,
- [PIPE_FORMAT_R16_SSCALED] = GEN6_FORMAT_R16_SSCALED,
- [PIPE_FORMAT_R16G16_SSCALED] = GEN6_FORMAT_R16G16_SSCALED,
- [PIPE_FORMAT_R16G16B16_SSCALED] = GEN6_FORMAT_R16G16B16_SSCALED,
- [PIPE_FORMAT_R16G16B16A16_SSCALED] = GEN6_FORMAT_R16G16B16A16_SSCALED,
- [PIPE_FORMAT_R8_UNORM] = GEN6_FORMAT_R8_UNORM,
- [PIPE_FORMAT_R8G8_UNORM] = GEN6_FORMAT_R8G8_UNORM,
- [PIPE_FORMAT_R8G8B8_UNORM] = GEN6_FORMAT_R8G8B8_UNORM,
- [PIPE_FORMAT_R8G8B8A8_UNORM] = GEN6_FORMAT_R8G8B8A8_UNORM,
- [PIPE_FORMAT_X8B8G8R8_UNORM] = 0,
- [PIPE_FORMAT_R8_USCALED] = GEN6_FORMAT_R8_USCALED,
- [PIPE_FORMAT_R8G8_USCALED] = GEN6_FORMAT_R8G8_USCALED,
- [PIPE_FORMAT_R8G8B8_USCALED] = GEN6_FORMAT_R8G8B8_USCALED,
- [PIPE_FORMAT_R8G8B8A8_USCALED] = GEN6_FORMAT_R8G8B8A8_USCALED,
- [PIPE_FORMAT_R8_SNORM] = GEN6_FORMAT_R8_SNORM,
- [PIPE_FORMAT_R8G8_SNORM] = GEN6_FORMAT_R8G8_SNORM,
- [PIPE_FORMAT_R8G8B8_SNORM] = GEN6_FORMAT_R8G8B8_SNORM,
- [PIPE_FORMAT_R8G8B8A8_SNORM] = GEN6_FORMAT_R8G8B8A8_SNORM,
- [PIPE_FORMAT_R8_SSCALED] = GEN6_FORMAT_R8_SSCALED,
- [PIPE_FORMAT_R8G8_SSCALED] = GEN6_FORMAT_R8G8_SSCALED,
- [PIPE_FORMAT_R8G8B8_SSCALED] = GEN6_FORMAT_R8G8B8_SSCALED,
- [PIPE_FORMAT_R8G8B8A8_SSCALED] = GEN6_FORMAT_R8G8B8A8_SSCALED,
- [PIPE_FORMAT_R32_FIXED] = GEN6_FORMAT_R32_SFIXED,
- [PIPE_FORMAT_R32G32_FIXED] = GEN6_FORMAT_R32G32_SFIXED,
- [PIPE_FORMAT_R32G32B32_FIXED] = GEN6_FORMAT_R32G32B32_SFIXED,
- [PIPE_FORMAT_R32G32B32A32_FIXED] = GEN6_FORMAT_R32G32B32A32_SFIXED,
- [PIPE_FORMAT_R16_FLOAT] = GEN6_FORMAT_R16_FLOAT,
- [PIPE_FORMAT_R16G16_FLOAT] = GEN6_FORMAT_R16G16_FLOAT,
- [PIPE_FORMAT_R16G16B16_FLOAT] = GEN6_FORMAT_R16G16B16_FLOAT,
- [PIPE_FORMAT_R16G16B16A16_FLOAT] = GEN6_FORMAT_R16G16B16A16_FLOAT,
- [PIPE_FORMAT_L8_SRGB] = GEN6_FORMAT_L8_UNORM_SRGB,
- [PIPE_FORMAT_L8A8_SRGB] = GEN6_FORMAT_L8A8_UNORM_SRGB,
- [PIPE_FORMAT_R8G8B8_SRGB] = GEN6_FORMAT_R8G8B8_UNORM_SRGB,
- [PIPE_FORMAT_A8B8G8R8_SRGB] = 0,
- [PIPE_FORMAT_X8B8G8R8_SRGB] = 0,
- [PIPE_FORMAT_B8G8R8A8_SRGB] = GEN6_FORMAT_B8G8R8A8_UNORM_SRGB,
- [PIPE_FORMAT_B8G8R8X8_SRGB] = GEN6_FORMAT_B8G8R8X8_UNORM_SRGB,
- [PIPE_FORMAT_A8R8G8B8_SRGB] = 0,
- [PIPE_FORMAT_X8R8G8B8_SRGB] = 0,
- [PIPE_FORMAT_R8G8B8A8_SRGB] = GEN6_FORMAT_R8G8B8A8_UNORM_SRGB,
- [PIPE_FORMAT_DXT1_RGB] = GEN6_FORMAT_DXT1_RGB,
- [PIPE_FORMAT_DXT1_RGBA] = GEN6_FORMAT_BC1_UNORM,
- [PIPE_FORMAT_DXT3_RGBA] = GEN6_FORMAT_BC2_UNORM,
- [PIPE_FORMAT_DXT5_RGBA] = GEN6_FORMAT_BC3_UNORM,
- [PIPE_FORMAT_DXT1_SRGB] = GEN6_FORMAT_DXT1_RGB_SRGB,
- [PIPE_FORMAT_DXT1_SRGBA] = GEN6_FORMAT_BC1_UNORM_SRGB,
- [PIPE_FORMAT_DXT3_SRGBA] = GEN6_FORMAT_BC2_UNORM_SRGB,
- [PIPE_FORMAT_DXT5_SRGBA] = GEN6_FORMAT_BC3_UNORM_SRGB,
- [PIPE_FORMAT_RGTC1_UNORM] = GEN6_FORMAT_BC4_UNORM,
- [PIPE_FORMAT_RGTC1_SNORM] = GEN6_FORMAT_BC4_SNORM,
- [PIPE_FORMAT_RGTC2_UNORM] = GEN6_FORMAT_BC5_UNORM,
- [PIPE_FORMAT_RGTC2_SNORM] = GEN6_FORMAT_BC5_SNORM,
- [PIPE_FORMAT_R8G8_B8G8_UNORM] = 0,
- [PIPE_FORMAT_G8R8_G8B8_UNORM] = 0,
- [PIPE_FORMAT_R8SG8SB8UX8U_NORM] = 0,
- [PIPE_FORMAT_R5SG5SB6U_NORM] = 0,
- [PIPE_FORMAT_A8B8G8R8_UNORM] = 0,
- [PIPE_FORMAT_B5G5R5X1_UNORM] = GEN6_FORMAT_B5G5R5X1_UNORM,
- [PIPE_FORMAT_R10G10B10A2_USCALED] = GEN6_FORMAT_R10G10B10A2_USCALED,
- [PIPE_FORMAT_R11G11B10_FLOAT] = GEN6_FORMAT_R11G11B10_FLOAT,
- [PIPE_FORMAT_R9G9B9E5_FLOAT] = GEN6_FORMAT_R9G9B9E5_SHAREDEXP,
- [PIPE_FORMAT_Z32_FLOAT_S8X24_UINT] = 0,
- [PIPE_FORMAT_R1_UNORM] = GEN6_FORMAT_R1_UNORM,
- [PIPE_FORMAT_R10G10B10X2_USCALED] = GEN6_FORMAT_R10G10B10X2_USCALED,
- [PIPE_FORMAT_R10G10B10X2_SNORM] = 0,
- [PIPE_FORMAT_L4A4_UNORM] = 0,
- [PIPE_FORMAT_B10G10R10A2_UNORM] = GEN6_FORMAT_B10G10R10A2_UNORM,
- [PIPE_FORMAT_R10SG10SB10SA2U_NORM] = 0,
- [PIPE_FORMAT_R8G8Bx_SNORM] = 0,
- [PIPE_FORMAT_R8G8B8X8_UNORM] = GEN6_FORMAT_R8G8B8X8_UNORM,
- [PIPE_FORMAT_B4G4R4X4_UNORM] = 0,
- [PIPE_FORMAT_X24S8_UINT] = 0,
- [PIPE_FORMAT_S8X24_UINT] = 0,
- [PIPE_FORMAT_X32_S8X24_UINT] = 0,
- [PIPE_FORMAT_B2G3R3_UNORM] = 0,
- [PIPE_FORMAT_L16A16_UNORM] = GEN6_FORMAT_L16A16_UNORM,
- [PIPE_FORMAT_A16_UNORM] = GEN6_FORMAT_A16_UNORM,
- [PIPE_FORMAT_I16_UNORM] = GEN6_FORMAT_I16_UNORM,
- [PIPE_FORMAT_LATC1_UNORM] = 0,
- [PIPE_FORMAT_LATC1_SNORM] = 0,
- [PIPE_FORMAT_LATC2_UNORM] = 0,
- [PIPE_FORMAT_LATC2_SNORM] = 0,
- [PIPE_FORMAT_A8_SNORM] = 0,
- [PIPE_FORMAT_L8_SNORM] = 0,
- [PIPE_FORMAT_L8A8_SNORM] = 0,
- [PIPE_FORMAT_I8_SNORM] = 0,
- [PIPE_FORMAT_A16_SNORM] = 0,
- [PIPE_FORMAT_L16_SNORM] = 0,
- [PIPE_FORMAT_L16A16_SNORM] = 0,
- [PIPE_FORMAT_I16_SNORM] = 0,
- [PIPE_FORMAT_A16_FLOAT] = GEN6_FORMAT_A16_FLOAT,
- [PIPE_FORMAT_L16_FLOAT] = GEN6_FORMAT_L16_FLOAT,
- [PIPE_FORMAT_L16A16_FLOAT] = GEN6_FORMAT_L16A16_FLOAT,
- [PIPE_FORMAT_I16_FLOAT] = GEN6_FORMAT_I16_FLOAT,
- [PIPE_FORMAT_A32_FLOAT] = GEN6_FORMAT_A32_FLOAT,
- [PIPE_FORMAT_L32_FLOAT] = GEN6_FORMAT_L32_FLOAT,
- [PIPE_FORMAT_L32A32_FLOAT] = GEN6_FORMAT_L32A32_FLOAT,
- [PIPE_FORMAT_I32_FLOAT] = GEN6_FORMAT_I32_FLOAT,
- [PIPE_FORMAT_YV12] = 0,
- [PIPE_FORMAT_YV16] = 0,
- [PIPE_FORMAT_IYUV] = 0,
- [PIPE_FORMAT_NV12] = 0,
- [PIPE_FORMAT_NV21] = 0,
- [PIPE_FORMAT_A4R4_UNORM] = 0,
- [PIPE_FORMAT_R4A4_UNORM] = 0,
- [PIPE_FORMAT_R8A8_UNORM] = 0,
- [PIPE_FORMAT_A8R8_UNORM] = 0,
- [PIPE_FORMAT_R10G10B10A2_SSCALED] = GEN6_FORMAT_R10G10B10A2_SSCALED,
- [PIPE_FORMAT_R10G10B10A2_SNORM] = GEN6_FORMAT_R10G10B10A2_SNORM,
- [PIPE_FORMAT_B10G10R10A2_USCALED] = GEN6_FORMAT_B10G10R10A2_USCALED,
- [PIPE_FORMAT_B10G10R10A2_SSCALED] = GEN6_FORMAT_B10G10R10A2_SSCALED,
- [PIPE_FORMAT_B10G10R10A2_SNORM] = GEN6_FORMAT_B10G10R10A2_SNORM,
- [PIPE_FORMAT_R8_UINT] = GEN6_FORMAT_R8_UINT,
- [PIPE_FORMAT_R8G8_UINT] = GEN6_FORMAT_R8G8_UINT,
- [PIPE_FORMAT_R8G8B8_UINT] = GEN6_FORMAT_R8G8B8_UINT,
- [PIPE_FORMAT_R8G8B8A8_UINT] = GEN6_FORMAT_R8G8B8A8_UINT,
- [PIPE_FORMAT_R8_SINT] = GEN6_FORMAT_R8_SINT,
- [PIPE_FORMAT_R8G8_SINT] = GEN6_FORMAT_R8G8_SINT,
- [PIPE_FORMAT_R8G8B8_SINT] = GEN6_FORMAT_R8G8B8_SINT,
- [PIPE_FORMAT_R8G8B8A8_SINT] = GEN6_FORMAT_R8G8B8A8_SINT,
- [PIPE_FORMAT_R16_UINT] = GEN6_FORMAT_R16_UINT,
- [PIPE_FORMAT_R16G16_UINT] = GEN6_FORMAT_R16G16_UINT,
- [PIPE_FORMAT_R16G16B16_UINT] = GEN6_FORMAT_R16G16B16_UINT,
- [PIPE_FORMAT_R16G16B16A16_UINT] = GEN6_FORMAT_R16G16B16A16_UINT,
- [PIPE_FORMAT_R16_SINT] = GEN6_FORMAT_R16_SINT,
- [PIPE_FORMAT_R16G16_SINT] = GEN6_FORMAT_R16G16_SINT,
- [PIPE_FORMAT_R16G16B16_SINT] = GEN6_FORMAT_R16G16B16_SINT,
- [PIPE_FORMAT_R16G16B16A16_SINT] = GEN6_FORMAT_R16G16B16A16_SINT,
- [PIPE_FORMAT_R32_UINT] = GEN6_FORMAT_R32_UINT,
- [PIPE_FORMAT_R32G32_UINT] = GEN6_FORMAT_R32G32_UINT,
- [PIPE_FORMAT_R32G32B32_UINT] = GEN6_FORMAT_R32G32B32_UINT,
- [PIPE_FORMAT_R32G32B32A32_UINT] = GEN6_FORMAT_R32G32B32A32_UINT,
- [PIPE_FORMAT_R32_SINT] = GEN6_FORMAT_R32_SINT,
- [PIPE_FORMAT_R32G32_SINT] = GEN6_FORMAT_R32G32_SINT,
- [PIPE_FORMAT_R32G32B32_SINT] = GEN6_FORMAT_R32G32B32_SINT,
- [PIPE_FORMAT_R32G32B32A32_SINT] = GEN6_FORMAT_R32G32B32A32_SINT,
- [PIPE_FORMAT_A8_UINT] = 0,
- [PIPE_FORMAT_I8_UINT] = GEN6_FORMAT_I8_UINT,
- [PIPE_FORMAT_L8_UINT] = GEN6_FORMAT_L8_UINT,
- [PIPE_FORMAT_L8A8_UINT] = GEN6_FORMAT_L8A8_UINT,
- [PIPE_FORMAT_A8_SINT] = 0,
- [PIPE_FORMAT_I8_SINT] = GEN6_FORMAT_I8_SINT,
- [PIPE_FORMAT_L8_SINT] = GEN6_FORMAT_L8_SINT,
- [PIPE_FORMAT_L8A8_SINT] = GEN6_FORMAT_L8A8_SINT,
- [PIPE_FORMAT_A16_UINT] = 0,
- [PIPE_FORMAT_I16_UINT] = 0,
- [PIPE_FORMAT_L16_UINT] = 0,
- [PIPE_FORMAT_L16A16_UINT] = 0,
- [PIPE_FORMAT_A16_SINT] = 0,
- [PIPE_FORMAT_I16_SINT] = 0,
- [PIPE_FORMAT_L16_SINT] = 0,
- [PIPE_FORMAT_L16A16_SINT] = 0,
- [PIPE_FORMAT_A32_UINT] = 0,
- [PIPE_FORMAT_I32_UINT] = 0,
- [PIPE_FORMAT_L32_UINT] = 0,
- [PIPE_FORMAT_L32A32_UINT] = 0,
- [PIPE_FORMAT_A32_SINT] = 0,
- [PIPE_FORMAT_I32_SINT] = 0,
- [PIPE_FORMAT_L32_SINT] = 0,
- [PIPE_FORMAT_L32A32_SINT] = 0,
- [PIPE_FORMAT_B10G10R10A2_UINT] = GEN6_FORMAT_B10G10R10A2_UINT,
- [PIPE_FORMAT_ETC1_RGB8] = GEN6_FORMAT_ETC1_RGB8,
- [PIPE_FORMAT_R8G8_R8B8_UNORM] = 0,
- [PIPE_FORMAT_G8R8_B8R8_UNORM] = 0,
- [PIPE_FORMAT_R8G8B8X8_SNORM] = 0,
- [PIPE_FORMAT_R8G8B8X8_SRGB] = 0,
- [PIPE_FORMAT_R8G8B8X8_UINT] = 0,
- [PIPE_FORMAT_R8G8B8X8_SINT] = 0,
- [PIPE_FORMAT_B10G10R10X2_UNORM] = GEN6_FORMAT_B10G10R10X2_UNORM,
- [PIPE_FORMAT_R16G16B16X16_UNORM] = GEN6_FORMAT_R16G16B16X16_UNORM,
- [PIPE_FORMAT_R16G16B16X16_SNORM] = 0,
- [PIPE_FORMAT_R16G16B16X16_FLOAT] = GEN6_FORMAT_R16G16B16X16_FLOAT,
- [PIPE_FORMAT_R16G16B16X16_UINT] = 0,
- [PIPE_FORMAT_R16G16B16X16_SINT] = 0,
- [PIPE_FORMAT_R32G32B32X32_FLOAT] = GEN6_FORMAT_R32G32B32X32_FLOAT,
- [PIPE_FORMAT_R32G32B32X32_UINT] = 0,
- [PIPE_FORMAT_R32G32B32X32_SINT] = 0,
- [PIPE_FORMAT_R8A8_SNORM] = 0,
- [PIPE_FORMAT_R16A16_UNORM] = 0,
- [PIPE_FORMAT_R16A16_SNORM] = 0,
- [PIPE_FORMAT_R16A16_FLOAT] = 0,
- [PIPE_FORMAT_R32A32_FLOAT] = 0,
- [PIPE_FORMAT_R8A8_UINT] = 0,
- [PIPE_FORMAT_R8A8_SINT] = 0,
- [PIPE_FORMAT_R16A16_UINT] = 0,
- [PIPE_FORMAT_R16A16_SINT] = 0,
- [PIPE_FORMAT_R32A32_UINT] = 0,
- [PIPE_FORMAT_R32A32_SINT] = 0,
- [PIPE_FORMAT_R10G10B10A2_UINT] = GEN6_FORMAT_R10G10B10A2_UINT,
- [PIPE_FORMAT_B5G6R5_SRGB] = GEN6_FORMAT_B5G6R5_UNORM_SRGB,
- };
- int sfmt = format_mapping[format];
-
- /* GEN6_FORMAT_R32G32B32A32_FLOAT happens to be 0 */
- if (!sfmt && format != PIPE_FORMAT_R32G32B32A32_FLOAT)
- sfmt = -1;
-
- return sfmt;
-}
diff --git a/src/gallium/drivers/ilo/core/ilo_image.c b/src/gallium/drivers/ilo/core/ilo_image.c
index 22c8ef2620a..0d837d8a9d5 100644
--- a/src/gallium/drivers/ilo/core/ilo_image.c
+++ b/src/gallium/drivers/ilo/core/ilo_image.c
@@ -675,9 +675,12 @@ img_init_size_and_format(struct ilo_image *img,
enum pipe_format format = templ->format;
bool require_separate_stencil = false;
+ img->target = templ->target;
img->width0 = templ->width0;
img->height0 = templ->height0;
img->depth0 = templ->depth0;
+ img->array_size = templ->array_size;
+ img->level_count = templ->last_level + 1;
img->sample_count = (templ->nr_samples) ? templ->nr_samples : 1;
/*
@@ -794,6 +797,10 @@ img_want_hiz(const struct ilo_image *img,
if (ilo_debug & ILO_DEBUG_NOHIZ)
return false;
+ /* we want 8x4 aligned levels */
+ if (templ->target == PIPE_TEXTURE_1D)
+ return false;
+
if (!(templ->bind & PIPE_BIND_DEPTH_STENCIL))
return false;
@@ -1343,9 +1350,12 @@ img_init_for_transfer(struct ilo_image *img,
img->aux.type = ILO_IMAGE_AUX_NONE;
+ img->target = templ->target;
img->width0 = templ->width0;
img->height0 = templ->height0;
img->depth0 = templ->depth0;
+ img->array_size = templ->array_size;
+ img->level_count = 1;
img->sample_count = 1;
img->format = templ->format;
@@ -1386,6 +1396,8 @@ void ilo_image_init(struct ilo_image *img,
struct ilo_image_params params;
bool transfer_only;
+ assert(ilo_is_zeroed(img, sizeof(*img)));
+
/* use transfer layout when the texture is never bound to GPU */
transfer_only = !(templ->bind & ~(PIPE_BIND_TRANSFER_WRITE |
PIPE_BIND_TRANSFER_READ));
@@ -1411,6 +1423,8 @@ ilo_image_init_for_imported(struct ilo_image *img,
{
struct ilo_image_params params;
+ assert(ilo_is_zeroed(img, sizeof(*img)));
+
if ((tiling == GEN6_TILING_X && bo_stride % 512) ||
(tiling == GEN6_TILING_Y && bo_stride % 128) ||
(tiling == GEN8_TILING_W && bo_stride % 64))
@@ -1435,3 +1449,22 @@ ilo_image_init_for_imported(struct ilo_image *img,
return true;
}
+
+bool
+ilo_image_disable_aux(struct ilo_image *img, const struct ilo_dev *dev)
+{
+ /* HiZ is required for separate stencil on Gen6 */
+ if (ilo_dev_gen(dev) == ILO_GEN(6) &&
+ img->aux.type == ILO_IMAGE_AUX_HIZ &&
+ img->separate_stencil)
+ return false;
+
+ /* MCS is required for multisample images */
+ if (img->aux.type == ILO_IMAGE_AUX_MCS &&
+ img->sample_count > 1)
+ return false;
+
+ img->aux.enables = 0x0;
+
+ return true;
+}
diff --git a/src/gallium/drivers/ilo/core/ilo_image.h b/src/gallium/drivers/ilo/core/ilo_image.h
index 4956bdae2ee..af15e856028 100644
--- a/src/gallium/drivers/ilo/core/ilo_image.h
+++ b/src/gallium/drivers/ilo/core/ilo_image.h
@@ -88,10 +88,14 @@ struct ilo_image_lod {
* Texture layout.
*/
struct ilo_image {
+ enum pipe_texture_target target;
+
/* size, format, etc for programming hardware states */
unsigned width0;
unsigned height0;
unsigned depth0;
+ unsigned array_size;
+ unsigned level_count;
unsigned sample_count;
enum pipe_format format;
bool separate_stencil;
@@ -125,8 +129,6 @@ struct ilo_image {
bool scanout;
- struct intel_bo *bo;
-
struct {
enum ilo_image_aux_type type;
@@ -140,8 +142,12 @@ struct ilo_image {
unsigned bo_stride;
unsigned bo_height;
+ /* managed by users */
struct intel_bo *bo;
} aux;
+
+ /* managed by users */
+ struct intel_bo *bo;
};
struct pipe_resource;
@@ -158,31 +164,13 @@ ilo_image_init_for_imported(struct ilo_image *img,
enum gen_surface_tiling tiling,
unsigned bo_stride);
-static inline void
-ilo_image_cleanup(struct ilo_image *img)
-{
- intel_bo_unref(img->bo);
- intel_bo_unref(img->aux.bo);
-}
-
-static inline void
-ilo_image_set_bo(struct ilo_image *img, struct intel_bo *bo)
-{
- intel_bo_unref(img->bo);
- img->bo = intel_bo_ref(bo);
-}
-
-static inline void
-ilo_image_set_aux_bo(struct ilo_image *img, struct intel_bo *bo)
-{
- intel_bo_unref(img->aux.bo);
- img->aux.bo = intel_bo_ref(bo);
-}
+bool
+ilo_image_disable_aux(struct ilo_image *img, const struct ilo_dev *dev);
static inline bool
ilo_image_can_enable_aux(const struct ilo_image *img, unsigned level)
{
- return (img->aux.bo && (img->aux.enables & (1 << level)));
+ return (img->aux.enables & (1 << level));
}
/**
diff --git a/src/gallium/drivers/ilo/core/ilo_state_3d.h b/src/gallium/drivers/ilo/core/ilo_state_3d.h
deleted file mode 100644
index fdce445f733..00000000000
--- a/src/gallium/drivers/ilo/core/ilo_state_3d.h
+++ /dev/null
@@ -1,427 +0,0 @@
-/*
- * Mesa 3-D graphics library
- *
- * Copyright (C) 2012-2014 LunarG, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- * Chia-I Wu <[email protected]>
- */
-
-#ifndef ILO_STATE_3D_H
-#define ILO_STATE_3D_H
-
-#include "genhw/genhw.h"
-#include "pipe/p_state.h"
-
-#include "ilo_core.h"
-#include "ilo_dev.h"
-
-/**
- * \see brw_context.h
- */
-#define ILO_MAX_DRAW_BUFFERS 8
-#define ILO_MAX_CONST_BUFFERS (1 + 12)
-#define ILO_MAX_SAMPLER_VIEWS 16
-#define ILO_MAX_SAMPLERS 16
-#define ILO_MAX_SO_BINDINGS 64
-#define ILO_MAX_SO_BUFFERS 4
-#define ILO_MAX_VIEWPORTS 1
-
-#define ILO_MAX_SURFACES 256
-
-struct intel_bo;
-struct ilo_buffer;
-struct ilo_image;
-struct ilo_shader_state;
-
-struct ilo_vb_state {
- struct pipe_vertex_buffer states[PIPE_MAX_ATTRIBS];
- uint32_t enabled_mask;
-};
-
-struct ilo_ib_state {
- struct pipe_resource *buffer;
- const void *user_buffer;
- unsigned offset;
- unsigned index_size;
-
- /* these are not valid until the state is finalized */
- struct pipe_resource *hw_resource;
- unsigned hw_index_size;
- /* an offset to be added to pipe_draw_info::start */
- int64_t draw_start_offset;
-};
-
-struct ilo_ve_cso {
- /* VERTEX_ELEMENT_STATE */
- uint32_t payload[2];
-};
-
-struct ilo_ve_state {
- struct ilo_ve_cso cso[PIPE_MAX_ATTRIBS];
- unsigned count;
-
- unsigned instance_divisors[PIPE_MAX_ATTRIBS];
- unsigned vb_mapping[PIPE_MAX_ATTRIBS];
- unsigned vb_count;
-
- /* these are not valid until the state is finalized */
- struct ilo_ve_cso edgeflag_cso;
- bool last_cso_edgeflag;
-
- struct ilo_ve_cso nosrc_cso;
- bool prepend_nosrc_cso;
-};
-
-struct ilo_so_state {
- struct pipe_stream_output_target *states[ILO_MAX_SO_BUFFERS];
- unsigned count;
- unsigned append_bitmask;
-
- bool enabled;
-};
-
-struct ilo_viewport_cso {
- /* matrix form */
- float m00, m11, m22, m30, m31, m32;
-
- /* guardband in NDC space */
- float min_gbx, min_gby, max_gbx, max_gby;
-
- /* viewport in screen space */
- float min_x, min_y, min_z;
- float max_x, max_y, max_z;
-};
-
-struct ilo_viewport_state {
- struct ilo_viewport_cso cso[ILO_MAX_VIEWPORTS];
- unsigned count;
-
- struct pipe_viewport_state viewport0;
-};
-
-struct ilo_scissor_state {
- /* SCISSOR_RECT */
- uint32_t payload[ILO_MAX_VIEWPORTS * 2];
-
- struct pipe_scissor_state scissor0;
-};
-
-struct ilo_rasterizer_clip {
- /* 3DSTATE_CLIP */
- uint32_t payload[3];
-
- uint32_t can_enable_guardband;
-};
-
-struct ilo_rasterizer_sf {
- /* 3DSTATE_SF */
- uint32_t payload[3];
- uint32_t dw_msaa;
-
- /* Global Depth Offset Constant/Scale/Clamp */
- uint32_t dw_depth_offset_const;
- uint32_t dw_depth_offset_scale;
- uint32_t dw_depth_offset_clamp;
-
- /* Gen8+ 3DSTATE_RASTER */
- uint32_t dw_raster;
-};
-
-struct ilo_rasterizer_wm {
- /* 3DSTATE_WM */
- uint32_t payload[2];
- uint32_t dw_msaa_rast;
- uint32_t dw_msaa_disp;
-};
-
-struct ilo_rasterizer_state {
- struct pipe_rasterizer_state state;
-
- struct ilo_rasterizer_clip clip;
- struct ilo_rasterizer_sf sf;
- struct ilo_rasterizer_wm wm;
-};
-
-struct ilo_dsa_state {
- /* DEPTH_STENCIL_STATE or Gen8+ 3DSTATE_WM_DEPTH_STENCIL */
- uint32_t payload[3];
-
- uint32_t dw_blend_alpha;
- uint32_t dw_ps_blend_alpha;
- ubyte alpha_ref;
-};
-
-struct ilo_blend_cso {
- /* BLEND_STATE */
- uint32_t payload[2];
-
- uint32_t dw_blend;
- uint32_t dw_blend_dst_alpha_forced_one;
-};
-
-struct ilo_blend_state {
- struct ilo_blend_cso cso[ILO_MAX_DRAW_BUFFERS];
-
- bool dual_blend;
- bool alpha_to_coverage;
-
- uint32_t dw_shared;
- uint32_t dw_alpha_mod;
- uint32_t dw_logicop;
-
- /* a part of 3DSTATE_PS_BLEND */
- uint32_t dw_ps_blend;
- uint32_t dw_ps_blend_dst_alpha_forced_one;
-};
-
-struct ilo_sampler_cso {
- /* SAMPLER_STATE and SAMPLER_BORDER_COLOR_STATE */
- uint32_t payload[15];
-
- uint32_t dw_filter;
- uint32_t dw_filter_aniso;
- uint32_t dw_wrap;
- uint32_t dw_wrap_1d;
- uint32_t dw_wrap_cube;
-
- bool anisotropic;
- bool saturate_r;
- bool saturate_s;
- bool saturate_t;
-};
-
-struct ilo_sampler_state {
- const struct ilo_sampler_cso *cso[ILO_MAX_SAMPLERS];
-};
-
-struct ilo_view_surface {
- /* SURFACE_STATE */
- uint32_t payload[13];
- struct intel_bo *bo;
-
- uint32_t scanout;
-};
-
-struct ilo_view_cso {
- struct pipe_sampler_view base;
-
- struct ilo_view_surface surface;
-};
-
-struct ilo_view_state {
- struct pipe_sampler_view *states[ILO_MAX_SAMPLER_VIEWS];
- unsigned count;
-};
-
-struct ilo_cbuf_cso {
- struct pipe_resource *resource;
- struct ilo_view_surface surface;
-
- /*
- * this CSO is not so constant because user buffer needs to be uploaded in
- * finalize_constant_buffers()
- */
- const void *user_buffer;
- unsigned user_buffer_size;
-};
-
-struct ilo_cbuf_state {
- struct ilo_cbuf_cso cso[ILO_MAX_CONST_BUFFERS];
- uint32_t enabled_mask;
-};
-
-struct ilo_resource_state {
- struct pipe_surface *states[PIPE_MAX_SHADER_RESOURCES];
- unsigned count;
-};
-
-struct ilo_surface_cso {
- struct pipe_surface base;
-
- bool is_rt;
- union {
- struct ilo_view_surface rt;
- struct ilo_zs_surface {
- uint32_t payload[12];
- uint32_t dw_aligned_8x4;
-
- struct intel_bo *bo;
- struct intel_bo *hiz_bo;
- struct intel_bo *separate_s8_bo;
- } zs;
- } u;
-};
-
-struct ilo_fb_state {
- struct pipe_framebuffer_state state;
-
- struct ilo_view_surface null_rt;
- struct ilo_zs_surface null_zs;
-
- struct ilo_fb_blend_caps {
- bool can_logicop;
- bool can_blend;
- bool can_alpha_test;
- bool dst_alpha_forced_one;
- } blend_caps[PIPE_MAX_COLOR_BUFS];
-
- unsigned num_samples;
-};
-
-struct ilo_shader_cso {
- uint32_t payload[5];
-};
-
-/**
- * Translate a pipe texture target to the matching hardware surface type.
- */
-static inline int
-ilo_gpe_gen6_translate_texture(enum pipe_texture_target target)
-{
- switch (target) {
- case PIPE_BUFFER:
- return GEN6_SURFTYPE_BUFFER;
- case PIPE_TEXTURE_1D:
- case PIPE_TEXTURE_1D_ARRAY:
- return GEN6_SURFTYPE_1D;
- case PIPE_TEXTURE_2D:
- case PIPE_TEXTURE_RECT:
- case PIPE_TEXTURE_2D_ARRAY:
- return GEN6_SURFTYPE_2D;
- case PIPE_TEXTURE_3D:
- return GEN6_SURFTYPE_3D;
- case PIPE_TEXTURE_CUBE:
- case PIPE_TEXTURE_CUBE_ARRAY:
- return GEN6_SURFTYPE_CUBE;
- default:
- assert(!"unknown texture target");
- return GEN6_SURFTYPE_BUFFER;
- }
-}
-
-void
-ilo_gpe_init_ve(const struct ilo_dev *dev,
- unsigned num_states,
- const struct pipe_vertex_element *states,
- struct ilo_ve_state *ve);
-
-void
-ilo_gpe_set_ve_edgeflag(const struct ilo_dev *dev,
- struct ilo_ve_cso *cso);
-
-void
-ilo_gpe_init_ve_nosrc(const struct ilo_dev *dev,
- int comp0, int comp1, int comp2, int comp3,
- struct ilo_ve_cso *cso);
-
-void
-ilo_gpe_set_viewport_cso(const struct ilo_dev *dev,
- const struct pipe_viewport_state *state,
- struct ilo_viewport_cso *vp);
-
-void
-ilo_gpe_set_scissor(const struct ilo_dev *dev,
- unsigned start_slot,
- unsigned num_states,
- const struct pipe_scissor_state *states,
- struct ilo_scissor_state *scissor);
-
-void
-ilo_gpe_set_scissor_null(const struct ilo_dev *dev,
- struct ilo_scissor_state *scissor);
-
-void
-ilo_gpe_init_rasterizer(const struct ilo_dev *dev,
- const struct pipe_rasterizer_state *state,
- struct ilo_rasterizer_state *rasterizer);
-void
-ilo_gpe_init_dsa(const struct ilo_dev *dev,
- const struct pipe_depth_stencil_alpha_state *state,
- struct ilo_dsa_state *dsa);
-
-void
-ilo_gpe_init_blend(const struct ilo_dev *dev,
- const struct pipe_blend_state *state,
- struct ilo_blend_state *blend);
-
-void
-ilo_gpe_init_sampler_cso(const struct ilo_dev *dev,
- const struct pipe_sampler_state *state,
- struct ilo_sampler_cso *sampler);
-
-void
-ilo_gpe_init_view_surface_null(const struct ilo_dev *dev,
- unsigned width, unsigned height,
- unsigned depth, unsigned level,
- struct ilo_view_surface *surf);
-
-void
-ilo_gpe_init_view_surface_for_buffer(const struct ilo_dev *dev,
- const struct ilo_buffer *buf,
- unsigned offset, unsigned size,
- unsigned struct_size,
- enum pipe_format elem_format,
- bool is_rt, bool render_cache_rw,
- struct ilo_view_surface *surf);
-
-void
-ilo_gpe_init_view_surface_for_image(const struct ilo_dev *dev,
- const struct ilo_image *img,
- enum pipe_texture_target target,
- enum pipe_format format,
- unsigned first_level,
- unsigned num_levels,
- unsigned first_layer,
- unsigned num_layers,
- bool is_rt,
- struct ilo_view_surface *surf);
-
-void
-ilo_gpe_init_zs_surface(const struct ilo_dev *dev,
- const struct ilo_image *img,
- const struct ilo_image *s8_img,
- enum pipe_texture_target target,
- enum pipe_format format, unsigned level,
- unsigned first_layer, unsigned num_layers,
- struct ilo_zs_surface *zs);
-
-void
-ilo_gpe_init_vs_cso(const struct ilo_dev *dev,
- const struct ilo_shader_state *vs,
- struct ilo_shader_cso *cso);
-
-void
-ilo_gpe_init_gs_cso(const struct ilo_dev *dev,
- const struct ilo_shader_state *gs,
- struct ilo_shader_cso *cso);
-
-void
-ilo_gpe_init_fs_cso(const struct ilo_dev *dev,
- const struct ilo_shader_state *fs,
- struct ilo_shader_cso *cso);
-
-void
-ilo_gpe_set_fb(const struct ilo_dev *dev,
- const struct pipe_framebuffer_state *state,
- struct ilo_fb_state *fb);
-
-#endif /* ILO_STATE_3D_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_state_3d_bottom.c b/src/gallium/drivers/ilo/core/ilo_state_3d_bottom.c
deleted file mode 100644
index 5a4c5dde7e7..00000000000
--- a/src/gallium/drivers/ilo/core/ilo_state_3d_bottom.c
+++ /dev/null
@@ -1,2222 +0,0 @@
-/*
- * Mesa 3-D graphics library
- *
- * Copyright (C) 2012-2014 LunarG, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- * Chia-I Wu <[email protected]>
- */
-
-#include "genhw/genhw.h"
-#include "util/u_dual_blend.h"
-#include "util/u_framebuffer.h"
-#include "util/u_half.h"
-
-#include "ilo_format.h"
-#include "ilo_image.h"
-#include "ilo_state_3d.h"
-#include "../ilo_shader.h"
-
-static void
-rasterizer_init_clip(const struct ilo_dev *dev,
- const struct pipe_rasterizer_state *state,
- struct ilo_rasterizer_clip *clip)
-{
- uint32_t dw1, dw2, dw3;
-
- ILO_DEV_ASSERT(dev, 6, 8);
-
- dw1 = GEN6_CLIP_DW1_STATISTICS;
-
- if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
- /*
- * From the Ivy Bridge PRM, volume 2 part 1, page 219:
- *
- * "Workaround : Due to Hardware issue "EarlyCull" needs to be
- * enabled only for the cases where the incoming primitive topology
- * into the clipper guaranteed to be Trilist."
- *
- * What does this mean?
- */
- dw1 |= 0 << 19 |
- GEN7_CLIP_DW1_EARLY_CULL_ENABLE;
-
- if (ilo_dev_gen(dev) < ILO_GEN(8)) {
- if (state->front_ccw)
- dw1 |= GEN7_CLIP_DW1_FRONTWINDING_CCW;
-
- switch (state->cull_face) {
- case PIPE_FACE_NONE:
- dw1 |= GEN7_CLIP_DW1_CULLMODE_NONE;
- break;
- case PIPE_FACE_FRONT:
- dw1 |= GEN7_CLIP_DW1_CULLMODE_FRONT;
- break;
- case PIPE_FACE_BACK:
- dw1 |= GEN7_CLIP_DW1_CULLMODE_BACK;
- break;
- case PIPE_FACE_FRONT_AND_BACK:
- dw1 |= GEN7_CLIP_DW1_CULLMODE_BOTH;
- break;
- }
- }
- }
-
- dw2 = GEN6_CLIP_DW2_CLIP_ENABLE |
- GEN6_CLIP_DW2_XY_TEST_ENABLE |
- state->clip_plane_enable << GEN6_CLIP_DW2_UCP_CLIP_ENABLES__SHIFT |
- GEN6_CLIP_DW2_CLIPMODE_NORMAL;
-
- if (state->clip_halfz)
- dw2 |= GEN6_CLIP_DW2_APIMODE_D3D;
- else
- dw2 |= GEN6_CLIP_DW2_APIMODE_OGL;
-
- if (ilo_dev_gen(dev) < ILO_GEN(8) && state->depth_clip)
- dw2 |= GEN6_CLIP_DW2_Z_TEST_ENABLE;
-
- if (state->flatshade_first) {
- dw2 |= 0 << GEN6_CLIP_DW2_TRI_PROVOKE__SHIFT |
- 0 << GEN6_CLIP_DW2_LINE_PROVOKE__SHIFT |
- 1 << GEN6_CLIP_DW2_TRIFAN_PROVOKE__SHIFT;
- }
- else {
- dw2 |= 2 << GEN6_CLIP_DW2_TRI_PROVOKE__SHIFT |
- 1 << GEN6_CLIP_DW2_LINE_PROVOKE__SHIFT |
- 2 << GEN6_CLIP_DW2_TRIFAN_PROVOKE__SHIFT;
- }
-
- dw3 = 0x1 << GEN6_CLIP_DW3_MIN_POINT_WIDTH__SHIFT |
- 0x7ff << GEN6_CLIP_DW3_MAX_POINT_WIDTH__SHIFT;
-
- clip->payload[0] = dw1;
- clip->payload[1] = dw2;
- clip->payload[2] = dw3;
-
- clip->can_enable_guardband = true;
-
- /*
- * There are several reasons that guard band test should be disabled
- *
- * - GL wide points (to avoid partially visibie object)
- * - GL wide or AA lines (to avoid partially visibie object)
- */
- if (state->point_size_per_vertex || state->point_size > 1.0f)
- clip->can_enable_guardband = false;
- if (state->line_smooth || state->line_width > 1.0f)
- clip->can_enable_guardband = false;
-}
-
-static void
-rasterizer_init_sf_depth_offset_gen6(const struct ilo_dev *dev,
- const struct pipe_rasterizer_state *state,
- struct ilo_rasterizer_sf *sf)
-{
- ILO_DEV_ASSERT(dev, 6, 8);
-
- /*
- * Scale the constant term. The minimum representable value used by the HW
- * is not large enouch to be the minimum resolvable difference.
- */
- sf->dw_depth_offset_const = fui(state->offset_units * 2.0f);
- sf->dw_depth_offset_scale = fui(state->offset_scale);
- sf->dw_depth_offset_clamp = fui(state->offset_clamp);
-}
-
-static void
-rasterizer_init_sf_gen6(const struct ilo_dev *dev,
- const struct pipe_rasterizer_state *state,
- struct ilo_rasterizer_sf *sf)
-{
- int line_width, point_width;
- uint32_t dw1, dw2, dw3;
-
- ILO_DEV_ASSERT(dev, 6, 7.5);
-
- /*
- * From the Sandy Bridge PRM, volume 2 part 1, page 248:
- *
- * "This bit (Statistics Enable) should be set whenever clipping is
- * enabled and the Statistics Enable bit is set in CLIP_STATE. It
- * should be cleared if clipping is disabled or Statistics Enable in
- * CLIP_STATE is clear."
- */
- dw1 = GEN7_SF_DW1_STATISTICS |
- GEN7_SF_DW1_VIEWPORT_ENABLE;
-
- /* XXX GEN6 path seems to work fine for GEN7 */
- if (false && ilo_dev_gen(dev) >= ILO_GEN(7)) {
- /*
- * From the Ivy Bridge PRM, volume 2 part 1, page 258:
- *
- * "This bit (Legacy Global Depth Bias Enable, Global Depth Offset
- * Enable Solid , Global Depth Offset Enable Wireframe, and Global
- * Depth Offset Enable Point) should be set whenever non zero depth
- * bias (Slope, Bias) values are used. Setting this bit may have
- * some degradation of performance for some workloads."
- */
- if (state->offset_tri || state->offset_line || state->offset_point) {
- /* XXX need to scale offset_const according to the depth format */
- dw1 |= GEN7_SF_DW1_LEGACY_DEPTH_OFFSET;
-
- dw1 |= GEN7_SF_DW1_DEPTH_OFFSET_SOLID |
- GEN7_SF_DW1_DEPTH_OFFSET_WIREFRAME |
- GEN7_SF_DW1_DEPTH_OFFSET_POINT;
- }
- } else {
- if (state->offset_tri)
- dw1 |= GEN7_SF_DW1_DEPTH_OFFSET_SOLID;
- if (state->offset_line)
- dw1 |= GEN7_SF_DW1_DEPTH_OFFSET_WIREFRAME;
- if (state->offset_point)
- dw1 |= GEN7_SF_DW1_DEPTH_OFFSET_POINT;
- }
-
- switch (state->fill_front) {
- case PIPE_POLYGON_MODE_FILL:
- dw1 |= GEN7_SF_DW1_FRONTFACE_SOLID;
- break;
- case PIPE_POLYGON_MODE_LINE:
- dw1 |= GEN7_SF_DW1_FRONTFACE_WIREFRAME;
- break;
- case PIPE_POLYGON_MODE_POINT:
- dw1 |= GEN7_SF_DW1_FRONTFACE_POINT;
- break;
- }
-
- switch (state->fill_back) {
- case PIPE_POLYGON_MODE_FILL:
- dw1 |= GEN7_SF_DW1_BACKFACE_SOLID;
- break;
- case PIPE_POLYGON_MODE_LINE:
- dw1 |= GEN7_SF_DW1_BACKFACE_WIREFRAME;
- break;
- case PIPE_POLYGON_MODE_POINT:
- dw1 |= GEN7_SF_DW1_BACKFACE_POINT;
- break;
- }
-
- if (state->front_ccw)
- dw1 |= GEN7_SF_DW1_FRONTWINDING_CCW;
-
- dw2 = 0;
-
- if (state->line_smooth) {
- /*
- * From the Sandy Bridge PRM, volume 2 part 1, page 251:
- *
- * "This field (Anti-aliasing Enable) must be disabled if any of the
- * render targets have integer (UINT or SINT) surface format."
- *
- * From the Sandy Bridge PRM, volume 2 part 1, page 317:
- *
- * "This field (Hierarchical Depth Buffer Enable) must be disabled
- * if Anti-aliasing Enable in 3DSTATE_SF is enabled.
- *
- * TODO We do not check those yet.
- */
- dw2 |= GEN7_SF_DW2_AA_LINE_ENABLE |
- GEN7_SF_DW2_AA_LINE_CAP_1_0;
- }
-
- switch (state->cull_face) {
- case PIPE_FACE_NONE:
- dw2 |= GEN7_SF_DW2_CULLMODE_NONE;
- break;
- case PIPE_FACE_FRONT:
- dw2 |= GEN7_SF_DW2_CULLMODE_FRONT;
- break;
- case PIPE_FACE_BACK:
- dw2 |= GEN7_SF_DW2_CULLMODE_BACK;
- break;
- case PIPE_FACE_FRONT_AND_BACK:
- dw2 |= GEN7_SF_DW2_CULLMODE_BOTH;
- break;
- }
-
- /*
- * Smooth lines should intersect ceil(line_width) or (ceil(line_width) + 1)
- * pixels in the minor direction. We have to make the lines slightly
- * thicker, 0.5 pixel on both sides, so that they intersect that many
- * pixels are considered into the lines.
- *
- * Line width is in U3.7.
- */
- line_width = (int)
- ((state->line_width + (float) state->line_smooth) * 128.0f + 0.5f);
- line_width = CLAMP(line_width, 0, 1023);
-
- /* use GIQ rules */
- if (line_width == 128 && !state->line_smooth)
- line_width = 0;
-
- dw2 |= line_width << GEN7_SF_DW2_LINE_WIDTH__SHIFT;
-
- if (ilo_dev_gen(dev) == ILO_GEN(7.5) && state->line_stipple_enable)
- dw2 |= GEN75_SF_DW2_LINE_STIPPLE_ENABLE;
-
- if (state->scissor)
- dw2 |= GEN7_SF_DW2_SCISSOR_ENABLE;
-
- dw3 = GEN7_SF_DW3_TRUE_AA_LINE_DISTANCE |
- GEN7_SF_DW3_SUBPIXEL_8BITS;
-
- if (state->line_last_pixel)
- dw3 |= GEN7_SF_DW3_LINE_LAST_PIXEL_ENABLE;
-
- if (state->flatshade_first) {
- dw3 |= 0 << GEN7_SF_DW3_TRI_PROVOKE__SHIFT |
- 0 << GEN7_SF_DW3_LINE_PROVOKE__SHIFT |
- 1 << GEN7_SF_DW3_TRIFAN_PROVOKE__SHIFT;
- } else {
- dw3 |= 2 << GEN7_SF_DW3_TRI_PROVOKE__SHIFT |
- 1 << GEN7_SF_DW3_LINE_PROVOKE__SHIFT |
- 2 << GEN7_SF_DW3_TRIFAN_PROVOKE__SHIFT;
- }
-
- if (!state->point_size_per_vertex)
- dw3 |= GEN7_SF_DW3_USE_POINT_WIDTH;
-
- /* in U8.3 */
- point_width = (int) (state->point_size * 8.0f + 0.5f);
- point_width = CLAMP(point_width, 1, 2047);
-
- dw3 |= point_width;
-
- STATIC_ASSERT(Elements(sf->payload) >= 3);
- sf->payload[0] = dw1;
- sf->payload[1] = dw2;
- sf->payload[2] = dw3;
-
- if (state->multisample) {
- sf->dw_msaa = GEN7_SF_DW2_MSRASTMODE_ON_PATTERN;
-
- /*
- * From the Sandy Bridge PRM, volume 2 part 1, page 251:
- *
- * "Software must not program a value of 0.0 when running in
- * MSRASTMODE_ON_xxx modes - zero-width lines are not available
- * when multisampling rasterization is enabled."
- */
- if (!line_width) {
- line_width = 128; /* 1.0f */
-
- sf->dw_msaa |= line_width << GEN7_SF_DW2_LINE_WIDTH__SHIFT;
- }
- } else {
- sf->dw_msaa = 0;
- }
-
- rasterizer_init_sf_depth_offset_gen6(dev, state, sf);
- /* 3DSTATE_RASTER is Gen8+ only */
- sf->dw_raster = 0;
-}
-
-static uint32_t
-rasterizer_get_sf_raster_gen8(const struct ilo_dev *dev,
- const struct pipe_rasterizer_state *state)
-{
- uint32_t dw = 0;
-
- ILO_DEV_ASSERT(dev, 8, 8);
-
- if (state->front_ccw)
- dw |= GEN8_RASTER_DW1_FRONTWINDING_CCW;
-
- switch (state->cull_face) {
- case PIPE_FACE_NONE:
- dw |= GEN8_RASTER_DW1_CULLMODE_NONE;
- break;
- case PIPE_FACE_FRONT:
- dw |= GEN8_RASTER_DW1_CULLMODE_FRONT;
- break;
- case PIPE_FACE_BACK:
- dw |= GEN8_RASTER_DW1_CULLMODE_BACK;
- break;
- case PIPE_FACE_FRONT_AND_BACK:
- dw |= GEN8_RASTER_DW1_CULLMODE_BOTH;
- break;
- }
-
- if (state->point_smooth)
- dw |= GEN8_RASTER_DW1_SMOOTH_POINT_ENABLE;
-
- if (state->multisample)
- dw |= GEN8_RASTER_DW1_API_MULTISAMPLE_ENABLE;
-
- if (state->offset_tri)
- dw|= GEN8_RASTER_DW1_DEPTH_OFFSET_SOLID;
- if (state->offset_line)
- dw|= GEN8_RASTER_DW1_DEPTH_OFFSET_WIREFRAME;
- if (state->offset_point)
- dw|= GEN8_RASTER_DW1_DEPTH_OFFSET_POINT;
-
- switch (state->fill_front) {
- case PIPE_POLYGON_MODE_FILL:
- dw |= GEN8_RASTER_DW1_FRONTFACE_SOLID;
- break;
- case PIPE_POLYGON_MODE_LINE:
- dw |= GEN8_RASTER_DW1_FRONTFACE_WIREFRAME;
- break;
- case PIPE_POLYGON_MODE_POINT:
- dw |= GEN8_RASTER_DW1_FRONTFACE_POINT;
- break;
- }
-
- switch (state->fill_back) {
- case PIPE_POLYGON_MODE_FILL:
- dw |= GEN8_RASTER_DW1_BACKFACE_SOLID;
- break;
- case PIPE_POLYGON_MODE_LINE:
- dw |= GEN8_RASTER_DW1_BACKFACE_WIREFRAME;
- break;
- case PIPE_POLYGON_MODE_POINT:
- dw |= GEN8_RASTER_DW1_BACKFACE_POINT;
- break;
- }
-
- if (state->line_smooth)
- dw |= GEN8_RASTER_DW1_AA_LINE_ENABLE;
-
- if (state->scissor)
- dw |= GEN8_RASTER_DW1_SCISSOR_ENABLE;
-
- if (state->depth_clip)
- dw |= GEN8_RASTER_DW1_Z_TEST_ENABLE;
-
- return dw;
-}
-
-static void
-rasterizer_init_sf_gen8(const struct ilo_dev *dev,
- const struct pipe_rasterizer_state *state,
- struct ilo_rasterizer_sf *sf)
-{
- int line_width, point_width;
- uint32_t dw1, dw2, dw3;
-
- ILO_DEV_ASSERT(dev, 8, 8);
-
- /* in U3.7 */
- line_width = (int)
- ((state->line_width + (float) state->line_smooth) * 128.0f + 0.5f);
- line_width = CLAMP(line_width, 0, 1023);
-
- /* use GIQ rules */
- if (line_width == 128 && !state->line_smooth)
- line_width = 0;
-
- /* in U8.3 */
- point_width = (int) (state->point_size * 8.0f + 0.5f);
- point_width = CLAMP(point_width, 1, 2047);
-
- dw1 = GEN7_SF_DW1_STATISTICS |
- GEN7_SF_DW1_VIEWPORT_ENABLE;
-
- dw2 = line_width << GEN7_SF_DW2_LINE_WIDTH__SHIFT;
- if (state->line_smooth)
- dw2 |= GEN7_SF_DW2_AA_LINE_CAP_1_0;
-
- dw3 = GEN7_SF_DW3_TRUE_AA_LINE_DISTANCE |
- GEN7_SF_DW3_SUBPIXEL_8BITS |
- point_width;
-
- if (state->line_last_pixel)
- dw3 |= GEN7_SF_DW3_LINE_LAST_PIXEL_ENABLE;
-
- if (state->flatshade_first) {
- dw3 |= 0 << GEN7_SF_DW3_TRI_PROVOKE__SHIFT |
- 0 << GEN7_SF_DW3_LINE_PROVOKE__SHIFT |
- 1 << GEN7_SF_DW3_TRIFAN_PROVOKE__SHIFT;
- } else {
- dw3 |= 2 << GEN7_SF_DW3_TRI_PROVOKE__SHIFT |
- 1 << GEN7_SF_DW3_LINE_PROVOKE__SHIFT |
- 2 << GEN7_SF_DW3_TRIFAN_PROVOKE__SHIFT;
- }
-
- if (!state->point_size_per_vertex)
- dw3 |= GEN7_SF_DW3_USE_POINT_WIDTH;
-
- dw3 |= point_width;
-
- STATIC_ASSERT(Elements(sf->payload) >= 3);
- sf->payload[0] = dw1;
- sf->payload[1] = dw2;
- sf->payload[2] = dw3;
-
- rasterizer_init_sf_depth_offset_gen6(dev, state, sf);
-
- sf->dw_msaa = 0;
- sf->dw_raster = rasterizer_get_sf_raster_gen8(dev, state);
-}
-
-static void
-rasterizer_init_wm_gen6(const struct ilo_dev *dev,
- const struct pipe_rasterizer_state *state,
- struct ilo_rasterizer_wm *wm)
-{
- uint32_t dw5, dw6;
-
- ILO_DEV_ASSERT(dev, 6, 6);
-
- /* only the FF unit states are set, as in GEN7 */
-
- dw5 = GEN6_WM_DW5_AA_LINE_WIDTH_2_0;
-
- /* same value as in 3DSTATE_SF */
- if (state->line_smooth)
- dw5 |= GEN6_WM_DW5_AA_LINE_CAP_1_0;
-
- if (state->poly_stipple_enable)
- dw5 |= GEN6_WM_DW5_POLY_STIPPLE_ENABLE;
- if (state->line_stipple_enable)
- dw5 |= GEN6_WM_DW5_LINE_STIPPLE_ENABLE;
-
- /*
- * assertion that makes sure
- *
- * dw6 |= wm->dw_msaa_rast | wm->dw_msaa_disp;
- *
- * is valid
- */
- STATIC_ASSERT(GEN6_WM_DW6_MSRASTMODE_OFF_PIXEL == 0 &&
- GEN6_WM_DW6_MSDISPMODE_PERSAMPLE == 0);
- dw6 = GEN6_WM_DW6_ZW_INTERP_PIXEL;
-
- if (state->bottom_edge_rule)
- dw6 |= GEN6_WM_DW6_POINT_RASTRULE_UPPER_RIGHT;
-
- wm->dw_msaa_rast =
- (state->multisample) ? GEN6_WM_DW6_MSRASTMODE_ON_PATTERN : 0;
- wm->dw_msaa_disp = GEN6_WM_DW6_MSDISPMODE_PERPIXEL;
-
- STATIC_ASSERT(Elements(wm->payload) >= 2);
- wm->payload[0] = dw5;
- wm->payload[1] = dw6;
-}
-
-static void
-rasterizer_init_wm_gen7(const struct ilo_dev *dev,
- const struct pipe_rasterizer_state *state,
- struct ilo_rasterizer_wm *wm)
-{
- uint32_t dw1, dw2;
-
- ILO_DEV_ASSERT(dev, 7, 7.5);
-
- /*
- * assertion that makes sure
- *
- * dw1 |= wm->dw_msaa_rast;
- * dw2 |= wm->dw_msaa_disp;
- *
- * is valid
- */
- STATIC_ASSERT(GEN7_WM_DW1_MSRASTMODE_OFF_PIXEL == 0 &&
- GEN7_WM_DW2_MSDISPMODE_PERSAMPLE == 0);
- dw1 = GEN7_WM_DW1_ZW_INTERP_PIXEL |
- GEN7_WM_DW1_AA_LINE_WIDTH_2_0;
- dw2 = 0;
-
- /* same value as in 3DSTATE_SF */
- if (state->line_smooth)
- dw1 |= GEN7_WM_DW1_AA_LINE_CAP_1_0;
-
- if (state->poly_stipple_enable)
- dw1 |= GEN7_WM_DW1_POLY_STIPPLE_ENABLE;
- if (state->line_stipple_enable)
- dw1 |= GEN7_WM_DW1_LINE_STIPPLE_ENABLE;
-
- if (state->bottom_edge_rule)
- dw1 |= GEN7_WM_DW1_POINT_RASTRULE_UPPER_RIGHT;
-
- wm->dw_msaa_rast =
- (state->multisample) ? GEN7_WM_DW1_MSRASTMODE_ON_PATTERN : 0;
- wm->dw_msaa_disp = GEN7_WM_DW2_MSDISPMODE_PERPIXEL;
-
- STATIC_ASSERT(Elements(wm->payload) >= 2);
- wm->payload[0] = dw1;
- wm->payload[1] = dw2;
-}
-
-static uint32_t
-rasterizer_get_wm_gen8(const struct ilo_dev *dev,
- const struct pipe_rasterizer_state *state)
-{
- uint32_t dw;
-
- ILO_DEV_ASSERT(dev, 8, 8);
-
- dw = GEN7_WM_DW1_ZW_INTERP_PIXEL |
- GEN7_WM_DW1_AA_LINE_WIDTH_2_0;
-
- /* same value as in 3DSTATE_SF */
- if (state->line_smooth)
- dw |= GEN7_WM_DW1_AA_LINE_CAP_1_0;
-
- if (state->poly_stipple_enable)
- dw |= GEN7_WM_DW1_POLY_STIPPLE_ENABLE;
- if (state->line_stipple_enable)
- dw |= GEN7_WM_DW1_LINE_STIPPLE_ENABLE;
-
- if (state->bottom_edge_rule)
- dw |= GEN7_WM_DW1_POINT_RASTRULE_UPPER_RIGHT;
-
- return dw;
-}
-
-void
-ilo_gpe_init_rasterizer(const struct ilo_dev *dev,
- const struct pipe_rasterizer_state *state,
- struct ilo_rasterizer_state *rasterizer)
-{
- rasterizer_init_clip(dev, state, &rasterizer->clip);
-
- if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
- memset(&rasterizer->wm, 0, sizeof(rasterizer->wm));
- rasterizer->wm.payload[0] = rasterizer_get_wm_gen8(dev, state);
-
- rasterizer_init_sf_gen8(dev, state, &rasterizer->sf);
- } else if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
- rasterizer_init_wm_gen7(dev, state, &rasterizer->wm);
- rasterizer_init_sf_gen6(dev, state, &rasterizer->sf);
- } else {
- rasterizer_init_wm_gen6(dev, state, &rasterizer->wm);
- rasterizer_init_sf_gen6(dev, state, &rasterizer->sf);
- }
-}
-
-static void
-fs_init_cso_gen6(const struct ilo_dev *dev,
- const struct ilo_shader_state *fs,
- struct ilo_shader_cso *cso)
-{
- int start_grf, input_count, sampler_count, interps, max_threads;
- uint32_t dw2, dw4, dw5, dw6;
-
- ILO_DEV_ASSERT(dev, 6, 6);
-
- start_grf = ilo_shader_get_kernel_param(fs, ILO_KERNEL_URB_DATA_START_REG);
- input_count = ilo_shader_get_kernel_param(fs, ILO_KERNEL_INPUT_COUNT);
- sampler_count = ilo_shader_get_kernel_param(fs, ILO_KERNEL_SAMPLER_COUNT);
- interps = ilo_shader_get_kernel_param(fs,
- ILO_KERNEL_FS_BARYCENTRIC_INTERPOLATIONS);
-
- /* see brwCreateContext() */
- max_threads = (dev->gt == 2) ? 80 : 40;
-
- dw2 = (true) ? 0 : GEN6_THREADDISP_FP_MODE_ALT;
- dw2 |= ((sampler_count + 3) / 4) << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT;
-
- dw4 = start_grf << GEN6_WM_DW4_URB_GRF_START0__SHIFT |
- 0 << GEN6_WM_DW4_URB_GRF_START1__SHIFT |
- 0 << GEN6_WM_DW4_URB_GRF_START2__SHIFT;
-
- dw5 = (max_threads - 1) << GEN6_WM_DW5_MAX_THREADS__SHIFT;
-
- /*
- * From the Sandy Bridge PRM, volume 2 part 1, page 275:
- *
- * "This bit (Pixel Shader Kill Pixel), if ENABLED, indicates that the
- * PS kernel or color calculator has the ability to kill (discard)
- * pixels or samples, other than due to depth or stencil testing.
- * This bit is required to be ENABLED in the following situations:
- *
- * The API pixel shader program contains "killpix" or "discard"
- * instructions, or other code in the pixel shader kernel that can
- * cause the final pixel mask to differ from the pixel mask received
- * on dispatch.
- *
- * A sampler with chroma key enabled with kill pixel mode is used by
- * the pixel shader.
- *
- * Any render target has Alpha Test Enable or AlphaToCoverage Enable
- * enabled.
- *
- * The pixel shader kernel generates and outputs oMask.
- *
- * Note: As ClipDistance clipping is fully supported in hardware and
- * therefore not via PS instructions, there should be no need to
- * ENABLE this bit due to ClipDistance clipping."
- */
- if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_USE_KILL))
- dw5 |= GEN6_WM_DW5_PS_KILL_PIXEL;
-
- /*
- * From the Sandy Bridge PRM, volume 2 part 1, page 275:
- *
- * "If a NULL Depth Buffer is selected, the Pixel Shader Computed Depth
- * field must be set to disabled."
- *
- * TODO This is not checked yet.
- */
- if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_OUTPUT_Z))
- dw5 |= GEN6_WM_DW5_PS_COMPUTE_DEPTH;
-
- if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_INPUT_Z))
- dw5 |= GEN6_WM_DW5_PS_USE_DEPTH;
-
- if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_INPUT_W))
- dw5 |= GEN6_WM_DW5_PS_USE_W;
-
- /*
- * TODO set this bit only when
- *
- * a) fs writes colors and color is not masked, or
- * b) fs writes depth, or
- * c) fs or cc kills
- */
- if (true)
- dw5 |= GEN6_WM_DW5_PS_DISPATCH_ENABLE;
-
- assert(!ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_DISPATCH_16_OFFSET));
- dw5 |= GEN6_PS_DISPATCH_8 << GEN6_WM_DW5_PS_DISPATCH_MODE__SHIFT;
-
- dw6 = input_count << GEN6_WM_DW6_SF_ATTR_COUNT__SHIFT |
- GEN6_WM_DW6_PS_POSOFFSET_NONE |
- interps << GEN6_WM_DW6_BARYCENTRIC_INTERP__SHIFT;
-
- STATIC_ASSERT(Elements(cso->payload) >= 4);
- cso->payload[0] = dw2;
- cso->payload[1] = dw4;
- cso->payload[2] = dw5;
- cso->payload[3] = dw6;
-}
-
-static uint32_t
-fs_get_wm_gen7(const struct ilo_dev *dev,
- const struct ilo_shader_state *fs)
-{
- uint32_t dw;
-
- ILO_DEV_ASSERT(dev, 7, 7.5);
-
- dw = ilo_shader_get_kernel_param(fs,
- ILO_KERNEL_FS_BARYCENTRIC_INTERPOLATIONS) <<
- GEN7_WM_DW1_BARYCENTRIC_INTERP__SHIFT;
-
- /*
- * TODO set this bit only when
- *
- * a) fs writes colors and color is not masked, or
- * b) fs writes depth, or
- * c) fs or cc kills
- */
- dw |= GEN7_WM_DW1_PS_DISPATCH_ENABLE;
-
- /*
- * From the Ivy Bridge PRM, volume 2 part 1, page 278:
- *
- * "This bit (Pixel Shader Kill Pixel), if ENABLED, indicates that
- * the PS kernel or color calculator has the ability to kill
- * (discard) pixels or samples, other than due to depth or stencil
- * testing. This bit is required to be ENABLED in the following
- * situations:
- *
- * - The API pixel shader program contains "killpix" or "discard"
- * instructions, or other code in the pixel shader kernel that
- * can cause the final pixel mask to differ from the pixel mask
- * received on dispatch.
- *
- * - A sampler with chroma key enabled with kill pixel mode is used
- * by the pixel shader.
- *
- * - Any render target has Alpha Test Enable or AlphaToCoverage
- * Enable enabled.
- *
- * - The pixel shader kernel generates and outputs oMask.
- *
- * Note: As ClipDistance clipping is fully supported in hardware
- * and therefore not via PS instructions, there should be no need
- * to ENABLE this bit due to ClipDistance clipping."
- */
- if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_USE_KILL))
- dw |= GEN7_WM_DW1_PS_KILL_PIXEL;
-
- if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_OUTPUT_Z))
- dw |= GEN7_WM_DW1_PSCDEPTH_ON;
-
- if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_INPUT_Z))
- dw |= GEN7_WM_DW1_PS_USE_DEPTH;
-
- if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_INPUT_W))
- dw |= GEN7_WM_DW1_PS_USE_W;
-
- return dw;
-}
-
-static void
-fs_init_cso_gen7(const struct ilo_dev *dev,
- const struct ilo_shader_state *fs,
- struct ilo_shader_cso *cso)
-{
- int start_grf, sampler_count, max_threads;
- uint32_t dw2, dw4, dw5;
-
- ILO_DEV_ASSERT(dev, 7, 7.5);
-
- start_grf = ilo_shader_get_kernel_param(fs, ILO_KERNEL_URB_DATA_START_REG);
- sampler_count = ilo_shader_get_kernel_param(fs, ILO_KERNEL_SAMPLER_COUNT);
-
- dw2 = (true) ? 0 : GEN6_THREADDISP_FP_MODE_ALT;
- dw2 |= ((sampler_count + 3) / 4) << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT;
-
- dw4 = GEN7_PS_DW4_POSOFFSET_NONE;
-
- /* see brwCreateContext() */
- switch (ilo_dev_gen(dev)) {
- case ILO_GEN(7.5):
- max_threads = (dev->gt == 3) ? 408 : (dev->gt == 2) ? 204 : 102;
- dw4 |= (max_threads - 1) << GEN75_PS_DW4_MAX_THREADS__SHIFT;
- dw4 |= 1 << GEN75_PS_DW4_SAMPLE_MASK__SHIFT;
- break;
- case ILO_GEN(7):
- default:
- max_threads = (dev->gt == 2) ? 172 : 48;
- dw4 |= (max_threads - 1) << GEN7_PS_DW4_MAX_THREADS__SHIFT;
- break;
- }
-
- if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_PCB_CBUF0_SIZE))
- dw4 |= GEN7_PS_DW4_PUSH_CONSTANT_ENABLE;
-
- if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_INPUT_COUNT))
- dw4 |= GEN7_PS_DW4_ATTR_ENABLE;
-
- assert(!ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_DISPATCH_16_OFFSET));
- dw4 |= GEN6_PS_DISPATCH_8 << GEN7_PS_DW4_DISPATCH_MODE__SHIFT;
-
- dw5 = start_grf << GEN7_PS_DW5_URB_GRF_START0__SHIFT |
- 0 << GEN7_PS_DW5_URB_GRF_START1__SHIFT |
- 0 << GEN7_PS_DW5_URB_GRF_START2__SHIFT;
-
- STATIC_ASSERT(Elements(cso->payload) >= 4);
- cso->payload[0] = dw2;
- cso->payload[1] = dw4;
- cso->payload[2] = dw5;
- cso->payload[3] = fs_get_wm_gen7(dev, fs);
-}
-
-static uint32_t
-fs_get_psx_gen8(const struct ilo_dev *dev,
- const struct ilo_shader_state *fs)
-{
- uint32_t dw;
-
- ILO_DEV_ASSERT(dev, 8, 8);
-
- dw = GEN8_PSX_DW1_DISPATCH_ENABLE;
-
- if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_USE_KILL))
- dw |= GEN8_PSX_DW1_KILL_PIXEL;
- if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_OUTPUT_Z))
- dw |= GEN8_PSX_DW1_PSCDEPTH_ON;
- if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_INPUT_Z))
- dw |= GEN8_PSX_DW1_USE_DEPTH;
- if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_INPUT_W))
- dw |= GEN8_PSX_DW1_USE_W;
- if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_INPUT_COUNT))
- dw |= GEN8_PSX_DW1_ATTR_ENABLE;
-
- return dw;
-}
-
-static uint32_t
-fs_get_wm_gen8(const struct ilo_dev *dev,
- const struct ilo_shader_state *fs)
-{
- ILO_DEV_ASSERT(dev, 8, 8);
-
- return ilo_shader_get_kernel_param(fs,
- ILO_KERNEL_FS_BARYCENTRIC_INTERPOLATIONS) <<
- GEN7_WM_DW1_BARYCENTRIC_INTERP__SHIFT;
-}
-
-static void
-fs_init_cso_gen8(const struct ilo_dev *dev,
- const struct ilo_shader_state *fs,
- struct ilo_shader_cso *cso)
-{
- int start_grf, sampler_count;
- uint32_t dw3, dw6, dw7;
-
- ILO_DEV_ASSERT(dev, 8, 8);
-
- start_grf = ilo_shader_get_kernel_param(fs, ILO_KERNEL_URB_DATA_START_REG);
- sampler_count = ilo_shader_get_kernel_param(fs, ILO_KERNEL_SAMPLER_COUNT);
-
- dw3 = (true) ? 0 : GEN6_THREADDISP_FP_MODE_ALT;
- dw3 |= ((sampler_count + 3) / 4) << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT;
-
- /* always 64? */
- dw6 = (64 - 2) << GEN8_PS_DW6_MAX_THREADS__SHIFT |
- GEN8_PS_DW6_POSOFFSET_NONE;
- if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_PCB_CBUF0_SIZE))
- dw6 |= GEN8_PS_DW6_PUSH_CONSTANT_ENABLE;
-
- assert(!ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_DISPATCH_16_OFFSET));
- dw6 |= GEN6_PS_DISPATCH_8 << GEN8_PS_DW6_DISPATCH_MODE__SHIFT;
-
- dw7 = start_grf << GEN8_PS_DW7_URB_GRF_START0__SHIFT |
- 0 << GEN8_PS_DW7_URB_GRF_START1__SHIFT |
- 0 << GEN8_PS_DW7_URB_GRF_START2__SHIFT;
-
- STATIC_ASSERT(Elements(cso->payload) >= 5);
- cso->payload[0] = dw3;
- cso->payload[1] = dw6;
- cso->payload[2] = dw7;
- cso->payload[3] = fs_get_psx_gen8(dev, fs);
- cso->payload[4] = fs_get_wm_gen8(dev, fs);
-}
-
-void
-ilo_gpe_init_fs_cso(const struct ilo_dev *dev,
- const struct ilo_shader_state *fs,
- struct ilo_shader_cso *cso)
-{
- if (ilo_dev_gen(dev) >= ILO_GEN(8))
- fs_init_cso_gen8(dev, fs, cso);
- else if (ilo_dev_gen(dev) >= ILO_GEN(7))
- fs_init_cso_gen7(dev, fs, cso);
- else
- fs_init_cso_gen6(dev, fs, cso);
-}
-
-struct ilo_zs_surface_info {
- int surface_type;
- int format;
-
- struct {
- struct intel_bo *bo;
- unsigned stride;
- unsigned qpitch;
- enum gen_surface_tiling tiling;
- uint32_t offset;
- } zs, stencil, hiz;
-
- unsigned width, height, depth;
- unsigned lod, first_layer, num_layers;
-};
-
-static void
-zs_init_info_null(const struct ilo_dev *dev,
- struct ilo_zs_surface_info *info)
-{
- ILO_DEV_ASSERT(dev, 6, 8);
-
- memset(info, 0, sizeof(*info));
-
- info->surface_type = GEN6_SURFTYPE_NULL;
- info->format = GEN6_ZFORMAT_D32_FLOAT;
- info->width = 1;
- info->height = 1;
- info->depth = 1;
- info->num_layers = 1;
-}
-
-static void
-zs_init_info(const struct ilo_dev *dev,
- const struct ilo_image *img,
- const struct ilo_image *s8_img,
- enum pipe_texture_target target,
- enum pipe_format format, unsigned level,
- unsigned first_layer, unsigned num_layers,
- struct ilo_zs_surface_info *info)
-{
- bool separate_stencil;
-
- ILO_DEV_ASSERT(dev, 6, 8);
-
- memset(info, 0, sizeof(*info));
-
- info->surface_type = ilo_gpe_gen6_translate_texture(target);
-
- if (info->surface_type == GEN6_SURFTYPE_CUBE) {
- /*
- * From the Sandy Bridge PRM, volume 2 part 1, page 325-326:
- *
- * "For Other Surfaces (Cube Surfaces):
- * This field (Minimum Array Element) is ignored."
- *
- * "For Other Surfaces (Cube Surfaces):
- * This field (Render Target View Extent) is ignored."
- *
- * As such, we cannot set first_layer and num_layers on cube surfaces.
- * To work around that, treat it as a 2D surface.
- */
- info->surface_type = GEN6_SURFTYPE_2D;
- }
-
- if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
- separate_stencil = true;
- } else {
- /*
- * From the Sandy Bridge PRM, volume 2 part 1, page 317:
- *
- * "This field (Separate Stencil Buffer Enable) must be set to the
- * same value (enabled or disabled) as Hierarchical Depth Buffer
- * Enable."
- */
- separate_stencil = ilo_image_can_enable_aux(img, level);
- }
-
- /*
- * From the Sandy Bridge PRM, volume 2 part 1, page 317:
- *
- * "If this field (Hierarchical Depth Buffer Enable) is enabled, the
- * Surface Format of the depth buffer cannot be
- * D32_FLOAT_S8X24_UINT or D24_UNORM_S8_UINT. Use of stencil
- * requires the separate stencil buffer."
- *
- * From the Ironlake PRM, volume 2 part 1, page 330:
- *
- * "If this field (Separate Stencil Buffer Enable) is disabled, the
- * Surface Format of the depth buffer cannot be D24_UNORM_X8_UINT."
- *
- * There is no similar restriction for GEN6. But when D24_UNORM_X8_UINT
- * is indeed used, the depth values output by the fragment shaders will
- * be different when read back.
- *
- * As for GEN7+, separate_stencil is always true.
- */
- switch (format) {
- case PIPE_FORMAT_Z16_UNORM:
- info->format = GEN6_ZFORMAT_D16_UNORM;
- break;
- case PIPE_FORMAT_Z32_FLOAT:
- info->format = GEN6_ZFORMAT_D32_FLOAT;
- break;
- case PIPE_FORMAT_Z24X8_UNORM:
- case PIPE_FORMAT_Z24_UNORM_S8_UINT:
- info->format = (separate_stencil) ?
- GEN6_ZFORMAT_D24_UNORM_X8_UINT :
- GEN6_ZFORMAT_D24_UNORM_S8_UINT;
- break;
- case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
- info->format = (separate_stencil) ?
- GEN6_ZFORMAT_D32_FLOAT :
- GEN6_ZFORMAT_D32_FLOAT_S8X24_UINT;
- break;
- case PIPE_FORMAT_S8_UINT:
- if (separate_stencil) {
- info->format = GEN6_ZFORMAT_D32_FLOAT;
- break;
- }
- /* fall through */
- default:
- assert(!"unsupported depth/stencil format");
- zs_init_info_null(dev, info);
- return;
- break;
- }
-
- if (format != PIPE_FORMAT_S8_UINT) {
- info->zs.bo = img->bo;
- info->zs.stride = img->bo_stride;
-
- assert(img->walk_layer_height % 4 == 0);
- info->zs.qpitch = img->walk_layer_height / 4;
-
- info->zs.tiling = img->tiling;
- info->zs.offset = 0;
- }
-
- if (s8_img || format == PIPE_FORMAT_S8_UINT) {
- info->stencil.bo = s8_img->bo;
-
- /*
- * From the Sandy Bridge PRM, volume 2 part 1, page 329:
- *
- * "The pitch must be set to 2x the value computed based on width,
- * as the stencil buffer is stored with two rows interleaved."
- *
- * For GEN7, we still dobule the stride because we did not double the
- * slice widths when initializing the layout.
- */
- info->stencil.stride = s8_img->bo_stride * 2;
-
- assert(s8_img->walk_layer_height % 4 == 0);
- info->stencil.qpitch = s8_img->walk_layer_height / 4;
-
- info->stencil.tiling = s8_img->tiling;
-
- if (ilo_dev_gen(dev) == ILO_GEN(6)) {
- unsigned x, y;
-
- assert(s8_img->walk == ILO_IMAGE_WALK_LOD);
-
- /* offset to the level */
- ilo_image_get_slice_pos(s8_img, level, 0, &x, &y);
- ilo_image_pos_to_mem(s8_img, x, y, &x, &y);
- info->stencil.offset = ilo_image_mem_to_raw(s8_img, x, y);
- }
- }
-
- if (ilo_image_can_enable_aux(img, level)) {
- info->hiz.bo = img->aux.bo;
- info->hiz.stride = img->aux.bo_stride;
-
- assert(img->aux.walk_layer_height % 4 == 0);
- info->hiz.qpitch = img->aux.walk_layer_height / 4;
-
- info->hiz.tiling = GEN6_TILING_Y;
-
- /* offset to the level */
- if (ilo_dev_gen(dev) == ILO_GEN(6))
- info->hiz.offset = img->aux.walk_lod_offsets[level];
- }
-
- info->width = img->width0;
- info->height = img->height0;
- info->depth = (target == PIPE_TEXTURE_3D) ? img->depth0 : num_layers;
-
- info->lod = level;
- info->first_layer = first_layer;
- info->num_layers = num_layers;
-}
-
-void
-ilo_gpe_init_zs_surface(const struct ilo_dev *dev,
- const struct ilo_image *img,
- const struct ilo_image *s8_img,
- enum pipe_texture_target target,
- enum pipe_format format, unsigned level,
- unsigned first_layer, unsigned num_layers,
- struct ilo_zs_surface *zs)
-{
- const int max_2d_size = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 16384 : 8192;
- const int max_array_size = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 2048 : 512;
- struct ilo_zs_surface_info info;
- uint32_t dw1, dw2, dw3, dw4, dw5, dw6;
- int align_w = 8, align_h = 4;
-
- ILO_DEV_ASSERT(dev, 6, 8);
-
- if (img) {
- zs_init_info(dev, img, s8_img, target, format,
- level, first_layer, num_layers, &info);
-
- switch (img->sample_count) {
- case 2:
- align_w /= 2;
- break;
- case 4:
- align_w /= 2;
- align_h /= 2;
- break;
- case 8:
- align_w /= 4;
- align_h /= 2;
- break;
- case 16:
- align_w /= 4;
- align_h /= 4;
- break;
- default:
- break;
- }
- } else {
- zs_init_info_null(dev, &info);
- }
-
- switch (info.surface_type) {
- case GEN6_SURFTYPE_NULL:
- break;
- case GEN6_SURFTYPE_1D:
- assert(info.width <= max_2d_size && info.height == 1 &&
- info.depth <= max_array_size);
- assert(info.first_layer < max_array_size - 1 &&
- info.num_layers <= max_array_size);
- break;
- case GEN6_SURFTYPE_2D:
- assert(info.width <= max_2d_size && info.height <= max_2d_size &&
- info.depth <= max_array_size);
- assert(info.first_layer < max_array_size - 1 &&
- info.num_layers <= max_array_size);
- break;
- case GEN6_SURFTYPE_3D:
- assert(info.width <= 2048 && info.height <= 2048 && info.depth <= 2048);
- assert(info.first_layer < 2048 && info.num_layers <= max_array_size);
- break;
- case GEN6_SURFTYPE_CUBE:
- assert(info.width <= max_2d_size && info.height <= max_2d_size &&
- info.depth == 1);
- assert(info.first_layer == 0 && info.num_layers == 1);
- assert(info.width == info.height);
- break;
- default:
- assert(!"unexpected depth surface type");
- break;
- }
-
- dw1 = info.surface_type << GEN6_DEPTH_DW1_TYPE__SHIFT |
- info.format << GEN6_DEPTH_DW1_FORMAT__SHIFT;
-
- if (info.zs.bo) {
- /* required for GEN6+ */
- assert(info.zs.tiling == GEN6_TILING_Y);
- assert(info.zs.stride > 0 && info.zs.stride < 128 * 1024 &&
- info.zs.stride % 128 == 0);
- assert(info.width <= info.zs.stride);
-
- dw1 |= (info.zs.stride - 1);
- dw2 = info.zs.offset;
- } else {
- dw2 = 0;
- }
-
- if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
- if (info.zs.bo)
- dw1 |= GEN7_DEPTH_DW1_DEPTH_WRITE_ENABLE;
-
- if (info.stencil.bo)
- dw1 |= GEN7_DEPTH_DW1_STENCIL_WRITE_ENABLE;
-
- if (info.hiz.bo)
- dw1 |= GEN7_DEPTH_DW1_HIZ_ENABLE;
-
- dw3 = (info.height - 1) << GEN7_DEPTH_DW3_HEIGHT__SHIFT |
- (info.width - 1) << GEN7_DEPTH_DW3_WIDTH__SHIFT |
- info.lod << GEN7_DEPTH_DW3_LOD__SHIFT;
-
- zs->dw_aligned_8x4 =
- (align(info.height, align_h) - 1) << GEN7_DEPTH_DW3_HEIGHT__SHIFT |
- (align(info.width, align_w) - 1) << GEN7_DEPTH_DW3_WIDTH__SHIFT |
- info.lod << GEN7_DEPTH_DW3_LOD__SHIFT;
-
- dw4 = (info.depth - 1) << GEN7_DEPTH_DW4_DEPTH__SHIFT |
- info.first_layer << GEN7_DEPTH_DW4_MIN_ARRAY_ELEMENT__SHIFT;
-
- dw5 = 0;
-
- dw6 = (info.num_layers - 1) << GEN7_DEPTH_DW6_RT_VIEW_EXTENT__SHIFT;
-
- if (ilo_dev_gen(dev) >= ILO_GEN(8))
- dw6 |= info.zs.qpitch;
- } else {
- /* always Y-tiled */
- dw1 |= GEN6_TILING_Y << GEN6_DEPTH_DW1_TILING__SHIFT;
-
- if (info.hiz.bo) {
- dw1 |= GEN6_DEPTH_DW1_HIZ_ENABLE |
- GEN6_DEPTH_DW1_SEPARATE_STENCIL;
- }
-
- dw3 = (info.height - 1) << GEN6_DEPTH_DW3_HEIGHT__SHIFT |
- (info.width - 1) << GEN6_DEPTH_DW3_WIDTH__SHIFT |
- info.lod << GEN6_DEPTH_DW3_LOD__SHIFT |
- GEN6_DEPTH_DW3_MIPLAYOUT_BELOW;
-
- zs->dw_aligned_8x4 =
- (align(info.height, align_h) - 1) << GEN6_DEPTH_DW3_HEIGHT__SHIFT |
- (align(info.width, align_w) - 1) << GEN6_DEPTH_DW3_WIDTH__SHIFT |
- info.lod << GEN6_DEPTH_DW3_LOD__SHIFT |
- GEN6_DEPTH_DW3_MIPLAYOUT_BELOW;
-
- dw4 = (info.depth - 1) << GEN6_DEPTH_DW4_DEPTH__SHIFT |
- info.first_layer << GEN6_DEPTH_DW4_MIN_ARRAY_ELEMENT__SHIFT |
- (info.num_layers - 1) << GEN6_DEPTH_DW4_RT_VIEW_EXTENT__SHIFT;
-
- dw5 = 0;
-
- dw6 = 0;
- }
-
- STATIC_ASSERT(Elements(zs->payload) >= 12);
-
- zs->payload[0] = dw1;
- zs->payload[1] = dw2;
- zs->payload[2] = dw3;
- zs->payload[3] = dw4;
- zs->payload[4] = dw5;
- zs->payload[5] = dw6;
-
- /* do not increment reference count */
- zs->bo = info.zs.bo;
-
- /* separate stencil */
- if (info.stencil.bo) {
- assert(info.stencil.stride > 0 && info.stencil.stride < 128 * 1024 &&
- info.stencil.stride % 128 == 0);
-
- dw1 = (info.stencil.stride - 1) << GEN6_STENCIL_DW1_PITCH__SHIFT;
- if (ilo_dev_gen(dev) >= ILO_GEN(7.5))
- dw1 |= GEN75_STENCIL_DW1_STENCIL_BUFFER_ENABLE;
-
- dw2 = info.stencil.offset;
- dw4 = info.stencil.qpitch;
- } else {
- dw1 = 0;
- dw2 = 0;
- dw4 = 0;
- }
-
- zs->payload[6] = dw1;
- zs->payload[7] = dw2;
- zs->payload[8] = dw4;
- /* do not increment reference count */
- zs->separate_s8_bo = info.stencil.bo;
-
- /* hiz */
- if (info.hiz.bo) {
- dw1 = (info.hiz.stride - 1) << GEN6_HIZ_DW1_PITCH__SHIFT;
- dw2 = info.hiz.offset;
- dw4 = info.hiz.qpitch;
- } else {
- dw1 = 0;
- dw2 = 0;
- dw4 = 0;
- }
-
- zs->payload[9] = dw1;
- zs->payload[10] = dw2;
- zs->payload[11] = dw4;
- /* do not increment reference count */
- zs->hiz_bo = info.hiz.bo;
-}
-
-static void
-viewport_get_guardband(const struct ilo_dev *dev,
- int center_x, int center_y,
- int *min_gbx, int *max_gbx,
- int *min_gby, int *max_gby)
-{
- /*
- * From the Sandy Bridge PRM, volume 2 part 1, page 234:
- *
- * "Per-Device Guardband Extents
- *
- * - Supported X,Y ScreenSpace "Guardband" Extent: [-16K,16K-1]
- * - Maximum Post-Clamp Delta (X or Y): 16K"
- *
- * "In addition, in order to be correctly rendered, objects must have a
- * screenspace bounding box not exceeding 8K in the X or Y direction.
- * This additional restriction must also be comprehended by software,
- * i.e., enforced by use of clipping."
- *
- * From the Ivy Bridge PRM, volume 2 part 1, page 248:
- *
- * "Per-Device Guardband Extents
- *
- * - Supported X,Y ScreenSpace "Guardband" Extent: [-32K,32K-1]
- * - Maximum Post-Clamp Delta (X or Y): N/A"
- *
- * "In addition, in order to be correctly rendered, objects must have a
- * screenspace bounding box not exceeding 8K in the X or Y direction.
- * This additional restriction must also be comprehended by software,
- * i.e., enforced by use of clipping."
- *
- * Combined, the bounding box of any object can not exceed 8K in both
- * width and height.
- *
- * Below we set the guardband as a squre of length 8K, centered at where
- * the viewport is. This makes sure all objects passing the GB test are
- * valid to the renderer, and those failing the XY clipping have a
- * better chance of passing the GB test.
- */
- const int max_extent = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 32768 : 16384;
- const int half_len = 8192 / 2;
-
- /* make sure the guardband is within the valid range */
- if (center_x - half_len < -max_extent)
- center_x = -max_extent + half_len;
- else if (center_x + half_len > max_extent - 1)
- center_x = max_extent - half_len;
-
- if (center_y - half_len < -max_extent)
- center_y = -max_extent + half_len;
- else if (center_y + half_len > max_extent - 1)
- center_y = max_extent - half_len;
-
- *min_gbx = (float) (center_x - half_len);
- *max_gbx = (float) (center_x + half_len);
- *min_gby = (float) (center_y - half_len);
- *max_gby = (float) (center_y + half_len);
-}
-
-void
-ilo_gpe_set_viewport_cso(const struct ilo_dev *dev,
- const struct pipe_viewport_state *state,
- struct ilo_viewport_cso *vp)
-{
- const float scale_x = fabs(state->scale[0]);
- const float scale_y = fabs(state->scale[1]);
- const float scale_z = fabs(state->scale[2]);
- int min_gbx, max_gbx, min_gby, max_gby;
-
- ILO_DEV_ASSERT(dev, 6, 8);
-
- viewport_get_guardband(dev,
- (int) state->translate[0],
- (int) state->translate[1],
- &min_gbx, &max_gbx, &min_gby, &max_gby);
-
- /* matrix form */
- vp->m00 = state->scale[0];
- vp->m11 = state->scale[1];
- vp->m22 = state->scale[2];
- vp->m30 = state->translate[0];
- vp->m31 = state->translate[1];
- vp->m32 = state->translate[2];
-
- /* guardband in NDC space */
- vp->min_gbx = ((float) min_gbx - state->translate[0]) / scale_x;
- vp->max_gbx = ((float) max_gbx - state->translate[0]) / scale_x;
- vp->min_gby = ((float) min_gby - state->translate[1]) / scale_y;
- vp->max_gby = ((float) max_gby - state->translate[1]) / scale_y;
-
- /* viewport in screen space */
- vp->min_x = scale_x * -1.0f + state->translate[0];
- vp->max_x = scale_x * 1.0f + state->translate[0];
- vp->min_y = scale_y * -1.0f + state->translate[1];
- vp->max_y = scale_y * 1.0f + state->translate[1];
- vp->min_z = scale_z * -1.0f + state->translate[2];
- vp->max_z = scale_z * 1.0f + state->translate[2];
-}
-
-/**
- * Translate a pipe logicop to the matching hardware logicop.
- */
-static int
-gen6_translate_pipe_logicop(unsigned logicop)
-{
- switch (logicop) {
- case PIPE_LOGICOP_CLEAR: return GEN6_LOGICOP_CLEAR;
- case PIPE_LOGICOP_NOR: return GEN6_LOGICOP_NOR;
- case PIPE_LOGICOP_AND_INVERTED: return GEN6_LOGICOP_AND_INVERTED;
- case PIPE_LOGICOP_COPY_INVERTED: return GEN6_LOGICOP_COPY_INVERTED;
- case PIPE_LOGICOP_AND_REVERSE: return GEN6_LOGICOP_AND_REVERSE;
- case PIPE_LOGICOP_INVERT: return GEN6_LOGICOP_INVERT;
- case PIPE_LOGICOP_XOR: return GEN6_LOGICOP_XOR;
- case PIPE_LOGICOP_NAND: return GEN6_LOGICOP_NAND;
- case PIPE_LOGICOP_AND: return GEN6_LOGICOP_AND;
- case PIPE_LOGICOP_EQUIV: return GEN6_LOGICOP_EQUIV;
- case PIPE_LOGICOP_NOOP: return GEN6_LOGICOP_NOOP;
- case PIPE_LOGICOP_OR_INVERTED: return GEN6_LOGICOP_OR_INVERTED;
- case PIPE_LOGICOP_COPY: return GEN6_LOGICOP_COPY;
- case PIPE_LOGICOP_OR_REVERSE: return GEN6_LOGICOP_OR_REVERSE;
- case PIPE_LOGICOP_OR: return GEN6_LOGICOP_OR;
- case PIPE_LOGICOP_SET: return GEN6_LOGICOP_SET;
- default:
- assert(!"unknown logicop function");
- return GEN6_LOGICOP_CLEAR;
- }
-}
-
-/**
- * Translate a pipe blend function to the matching hardware blend function.
- */
-static int
-gen6_translate_pipe_blend(unsigned blend)
-{
- switch (blend) {
- case PIPE_BLEND_ADD: return GEN6_BLENDFUNCTION_ADD;
- case PIPE_BLEND_SUBTRACT: return GEN6_BLENDFUNCTION_SUBTRACT;
- case PIPE_BLEND_REVERSE_SUBTRACT: return GEN6_BLENDFUNCTION_REVERSE_SUBTRACT;
- case PIPE_BLEND_MIN: return GEN6_BLENDFUNCTION_MIN;
- case PIPE_BLEND_MAX: return GEN6_BLENDFUNCTION_MAX;
- default:
- assert(!"unknown blend function");
- return GEN6_BLENDFUNCTION_ADD;
- };
-}
-
-/**
- * Translate a pipe blend factor to the matching hardware blend factor.
- */
-static int
-gen6_translate_pipe_blendfactor(unsigned blendfactor)
-{
- switch (blendfactor) {
- case PIPE_BLENDFACTOR_ONE: return GEN6_BLENDFACTOR_ONE;
- case PIPE_BLENDFACTOR_SRC_COLOR: return GEN6_BLENDFACTOR_SRC_COLOR;
- case PIPE_BLENDFACTOR_SRC_ALPHA: return GEN6_BLENDFACTOR_SRC_ALPHA;
- case PIPE_BLENDFACTOR_DST_ALPHA: return GEN6_BLENDFACTOR_DST_ALPHA;
- case PIPE_BLENDFACTOR_DST_COLOR: return GEN6_BLENDFACTOR_DST_COLOR;
- case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: return GEN6_BLENDFACTOR_SRC_ALPHA_SATURATE;
- case PIPE_BLENDFACTOR_CONST_COLOR: return GEN6_BLENDFACTOR_CONST_COLOR;
- case PIPE_BLENDFACTOR_CONST_ALPHA: return GEN6_BLENDFACTOR_CONST_ALPHA;
- case PIPE_BLENDFACTOR_SRC1_COLOR: return GEN6_BLENDFACTOR_SRC1_COLOR;
- case PIPE_BLENDFACTOR_SRC1_ALPHA: return GEN6_BLENDFACTOR_SRC1_ALPHA;
- case PIPE_BLENDFACTOR_ZERO: return GEN6_BLENDFACTOR_ZERO;
- case PIPE_BLENDFACTOR_INV_SRC_COLOR: return GEN6_BLENDFACTOR_INV_SRC_COLOR;
- case PIPE_BLENDFACTOR_INV_SRC_ALPHA: return GEN6_BLENDFACTOR_INV_SRC_ALPHA;
- case PIPE_BLENDFACTOR_INV_DST_ALPHA: return GEN6_BLENDFACTOR_INV_DST_ALPHA;
- case PIPE_BLENDFACTOR_INV_DST_COLOR: return GEN6_BLENDFACTOR_INV_DST_COLOR;
- case PIPE_BLENDFACTOR_INV_CONST_COLOR: return GEN6_BLENDFACTOR_INV_CONST_COLOR;
- case PIPE_BLENDFACTOR_INV_CONST_ALPHA: return GEN6_BLENDFACTOR_INV_CONST_ALPHA;
- case PIPE_BLENDFACTOR_INV_SRC1_COLOR: return GEN6_BLENDFACTOR_INV_SRC1_COLOR;
- case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: return GEN6_BLENDFACTOR_INV_SRC1_ALPHA;
- default:
- assert(!"unknown blend factor");
- return GEN6_BLENDFACTOR_ONE;
- };
-}
-
-/**
- * Translate a pipe stencil op to the matching hardware stencil op.
- */
-static int
-gen6_translate_pipe_stencil_op(unsigned stencil_op)
-{
- switch (stencil_op) {
- case PIPE_STENCIL_OP_KEEP: return GEN6_STENCILOP_KEEP;
- case PIPE_STENCIL_OP_ZERO: return GEN6_STENCILOP_ZERO;
- case PIPE_STENCIL_OP_REPLACE: return GEN6_STENCILOP_REPLACE;
- case PIPE_STENCIL_OP_INCR: return GEN6_STENCILOP_INCRSAT;
- case PIPE_STENCIL_OP_DECR: return GEN6_STENCILOP_DECRSAT;
- case PIPE_STENCIL_OP_INCR_WRAP: return GEN6_STENCILOP_INCR;
- case PIPE_STENCIL_OP_DECR_WRAP: return GEN6_STENCILOP_DECR;
- case PIPE_STENCIL_OP_INVERT: return GEN6_STENCILOP_INVERT;
- default:
- assert(!"unknown stencil op");
- return GEN6_STENCILOP_KEEP;
- }
-}
-
-static int
-gen6_blend_factor_dst_alpha_forced_one(int factor)
-{
- switch (factor) {
- case GEN6_BLENDFACTOR_DST_ALPHA:
- return GEN6_BLENDFACTOR_ONE;
- case GEN6_BLENDFACTOR_INV_DST_ALPHA:
- case GEN6_BLENDFACTOR_SRC_ALPHA_SATURATE:
- return GEN6_BLENDFACTOR_ZERO;
- default:
- return factor;
- }
-}
-
-static uint32_t
-blend_get_rt_blend_enable_gen6(const struct ilo_dev *dev,
- const struct pipe_rt_blend_state *rt,
- bool dst_alpha_forced_one)
-{
- int rgb_src, rgb_dst, a_src, a_dst;
- uint32_t dw;
-
- ILO_DEV_ASSERT(dev, 6, 7.5);
-
- if (!rt->blend_enable)
- return 0;
-
- rgb_src = gen6_translate_pipe_blendfactor(rt->rgb_src_factor);
- rgb_dst = gen6_translate_pipe_blendfactor(rt->rgb_dst_factor);
- a_src = gen6_translate_pipe_blendfactor(rt->alpha_src_factor);
- a_dst = gen6_translate_pipe_blendfactor(rt->alpha_dst_factor);
-
- if (dst_alpha_forced_one) {
- rgb_src = gen6_blend_factor_dst_alpha_forced_one(rgb_src);
- rgb_dst = gen6_blend_factor_dst_alpha_forced_one(rgb_dst);
- a_src = gen6_blend_factor_dst_alpha_forced_one(a_src);
- a_dst = gen6_blend_factor_dst_alpha_forced_one(a_dst);
- }
-
- dw = GEN6_RT_DW0_BLEND_ENABLE |
- gen6_translate_pipe_blend(rt->alpha_func) << 26 |
- a_src << 20 |
- a_dst << 15 |
- gen6_translate_pipe_blend(rt->rgb_func) << 11 |
- rgb_src << 5 |
- rgb_dst;
-
- if (rt->rgb_func != rt->alpha_func ||
- rgb_src != a_src || rgb_dst != a_dst)
- dw |= GEN6_RT_DW0_INDEPENDENT_ALPHA_ENABLE;
-
- return dw;
-}
-
-static uint32_t
-blend_get_rt_blend_enable_gen8(const struct ilo_dev *dev,
- const struct pipe_rt_blend_state *rt,
- bool dst_alpha_forced_one,
- bool *independent_alpha)
-{
- int rgb_src, rgb_dst, a_src, a_dst;
- uint32_t dw;
-
- ILO_DEV_ASSERT(dev, 8, 8);
-
- if (!rt->blend_enable) {
- *independent_alpha = false;
- return 0;
- }
-
- rgb_src = gen6_translate_pipe_blendfactor(rt->rgb_src_factor);
- rgb_dst = gen6_translate_pipe_blendfactor(rt->rgb_dst_factor);
- a_src = gen6_translate_pipe_blendfactor(rt->alpha_src_factor);
- a_dst = gen6_translate_pipe_blendfactor(rt->alpha_dst_factor);
-
- if (dst_alpha_forced_one) {
- rgb_src = gen6_blend_factor_dst_alpha_forced_one(rgb_src);
- rgb_dst = gen6_blend_factor_dst_alpha_forced_one(rgb_dst);
- a_src = gen6_blend_factor_dst_alpha_forced_one(a_src);
- a_dst = gen6_blend_factor_dst_alpha_forced_one(a_dst);
- }
-
- dw = GEN8_RT_DW0_BLEND_ENABLE |
- rgb_src << 26 |
- rgb_dst << 21 |
- gen6_translate_pipe_blend(rt->rgb_func) << 18 |
- a_src << 13 |
- a_dst << 8 |
- gen6_translate_pipe_blend(rt->alpha_func) << 5;
-
- *independent_alpha = (rt->rgb_func != rt->alpha_func ||
- rgb_src != a_src ||
- rgb_dst != a_dst);
-
- return dw;
-}
-
-static void
-blend_init_cso_gen6(const struct ilo_dev *dev,
- const struct pipe_blend_state *state,
- struct ilo_blend_state *blend,
- unsigned index)
-{
- const struct pipe_rt_blend_state *rt = &state->rt[index];
- struct ilo_blend_cso *cso = &blend->cso[index];
-
- ILO_DEV_ASSERT(dev, 6, 7.5);
-
- cso->payload[0] = 0;
- cso->payload[1] = GEN6_RT_DW1_COLORCLAMP_RTFORMAT |
- GEN6_RT_DW1_PRE_BLEND_CLAMP |
- GEN6_RT_DW1_POST_BLEND_CLAMP;
-
- if (!(rt->colormask & PIPE_MASK_A))
- cso->payload[1] |= GEN6_RT_DW1_WRITE_DISABLE_A;
- if (!(rt->colormask & PIPE_MASK_R))
- cso->payload[1] |= GEN6_RT_DW1_WRITE_DISABLE_R;
- if (!(rt->colormask & PIPE_MASK_G))
- cso->payload[1] |= GEN6_RT_DW1_WRITE_DISABLE_G;
- if (!(rt->colormask & PIPE_MASK_B))
- cso->payload[1] |= GEN6_RT_DW1_WRITE_DISABLE_B;
-
- /*
- * From the Sandy Bridge PRM, volume 2 part 1, page 365:
- *
- * "Color Buffer Blending and Logic Ops must not be enabled
- * simultaneously, or behavior is UNDEFINED."
- *
- * Since state->logicop_enable takes precedence over rt->blend_enable,
- * no special care is needed.
- */
- if (state->logicop_enable) {
- cso->dw_blend = 0;
- cso->dw_blend_dst_alpha_forced_one = 0;
- } else {
- cso->dw_blend = blend_get_rt_blend_enable_gen6(dev, rt, false);
- cso->dw_blend_dst_alpha_forced_one =
- blend_get_rt_blend_enable_gen6(dev, rt, true);
- }
-}
-
-static bool
-blend_init_cso_gen8(const struct ilo_dev *dev,
- const struct pipe_blend_state *state,
- struct ilo_blend_state *blend,
- unsigned index)
-{
- const struct pipe_rt_blend_state *rt = &state->rt[index];
- struct ilo_blend_cso *cso = &blend->cso[index];
- bool independent_alpha = false;
-
- ILO_DEV_ASSERT(dev, 8, 8);
-
- cso->payload[0] = 0;
- cso->payload[1] = GEN8_RT_DW1_COLORCLAMP_RTFORMAT |
- GEN8_RT_DW1_PRE_BLEND_CLAMP |
- GEN8_RT_DW1_POST_BLEND_CLAMP;
-
- if (!(rt->colormask & PIPE_MASK_A))
- cso->payload[0] |= GEN8_RT_DW0_WRITE_DISABLE_A;
- if (!(rt->colormask & PIPE_MASK_R))
- cso->payload[0] |= GEN8_RT_DW0_WRITE_DISABLE_R;
- if (!(rt->colormask & PIPE_MASK_G))
- cso->payload[0] |= GEN8_RT_DW0_WRITE_DISABLE_G;
- if (!(rt->colormask & PIPE_MASK_B))
- cso->payload[0] |= GEN8_RT_DW0_WRITE_DISABLE_B;
-
- if (state->logicop_enable) {
- cso->dw_blend = 0;
- cso->dw_blend_dst_alpha_forced_one = 0;
- } else {
- bool tmp[2];
-
- cso->dw_blend = blend_get_rt_blend_enable_gen8(dev, rt, false, &tmp[0]);
- cso->dw_blend_dst_alpha_forced_one =
- blend_get_rt_blend_enable_gen8(dev, rt, true, &tmp[1]);
-
- if (tmp[0] || tmp[1])
- independent_alpha = true;
- }
-
- return independent_alpha;
-}
-
-static uint32_t
-blend_get_logicop_enable_gen6(const struct ilo_dev *dev,
- const struct pipe_blend_state *state)
-{
- ILO_DEV_ASSERT(dev, 6, 7.5);
-
- if (!state->logicop_enable)
- return 0;
-
- return GEN6_RT_DW1_LOGICOP_ENABLE |
- gen6_translate_pipe_logicop(state->logicop_func) << 18;
-}
-
-static uint32_t
-blend_get_logicop_enable_gen8(const struct ilo_dev *dev,
- const struct pipe_blend_state *state)
-{
- ILO_DEV_ASSERT(dev, 8, 8);
-
- if (!state->logicop_enable)
- return 0;
-
- return GEN8_RT_DW1_LOGICOP_ENABLE |
- gen6_translate_pipe_logicop(state->logicop_func) << 27;
-}
-
-static uint32_t
-blend_get_alpha_mod_gen6(const struct ilo_dev *dev,
- const struct pipe_blend_state *state,
- bool dual_blend)
-{
- uint32_t dw = 0;
-
- ILO_DEV_ASSERT(dev, 6, 7.5);
-
- if (state->alpha_to_coverage) {
- dw |= GEN6_RT_DW1_ALPHA_TO_COVERAGE;
- if (ilo_dev_gen(dev) >= ILO_GEN(7))
- dw |= GEN6_RT_DW1_ALPHA_TO_COVERAGE_DITHER;
- }
- /*
- * From the Sandy Bridge PRM, volume 2 part 1, page 378:
- *
- * "If Dual Source Blending is enabled, this bit (AlphaToOne Enable)
- * must be disabled."
- */
- if (state->alpha_to_one && !dual_blend)
- dw |= GEN6_RT_DW1_ALPHA_TO_ONE;
-
- return dw;
-}
-
-static uint32_t
-blend_get_alpha_mod_gen8(const struct ilo_dev *dev,
- const struct pipe_blend_state *state,
- bool dual_blend)
-{
- uint32_t dw = 0;
-
- ILO_DEV_ASSERT(dev, 8, 8);
-
- if (state->alpha_to_coverage) {
- dw |= GEN8_BLEND_DW0_ALPHA_TO_COVERAGE |
- GEN8_BLEND_DW0_ALPHA_TO_COVERAGE_DITHER;
- }
-
- if (state->alpha_to_one && !dual_blend)
- dw |= GEN8_BLEND_DW0_ALPHA_TO_ONE;
-
- return dw;
-}
-
-static uint32_t
-blend_get_ps_blend_gen8(const struct ilo_dev *dev, uint32_t rt_dw0)
-{
- int rgb_src, rgb_dst, a_src, a_dst;
- uint32_t dw;
-
- ILO_DEV_ASSERT(dev, 8, 8);
-
- if (!(rt_dw0 & GEN8_RT_DW0_BLEND_ENABLE))
- return 0;
-
- a_src = GEN_EXTRACT(rt_dw0, GEN8_RT_DW0_SRC_ALPHA_FACTOR);
- a_dst = GEN_EXTRACT(rt_dw0, GEN8_RT_DW0_DST_ALPHA_FACTOR);
- rgb_src = GEN_EXTRACT(rt_dw0, GEN8_RT_DW0_SRC_COLOR_FACTOR);
- rgb_dst = GEN_EXTRACT(rt_dw0, GEN8_RT_DW0_DST_COLOR_FACTOR);
-
- dw = GEN8_PS_BLEND_DW1_BLEND_ENABLE;
- dw |= GEN_SHIFT32(a_src, GEN8_PS_BLEND_DW1_SRC_ALPHA_FACTOR);
- dw |= GEN_SHIFT32(a_dst, GEN8_PS_BLEND_DW1_DST_ALPHA_FACTOR);
- dw |= GEN_SHIFT32(rgb_src, GEN8_PS_BLEND_DW1_SRC_COLOR_FACTOR);
- dw |= GEN_SHIFT32(rgb_dst, GEN8_PS_BLEND_DW1_DST_COLOR_FACTOR);
-
- if (a_src != rgb_src || a_dst != rgb_dst)
- dw |= GEN8_PS_BLEND_DW1_INDEPENDENT_ALPHA_ENABLE;
-
- return dw;
-}
-
-void
-ilo_gpe_init_blend(const struct ilo_dev *dev,
- const struct pipe_blend_state *state,
- struct ilo_blend_state *blend)
-{
- unsigned i;
-
- ILO_DEV_ASSERT(dev, 6, 8);
-
- blend->dual_blend = (util_blend_state_is_dual(state, 0) &&
- state->rt[0].blend_enable &&
- !state->logicop_enable);
- blend->alpha_to_coverage = state->alpha_to_coverage;
-
- if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
- bool independent_alpha;
-
- blend->dw_alpha_mod =
- blend_get_alpha_mod_gen8(dev, state, blend->dual_blend);
- blend->dw_logicop = blend_get_logicop_enable_gen8(dev, state);
- blend->dw_shared = (state->dither) ? GEN8_BLEND_DW0_DITHER_ENABLE : 0;
-
- independent_alpha = blend_init_cso_gen8(dev, state, blend, 0);
- if (independent_alpha)
- blend->dw_shared |= GEN8_BLEND_DW0_INDEPENDENT_ALPHA_ENABLE;
-
- blend->dw_ps_blend = blend_get_ps_blend_gen8(dev,
- blend->cso[0].dw_blend);
- blend->dw_ps_blend_dst_alpha_forced_one = blend_get_ps_blend_gen8(dev,
- blend->cso[0].dw_blend_dst_alpha_forced_one);
-
- if (state->independent_blend_enable) {
- for (i = 1; i < Elements(blend->cso); i++) {
- independent_alpha = blend_init_cso_gen8(dev, state, blend, i);
- if (independent_alpha)
- blend->dw_shared |= GEN8_BLEND_DW0_INDEPENDENT_ALPHA_ENABLE;
- }
- } else {
- for (i = 1; i < Elements(blend->cso); i++)
- blend->cso[i] = blend->cso[0];
- }
- } else {
- blend->dw_alpha_mod =
- blend_get_alpha_mod_gen6(dev, state, blend->dual_blend);
- blend->dw_logicop = blend_get_logicop_enable_gen6(dev, state);
- blend->dw_shared = (state->dither) ? GEN6_RT_DW1_DITHER_ENABLE : 0;
-
- blend->dw_ps_blend = 0;
- blend->dw_ps_blend_dst_alpha_forced_one = 0;
-
- blend_init_cso_gen6(dev, state, blend, 0);
- if (state->independent_blend_enable) {
- for (i = 1; i < Elements(blend->cso); i++)
- blend_init_cso_gen6(dev, state, blend, i);
- } else {
- for (i = 1; i < Elements(blend->cso); i++)
- blend->cso[i] = blend->cso[0];
- }
- }
-}
-
-/**
- * Translate a pipe DSA test function to the matching hardware compare
- * function.
- */
-static int
-gen6_translate_dsa_func(unsigned func)
-{
- switch (func) {
- case PIPE_FUNC_NEVER: return GEN6_COMPAREFUNCTION_NEVER;
- case PIPE_FUNC_LESS: return GEN6_COMPAREFUNCTION_LESS;
- case PIPE_FUNC_EQUAL: return GEN6_COMPAREFUNCTION_EQUAL;
- case PIPE_FUNC_LEQUAL: return GEN6_COMPAREFUNCTION_LEQUAL;
- case PIPE_FUNC_GREATER: return GEN6_COMPAREFUNCTION_GREATER;
- case PIPE_FUNC_NOTEQUAL: return GEN6_COMPAREFUNCTION_NOTEQUAL;
- case PIPE_FUNC_GEQUAL: return GEN6_COMPAREFUNCTION_GEQUAL;
- case PIPE_FUNC_ALWAYS: return GEN6_COMPAREFUNCTION_ALWAYS;
- default:
- assert(!"unknown depth/stencil/alpha test function");
- return GEN6_COMPAREFUNCTION_NEVER;
- }
-}
-
-static uint32_t
-dsa_get_stencil_enable_gen6(const struct ilo_dev *dev,
- const struct pipe_stencil_state *stencil0,
- const struct pipe_stencil_state *stencil1)
-{
- uint32_t dw;
-
- ILO_DEV_ASSERT(dev, 6, 7.5);
-
- if (!stencil0->enabled)
- return 0;
-
- /*
- * From the Sandy Bridge PRM, volume 2 part 1, page 359:
- *
- * "If the Depth Buffer is either undefined or does not have a surface
- * format of D32_FLOAT_S8X24_UINT or D24_UNORM_S8_UINT and separate
- * stencil buffer is disabled, Stencil Test Enable must be DISABLED"
- *
- * From the Sandy Bridge PRM, volume 2 part 1, page 370:
- *
- * "This field (Stencil Test Enable) cannot be enabled if
- * Surface Format in 3DSTATE_DEPTH_BUFFER is set to D16_UNORM."
- *
- * TODO We do not check these yet.
- */
- dw = GEN6_ZS_DW0_STENCIL_TEST_ENABLE |
- gen6_translate_dsa_func(stencil0->func) << 28 |
- gen6_translate_pipe_stencil_op(stencil0->fail_op) << 25 |
- gen6_translate_pipe_stencil_op(stencil0->zfail_op) << 22 |
- gen6_translate_pipe_stencil_op(stencil0->zpass_op) << 19;
- if (stencil0->writemask)
- dw |= GEN6_ZS_DW0_STENCIL_WRITE_ENABLE;
-
- if (stencil1->enabled) {
- dw |= GEN6_ZS_DW0_STENCIL1_ENABLE |
- gen6_translate_dsa_func(stencil1->func) << 12 |
- gen6_translate_pipe_stencil_op(stencil1->fail_op) << 9 |
- gen6_translate_pipe_stencil_op(stencil1->zfail_op) << 6 |
- gen6_translate_pipe_stencil_op(stencil1->zpass_op) << 3;
- if (stencil1->writemask)
- dw |= GEN6_ZS_DW0_STENCIL_WRITE_ENABLE;
- }
-
- return dw;
-}
-
-static uint32_t
-dsa_get_stencil_enable_gen8(const struct ilo_dev *dev,
- const struct pipe_stencil_state *stencil0,
- const struct pipe_stencil_state *stencil1)
-{
- uint32_t dw;
-
- ILO_DEV_ASSERT(dev, 8, 8);
-
- if (!stencil0->enabled)
- return 0;
-
- dw = gen6_translate_pipe_stencil_op(stencil0->fail_op) << 29 |
- gen6_translate_pipe_stencil_op(stencil0->zfail_op) << 26 |
- gen6_translate_pipe_stencil_op(stencil0->zpass_op) << 23 |
- gen6_translate_dsa_func(stencil0->func) << 8 |
- GEN8_ZS_DW1_STENCIL_TEST_ENABLE;
- if (stencil0->writemask)
- dw |= GEN8_ZS_DW1_STENCIL_WRITE_ENABLE;
-
- if (stencil1->enabled) {
- dw |= gen6_translate_dsa_func(stencil1->func) << 20 |
- gen6_translate_pipe_stencil_op(stencil1->fail_op) << 17 |
- gen6_translate_pipe_stencil_op(stencil1->zfail_op) << 14 |
- gen6_translate_pipe_stencil_op(stencil1->zpass_op) << 11 |
- GEN8_ZS_DW1_STENCIL1_ENABLE;
- if (stencil1->writemask)
- dw |= GEN8_ZS_DW1_STENCIL_WRITE_ENABLE;
- }
-
- return dw;
-}
-
-static uint32_t
-dsa_get_depth_enable_gen6(const struct ilo_dev *dev,
- const struct pipe_depth_state *state)
-{
- uint32_t dw;
-
- ILO_DEV_ASSERT(dev, 6, 7.5);
-
- /*
- * From the Sandy Bridge PRM, volume 2 part 1, page 360:
- *
- * "Enabling the Depth Test function without defining a Depth Buffer is
- * UNDEFINED."
- *
- * From the Sandy Bridge PRM, volume 2 part 1, page 375:
- *
- * "A Depth Buffer must be defined before enabling writes to it, or
- * operation is UNDEFINED."
- *
- * TODO We do not check these yet.
- */
- if (state->enabled) {
- dw = GEN6_ZS_DW2_DEPTH_TEST_ENABLE |
- gen6_translate_dsa_func(state->func) << 27;
- } else {
- dw = GEN6_COMPAREFUNCTION_ALWAYS << 27;
- }
-
- if (state->writemask)
- dw |= GEN6_ZS_DW2_DEPTH_WRITE_ENABLE;
-
- return dw;
-}
-
-static uint32_t
-dsa_get_depth_enable_gen8(const struct ilo_dev *dev,
- const struct pipe_depth_state *state)
-{
- uint32_t dw;
-
- ILO_DEV_ASSERT(dev, 8, 8);
-
- if (state->enabled) {
- dw = GEN8_ZS_DW1_DEPTH_TEST_ENABLE |
- gen6_translate_dsa_func(state->func) << 5;
- } else {
- dw = GEN6_COMPAREFUNCTION_ALWAYS << 5;
- }
-
- if (state->writemask)
- dw |= GEN8_ZS_DW1_DEPTH_WRITE_ENABLE;
-
- return dw;
-}
-
-static uint32_t
-dsa_get_alpha_enable_gen6(const struct ilo_dev *dev,
- const struct pipe_alpha_state *state)
-{
- uint32_t dw;
-
- ILO_DEV_ASSERT(dev, 6, 7.5);
-
- if (!state->enabled)
- return 0;
-
- /* this will be ORed to BLEND_STATE */
- dw = GEN6_RT_DW1_ALPHA_TEST_ENABLE |
- gen6_translate_dsa_func(state->func) << 13;
-
- return dw;
-}
-
-static uint32_t
-dsa_get_alpha_enable_gen8(const struct ilo_dev *dev,
- const struct pipe_alpha_state *state)
-{
- uint32_t dw;
-
- ILO_DEV_ASSERT(dev, 8, 8);
-
- if (!state->enabled)
- return 0;
-
- /* this will be ORed to BLEND_STATE */
- dw = GEN8_BLEND_DW0_ALPHA_TEST_ENABLE |
- gen6_translate_dsa_func(state->func) << 24;
-
- return dw;
-}
-
-void
-ilo_gpe_init_dsa(const struct ilo_dev *dev,
- const struct pipe_depth_stencil_alpha_state *state,
- struct ilo_dsa_state *dsa)
-{
- ILO_DEV_ASSERT(dev, 6, 8);
-
- STATIC_ASSERT(Elements(dsa->payload) >= 3);
-
- if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
- const uint32_t dw_stencil = dsa_get_stencil_enable_gen8(dev,
- &state->stencil[0], &state->stencil[1]);
- const uint32_t dw_depth = dsa_get_depth_enable_gen8(dev, &state->depth);
-
- assert(!(dw_stencil & dw_depth));
- dsa->payload[0] = dw_stencil | dw_depth;
-
- dsa->dw_blend_alpha = dsa_get_alpha_enable_gen8(dev, &state->alpha);
- dsa->dw_ps_blend_alpha = (state->alpha.enabled) ?
- GEN8_PS_BLEND_DW1_ALPHA_TEST_ENABLE : 0;
- } else {
- dsa->payload[0] = dsa_get_stencil_enable_gen6(dev,
- &state->stencil[0], &state->stencil[1]);
- dsa->payload[2] = dsa_get_depth_enable_gen6(dev, &state->depth);
-
- dsa->dw_blend_alpha = dsa_get_alpha_enable_gen6(dev, &state->alpha);
- dsa->dw_ps_blend_alpha = 0;
- }
-
- dsa->payload[1] = state->stencil[0].valuemask << 24 |
- state->stencil[0].writemask << 16 |
- state->stencil[1].valuemask << 8 |
- state->stencil[1].writemask;
-
- dsa->alpha_ref = float_to_ubyte(state->alpha.ref_value);
-}
-
-void
-ilo_gpe_set_scissor(const struct ilo_dev *dev,
- unsigned start_slot,
- unsigned num_states,
- const struct pipe_scissor_state *states,
- struct ilo_scissor_state *scissor)
-{
- unsigned i;
-
- ILO_DEV_ASSERT(dev, 6, 8);
-
- for (i = 0; i < num_states; i++) {
- uint16_t min_x, min_y, max_x, max_y;
-
- /* both max and min are inclusive in SCISSOR_RECT */
- if (states[i].minx < states[i].maxx &&
- states[i].miny < states[i].maxy) {
- min_x = states[i].minx;
- min_y = states[i].miny;
- max_x = states[i].maxx - 1;
- max_y = states[i].maxy - 1;
- }
- else {
- /* we have to make min greater than max */
- min_x = 1;
- min_y = 1;
- max_x = 0;
- max_y = 0;
- }
-
- scissor->payload[(start_slot + i) * 2 + 0] = min_y << 16 | min_x;
- scissor->payload[(start_slot + i) * 2 + 1] = max_y << 16 | max_x;
- }
-
- if (!start_slot && num_states)
- scissor->scissor0 = states[0];
-}
-
-void
-ilo_gpe_set_scissor_null(const struct ilo_dev *dev,
- struct ilo_scissor_state *scissor)
-{
- unsigned i;
-
- for (i = 0; i < Elements(scissor->payload); i += 2) {
- scissor->payload[i + 0] = 1 << 16 | 1;
- scissor->payload[i + 1] = 0;
- }
-}
-
-static void
-fb_set_blend_caps(const struct ilo_dev *dev,
- enum pipe_format format,
- struct ilo_fb_blend_caps *caps)
-{
- const struct util_format_description *desc =
- util_format_description(format);
- const int ch = util_format_get_first_non_void_channel(format);
-
- memset(caps, 0, sizeof(*caps));
-
- if (format == PIPE_FORMAT_NONE || desc->is_mixed)
- return;
-
- /*
- * From the Sandy Bridge PRM, volume 2 part 1, page 365:
- *
- * "Logic Ops are only supported on *_UNORM surfaces (excluding _SRGB
- * variants), otherwise Logic Ops must be DISABLED."
- *
- * According to the classic driver, this is lifted on Gen8+.
- */
- if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
- caps->can_logicop = true;
- } else {
- caps->can_logicop = (ch >= 0 && desc->channel[ch].normalized &&
- desc->channel[ch].type == UTIL_FORMAT_TYPE_UNSIGNED &&
- desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB);
- }
-
- /* no blending for pure integer formats */
- caps->can_blend = !util_format_is_pure_integer(format);
-
- /*
- * From the Sandy Bridge PRM, volume 2 part 1, page 382:
- *
- * "Alpha Test can only be enabled if Pixel Shader outputs a float
- * alpha value."
- */
- caps->can_alpha_test = !util_format_is_pure_integer(format);
-
- caps->dst_alpha_forced_one =
- (ilo_format_translate_render(dev, format) !=
- ilo_format_translate_color(dev, format));
-
- /* sanity check */
- if (caps->dst_alpha_forced_one) {
- enum pipe_format render_format;
-
- switch (format) {
- case PIPE_FORMAT_B8G8R8X8_UNORM:
- render_format = PIPE_FORMAT_B8G8R8A8_UNORM;
- break;
- default:
- render_format = PIPE_FORMAT_NONE;
- break;
- }
-
- assert(ilo_format_translate_render(dev, format) ==
- ilo_format_translate_color(dev, render_format));
- }
-}
-
-void
-ilo_gpe_set_fb(const struct ilo_dev *dev,
- const struct pipe_framebuffer_state *state,
- struct ilo_fb_state *fb)
-{
- const struct pipe_surface *first_surf = NULL;
- int i;
-
- ILO_DEV_ASSERT(dev, 6, 8);
-
- util_copy_framebuffer_state(&fb->state, state);
-
- ilo_gpe_init_view_surface_null(dev,
- (state->width) ? state->width : 1,
- (state->height) ? state->height : 1,
- 1, 0, &fb->null_rt);
-
- for (i = 0; i < state->nr_cbufs; i++) {
- if (state->cbufs[i]) {
- fb_set_blend_caps(dev, state->cbufs[i]->format, &fb->blend_caps[i]);
-
- if (!first_surf)
- first_surf = state->cbufs[i];
- } else {
- fb_set_blend_caps(dev, PIPE_FORMAT_NONE, &fb->blend_caps[i]);
- }
- }
-
- if (!first_surf && state->zsbuf)
- first_surf = state->zsbuf;
-
- fb->num_samples = (first_surf) ? first_surf->texture->nr_samples : 1;
- if (!fb->num_samples)
- fb->num_samples = 1;
-
- /*
- * The PRMs list several restrictions when the framebuffer has more than
- * one surface. It seems they are actually lifted on GEN6+.
- */
-}
diff --git a/src/gallium/drivers/ilo/core/ilo_state_3d_top.c b/src/gallium/drivers/ilo/core/ilo_state_3d_top.c
deleted file mode 100644
index c17957fb704..00000000000
--- a/src/gallium/drivers/ilo/core/ilo_state_3d_top.c
+++ /dev/null
@@ -1,1716 +0,0 @@
-/*
- * Mesa 3-D graphics library
- *
- * Copyright (C) 2012-2014 LunarG, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- * Chia-I Wu <[email protected]>
- */
-
-#include "genhw/genhw.h"
-#include "util/u_dual_blend.h"
-#include "util/u_framebuffer.h"
-#include "util/u_half.h"
-#include "util/u_resource.h"
-
-#include "ilo_buffer.h"
-#include "ilo_format.h"
-#include "ilo_image.h"
-#include "ilo_state_3d.h"
-#include "../ilo_shader.h"
-
-static void
-ve_init_cso(const struct ilo_dev *dev,
- const struct pipe_vertex_element *state,
- unsigned vb_index,
- struct ilo_ve_cso *cso)
-{
- int comp[4] = {
- GEN6_VFCOMP_STORE_SRC,
- GEN6_VFCOMP_STORE_SRC,
- GEN6_VFCOMP_STORE_SRC,
- GEN6_VFCOMP_STORE_SRC,
- };
- int format;
-
- ILO_DEV_ASSERT(dev, 6, 8);
-
- switch (util_format_get_nr_components(state->src_format)) {
- case 1: comp[1] = GEN6_VFCOMP_STORE_0;
- case 2: comp[2] = GEN6_VFCOMP_STORE_0;
- case 3: comp[3] = (util_format_is_pure_integer(state->src_format)) ?
- GEN6_VFCOMP_STORE_1_INT :
- GEN6_VFCOMP_STORE_1_FP;
- }
-
- format = ilo_format_translate_vertex(dev, state->src_format);
-
- STATIC_ASSERT(Elements(cso->payload) >= 2);
- cso->payload[0] =
- vb_index << GEN6_VE_DW0_VB_INDEX__SHIFT |
- GEN6_VE_DW0_VALID |
- format << GEN6_VE_DW0_FORMAT__SHIFT |
- state->src_offset << GEN6_VE_DW0_VB_OFFSET__SHIFT;
-
- cso->payload[1] =
- comp[0] << GEN6_VE_DW1_COMP0__SHIFT |
- comp[1] << GEN6_VE_DW1_COMP1__SHIFT |
- comp[2] << GEN6_VE_DW1_COMP2__SHIFT |
- comp[3] << GEN6_VE_DW1_COMP3__SHIFT;
-}
-
-void
-ilo_gpe_init_ve(const struct ilo_dev *dev,
- unsigned num_states,
- const struct pipe_vertex_element *states,
- struct ilo_ve_state *ve)
-{
- unsigned i;
-
- ILO_DEV_ASSERT(dev, 6, 8);
-
- ve->count = num_states;
- ve->vb_count = 0;
-
- for (i = 0; i < num_states; i++) {
- const unsigned pipe_idx = states[i].vertex_buffer_index;
- const unsigned instance_divisor = states[i].instance_divisor;
- unsigned hw_idx;
-
- /*
- * map the pipe vb to the hardware vb, which has a fixed instance
- * divisor
- */
- for (hw_idx = 0; hw_idx < ve->vb_count; hw_idx++) {
- if (ve->vb_mapping[hw_idx] == pipe_idx &&
- ve->instance_divisors[hw_idx] == instance_divisor)
- break;
- }
-
- /* create one if there is no matching hardware vb */
- if (hw_idx >= ve->vb_count) {
- hw_idx = ve->vb_count++;
-
- ve->vb_mapping[hw_idx] = pipe_idx;
- ve->instance_divisors[hw_idx] = instance_divisor;
- }
-
- ve_init_cso(dev, &states[i], hw_idx, &ve->cso[i]);
- }
-}
-
-void
-ilo_gpe_set_ve_edgeflag(const struct ilo_dev *dev,
- struct ilo_ve_cso *cso)
-{
- int format;
-
- ILO_DEV_ASSERT(dev, 6, 8);
-
- /*
- * From the Sandy Bridge PRM, volume 2 part 1, page 94:
- *
- * "- This bit (Edge Flag Enable) must only be ENABLED on the last
- * valid VERTEX_ELEMENT structure.
- *
- * - When set, Component 0 Control must be set to VFCOMP_STORE_SRC,
- * and Component 1-3 Control must be set to VFCOMP_NOSTORE.
- *
- * - The Source Element Format must be set to the UINT format.
- *
- * - [DevSNB]: Edge Flags are not supported for QUADLIST
- * primitives. Software may elect to convert QUADLIST primitives
- * to some set of corresponding edge-flag-supported primitive
- * types (e.g., POLYGONs) prior to submission to the 3D pipeline."
- */
- cso->payload[0] |= GEN6_VE_DW0_EDGE_FLAG_ENABLE;
-
- /*
- * Edge flags have format GEN6_FORMAT_R8_USCALED when defined via
- * glEdgeFlagPointer(), and format GEN6_FORMAT_R32_FLOAT when defined
- * via glEdgeFlag(), as can be seen in vbo_attrib_tmp.h.
- *
- * Since all the hardware cares about is whether the flags are zero or not,
- * we can treat them as the corresponding _UINT formats.
- */
- format = GEN_EXTRACT(cso->payload[0], GEN6_VE_DW0_FORMAT);
- cso->payload[0] &= ~GEN6_VE_DW0_FORMAT__MASK;
-
- switch (format) {
- case GEN6_FORMAT_R32_FLOAT:
- format = GEN6_FORMAT_R32_UINT;
- break;
- case GEN6_FORMAT_R8_USCALED:
- format = GEN6_FORMAT_R8_UINT;
- break;
- default:
- break;
- }
-
- cso->payload[0] |= GEN_SHIFT32(format, GEN6_VE_DW0_FORMAT);
-
- cso->payload[1] =
- GEN6_VFCOMP_STORE_SRC << GEN6_VE_DW1_COMP0__SHIFT |
- GEN6_VFCOMP_NOSTORE << GEN6_VE_DW1_COMP1__SHIFT |
- GEN6_VFCOMP_NOSTORE << GEN6_VE_DW1_COMP2__SHIFT |
- GEN6_VFCOMP_NOSTORE << GEN6_VE_DW1_COMP3__SHIFT;
-}
-
-void
-ilo_gpe_init_ve_nosrc(const struct ilo_dev *dev,
- int comp0, int comp1, int comp2, int comp3,
- struct ilo_ve_cso *cso)
-{
- ILO_DEV_ASSERT(dev, 6, 8);
-
- STATIC_ASSERT(Elements(cso->payload) >= 2);
-
- assert(comp0 != GEN6_VFCOMP_STORE_SRC &&
- comp1 != GEN6_VFCOMP_STORE_SRC &&
- comp2 != GEN6_VFCOMP_STORE_SRC &&
- comp3 != GEN6_VFCOMP_STORE_SRC);
-
- cso->payload[0] = GEN6_VE_DW0_VALID;
- cso->payload[1] =
- comp0 << GEN6_VE_DW1_COMP0__SHIFT |
- comp1 << GEN6_VE_DW1_COMP1__SHIFT |
- comp2 << GEN6_VE_DW1_COMP2__SHIFT |
- comp3 << GEN6_VE_DW1_COMP3__SHIFT;
-}
-
-void
-ilo_gpe_init_vs_cso(const struct ilo_dev *dev,
- const struct ilo_shader_state *vs,
- struct ilo_shader_cso *cso)
-{
- int start_grf, vue_read_len, sampler_count, max_threads;
- uint32_t dw2, dw4, dw5;
-
- ILO_DEV_ASSERT(dev, 6, 8);
-
- start_grf = ilo_shader_get_kernel_param(vs, ILO_KERNEL_URB_DATA_START_REG);
- vue_read_len = ilo_shader_get_kernel_param(vs, ILO_KERNEL_INPUT_COUNT);
- sampler_count = ilo_shader_get_kernel_param(vs, ILO_KERNEL_SAMPLER_COUNT);
-
- /*
- * From the Sandy Bridge PRM, volume 2 part 1, page 135:
- *
- * "(Vertex URB Entry Read Length) Specifies the number of pairs of
- * 128-bit vertex elements to be passed into the payload for each
- * vertex."
- *
- * "It is UNDEFINED to set this field to 0 indicating no Vertex URB
- * data to be read and passed to the thread."
- */
- vue_read_len = (vue_read_len + 1) / 2;
- if (!vue_read_len)
- vue_read_len = 1;
-
- max_threads = dev->thread_count;
- if (ilo_dev_gen(dev) == ILO_GEN(7.5) && dev->gt == 2)
- max_threads *= 2;
-
- dw2 = (true) ? 0 : GEN6_THREADDISP_FP_MODE_ALT;
- dw2 |= ((sampler_count + 3) / 4) << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT;
-
- dw4 = start_grf << GEN6_VS_DW4_URB_GRF_START__SHIFT |
- vue_read_len << GEN6_VS_DW4_URB_READ_LEN__SHIFT |
- 0 << GEN6_VS_DW4_URB_READ_OFFSET__SHIFT;
-
- dw5 = GEN6_VS_DW5_STATISTICS |
- GEN6_VS_DW5_VS_ENABLE;
-
- if (ilo_dev_gen(dev) >= ILO_GEN(7.5))
- dw5 |= (max_threads - 1) << GEN75_VS_DW5_MAX_THREADS__SHIFT;
- else
- dw5 |= (max_threads - 1) << GEN6_VS_DW5_MAX_THREADS__SHIFT;
-
- STATIC_ASSERT(Elements(cso->payload) >= 3);
- cso->payload[0] = dw2;
- cso->payload[1] = dw4;
- cso->payload[2] = dw5;
-}
-
-static void
-gs_init_cso_gen6(const struct ilo_dev *dev,
- const struct ilo_shader_state *gs,
- struct ilo_shader_cso *cso)
-{
- int start_grf, vue_read_len, max_threads;
- uint32_t dw2, dw4, dw5, dw6;
-
- ILO_DEV_ASSERT(dev, 6, 6);
-
- if (ilo_shader_get_type(gs) == PIPE_SHADER_GEOMETRY) {
- start_grf = ilo_shader_get_kernel_param(gs,
- ILO_KERNEL_URB_DATA_START_REG);
-
- vue_read_len = ilo_shader_get_kernel_param(gs, ILO_KERNEL_INPUT_COUNT);
- }
- else {
- start_grf = ilo_shader_get_kernel_param(gs,
- ILO_KERNEL_VS_GEN6_SO_START_REG);
-
- vue_read_len = ilo_shader_get_kernel_param(gs, ILO_KERNEL_OUTPUT_COUNT);
- }
-
- /*
- * From the Sandy Bridge PRM, volume 2 part 1, page 153:
- *
- * "Specifies the amount of URB data read and passed in the thread
- * payload for each Vertex URB entry, in 256-bit register increments.
- *
- * It is UNDEFINED to set this field (Vertex URB Entry Read Length) to
- * 0 indicating no Vertex URB data to be read and passed to the
- * thread."
- */
- vue_read_len = (vue_read_len + 1) / 2;
- if (!vue_read_len)
- vue_read_len = 1;
-
- /*
- * From the Sandy Bridge PRM, volume 2 part 1, page 154:
- *
- * "Maximum Number of Threads valid range is [0,27] when Rendering
- * Enabled bit is set."
- *
- * From the Sandy Bridge PRM, volume 2 part 1, page 173:
- *
- * "Programming Note: If the GS stage is enabled, software must always
- * allocate at least one GS URB Entry. This is true even if the GS
- * thread never needs to output vertices to the pipeline, e.g., when
- * only performing stream output. This is an artifact of the need to
- * pass the GS thread an initial destination URB handle."
- *
- * As such, we always enable rendering, and limit the number of threads.
- */
- if (dev->gt == 2) {
- /* maximum is 60, but limited to 28 */
- max_threads = 28;
- }
- else {
- /* maximum is 24, but limited to 21 (see brwCreateContext()) */
- max_threads = 21;
- }
-
- dw2 = GEN6_THREADDISP_SPF;
-
- dw4 = vue_read_len << GEN6_GS_DW4_URB_READ_LEN__SHIFT |
- 0 << GEN6_GS_DW4_URB_READ_OFFSET__SHIFT |
- start_grf << GEN6_GS_DW4_URB_GRF_START__SHIFT;
-
- dw5 = (max_threads - 1) << GEN6_GS_DW5_MAX_THREADS__SHIFT |
- GEN6_GS_DW5_STATISTICS |
- GEN6_GS_DW5_SO_STATISTICS |
- GEN6_GS_DW5_RENDER_ENABLE;
-
- /*
- * we cannot make use of GEN6_GS_REORDER because it will reorder
- * triangle strips according to D3D rules (triangle 2N+1 uses vertices
- * (2N+1, 2N+3, 2N+2)), instead of GL rules (triangle 2N+1 uses vertices
- * (2N+2, 2N+1, 2N+3)).
- */
- dw6 = GEN6_GS_DW6_GS_ENABLE;
-
- if (ilo_shader_get_kernel_param(gs, ILO_KERNEL_GS_DISCARD_ADJACENCY))
- dw6 |= GEN6_GS_DW6_DISCARD_ADJACENCY;
-
- if (ilo_shader_get_kernel_param(gs, ILO_KERNEL_VS_GEN6_SO)) {
- const uint32_t svbi_post_inc =
- ilo_shader_get_kernel_param(gs, ILO_KERNEL_GS_GEN6_SVBI_POST_INC);
-
- dw6 |= GEN6_GS_DW6_SVBI_PAYLOAD_ENABLE;
- if (svbi_post_inc) {
- dw6 |= GEN6_GS_DW6_SVBI_POST_INC_ENABLE |
- svbi_post_inc << GEN6_GS_DW6_SVBI_POST_INC_VAL__SHIFT;
- }
- }
-
- STATIC_ASSERT(Elements(cso->payload) >= 4);
- cso->payload[0] = dw2;
- cso->payload[1] = dw4;
- cso->payload[2] = dw5;
- cso->payload[3] = dw6;
-}
-
-static void
-gs_init_cso_gen7(const struct ilo_dev *dev,
- const struct ilo_shader_state *gs,
- struct ilo_shader_cso *cso)
-{
- int start_grf, vue_read_len, sampler_count, max_threads;
- uint32_t dw2, dw4, dw5;
-
- ILO_DEV_ASSERT(dev, 7, 7.5);
-
- start_grf = ilo_shader_get_kernel_param(gs, ILO_KERNEL_URB_DATA_START_REG);
- vue_read_len = ilo_shader_get_kernel_param(gs, ILO_KERNEL_INPUT_COUNT);
- sampler_count = ilo_shader_get_kernel_param(gs, ILO_KERNEL_SAMPLER_COUNT);
-
- /* in pairs */
- vue_read_len = (vue_read_len + 1) / 2;
-
- switch (ilo_dev_gen(dev)) {
- case ILO_GEN(7.5):
- max_threads = (dev->gt >= 2) ? 256 : 70;
- break;
- case ILO_GEN(7):
- max_threads = (dev->gt == 2) ? 128 : 36;
- break;
- default:
- max_threads = 1;
- break;
- }
-
- dw2 = (true) ? 0 : GEN6_THREADDISP_FP_MODE_ALT;
- dw2 |= ((sampler_count + 3) / 4) << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT;
-
- dw4 = vue_read_len << GEN7_GS_DW4_URB_READ_LEN__SHIFT |
- GEN7_GS_DW4_INCLUDE_VERTEX_HANDLES |
- 0 << GEN7_GS_DW4_URB_READ_OFFSET__SHIFT |
- start_grf << GEN7_GS_DW4_URB_GRF_START__SHIFT;
-
- dw5 = (max_threads - 1) << GEN7_GS_DW5_MAX_THREADS__SHIFT |
- GEN7_GS_DW5_STATISTICS |
- GEN7_GS_DW5_GS_ENABLE;
-
- STATIC_ASSERT(Elements(cso->payload) >= 3);
- cso->payload[0] = dw2;
- cso->payload[1] = dw4;
- cso->payload[2] = dw5;
-}
-
-void
-ilo_gpe_init_gs_cso(const struct ilo_dev *dev,
- const struct ilo_shader_state *gs,
- struct ilo_shader_cso *cso)
-{
- if (ilo_dev_gen(dev) >= ILO_GEN(7))
- gs_init_cso_gen7(dev, gs, cso);
- else
- gs_init_cso_gen6(dev, gs, cso);
-}
-
-static void
-view_init_null_gen6(const struct ilo_dev *dev,
- unsigned width, unsigned height,
- unsigned depth, unsigned level,
- struct ilo_view_surface *surf)
-{
- uint32_t *dw;
-
- ILO_DEV_ASSERT(dev, 6, 6);
-
- assert(width >= 1 && height >= 1 && depth >= 1);
-
- /*
- * From the Sandy Bridge PRM, volume 4 part 1, page 71:
- *
- * "A null surface will be used in instances where an actual surface is
- * not bound. When a write message is generated to a null surface, no
- * actual surface is written to. When a read message (including any
- * sampling engine message) is generated to a null surface, the result
- * is all zeros. Note that a null surface type is allowed to be used
- * with all messages, even if it is not specificially indicated as
- * supported. All of the remaining fields in surface state are ignored
- * for null surfaces, with the following exceptions:
- *
- * * [DevSNB+]: Width, Height, Depth, and LOD fields must match the
- * depth buffer's corresponding state for all render target
- * surfaces, including null.
- * * Surface Format must be R8G8B8A8_UNORM."
- *
- * From the Sandy Bridge PRM, volume 4 part 1, page 82:
- *
- * "If Surface Type is SURFTYPE_NULL, this field (Tiled Surface) must be
- * true"
- */
-
- STATIC_ASSERT(Elements(surf->payload) >= 6);
- dw = surf->payload;
-
- dw[0] = GEN6_SURFTYPE_NULL << GEN6_SURFACE_DW0_TYPE__SHIFT |
- GEN6_FORMAT_B8G8R8A8_UNORM << GEN6_SURFACE_DW0_FORMAT__SHIFT;
-
- dw[1] = 0;
-
- dw[2] = (height - 1) << GEN6_SURFACE_DW2_HEIGHT__SHIFT |
- (width - 1) << GEN6_SURFACE_DW2_WIDTH__SHIFT |
- level << GEN6_SURFACE_DW2_MIP_COUNT_LOD__SHIFT;
-
- dw[3] = (depth - 1) << GEN6_SURFACE_DW3_DEPTH__SHIFT |
- GEN6_TILING_X;
-
- dw[4] = 0;
- dw[5] = 0;
-}
-
-static void
-view_init_for_buffer_gen6(const struct ilo_dev *dev,
- const struct ilo_buffer *buf,
- unsigned offset, unsigned size,
- unsigned struct_size,
- enum pipe_format elem_format,
- bool is_rt, bool render_cache_rw,
- struct ilo_view_surface *surf)
-{
- const int elem_size = util_format_get_blocksize(elem_format);
- int width, height, depth, pitch;
- int surface_format, num_entries;
- uint32_t *dw;
-
- ILO_DEV_ASSERT(dev, 6, 6);
-
- /*
- * For SURFTYPE_BUFFER, a SURFACE_STATE specifies an element of a
- * structure in a buffer.
- */
-
- surface_format = ilo_format_translate_color(dev, elem_format);
-
- num_entries = size / struct_size;
- /* see if there is enough space to fit another element */
- if (size % struct_size >= elem_size)
- num_entries++;
-
- /*
- * From the Sandy Bridge PRM, volume 4 part 1, page 76:
- *
- * "For SURFTYPE_BUFFER render targets, this field (Surface Base
- * Address) specifies the base address of first element of the
- * surface. The surface is interpreted as a simple array of that
- * single element type. The address must be naturally-aligned to the
- * element size (e.g., a buffer containing R32G32B32A32_FLOAT elements
- * must be 16-byte aligned).
- *
- * For SURFTYPE_BUFFER non-rendertarget surfaces, this field specifies
- * the base address of the first element of the surface, computed in
- * software by adding the surface base address to the byte offset of
- * the element in the buffer."
- */
- if (is_rt)
- assert(offset % elem_size == 0);
-
- /*
- * From the Sandy Bridge PRM, volume 4 part 1, page 77:
- *
- * "For buffer surfaces, the number of entries in the buffer ranges
- * from 1 to 2^27."
- */
- assert(num_entries >= 1 && num_entries <= 1 << 27);
-
- /*
- * From the Sandy Bridge PRM, volume 4 part 1, page 81:
- *
- * "For surfaces of type SURFTYPE_BUFFER, this field (Surface Pitch)
- * indicates the size of the structure."
- */
- pitch = struct_size;
-
- pitch--;
- num_entries--;
- /* bits [6:0] */
- width = (num_entries & 0x0000007f);
- /* bits [19:7] */
- height = (num_entries & 0x000fff80) >> 7;
- /* bits [26:20] */
- depth = (num_entries & 0x07f00000) >> 20;
-
- STATIC_ASSERT(Elements(surf->payload) >= 6);
- dw = surf->payload;
-
- dw[0] = GEN6_SURFTYPE_BUFFER << GEN6_SURFACE_DW0_TYPE__SHIFT |
- surface_format << GEN6_SURFACE_DW0_FORMAT__SHIFT;
- if (render_cache_rw)
- dw[0] |= GEN6_SURFACE_DW0_RENDER_CACHE_RW;
-
- dw[1] = offset;
-
- dw[2] = height << GEN6_SURFACE_DW2_HEIGHT__SHIFT |
- width << GEN6_SURFACE_DW2_WIDTH__SHIFT;
-
- dw[3] = depth << GEN6_SURFACE_DW3_DEPTH__SHIFT |
- pitch << GEN6_SURFACE_DW3_PITCH__SHIFT;
-
- dw[4] = 0;
- dw[5] = 0;
-}
-
-static void
-view_init_for_image_gen6(const struct ilo_dev *dev,
- const struct ilo_image *img,
- enum pipe_texture_target target,
- enum pipe_format format,
- unsigned first_level,
- unsigned num_levels,
- unsigned first_layer,
- unsigned num_layers,
- bool is_rt,
- struct ilo_view_surface *surf)
-{
- int surface_type, surface_format;
- int width, height, depth, pitch, lod;
- uint32_t *dw;
-
- ILO_DEV_ASSERT(dev, 6, 6);
-
- surface_type = ilo_gpe_gen6_translate_texture(target);
- assert(surface_type != GEN6_SURFTYPE_BUFFER);
-
- if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT && img->separate_stencil)
- format = PIPE_FORMAT_Z32_FLOAT;
-
- if (is_rt)
- surface_format = ilo_format_translate_render(dev, format);
- else
- surface_format = ilo_format_translate_texture(dev, format);
- assert(surface_format >= 0);
-
- width = img->width0;
- height = img->height0;
- depth = (target == PIPE_TEXTURE_3D) ? img->depth0 : num_layers;
- pitch = img->bo_stride;
-
- if (surface_type == GEN6_SURFTYPE_CUBE) {
- /*
- * From the Sandy Bridge PRM, volume 4 part 1, page 81:
- *
- * "For SURFTYPE_CUBE: [DevSNB+]: for Sampling Engine Surfaces, the
- * range of this field (Depth) is [0,84], indicating the number of
- * cube array elements (equal to the number of underlying 2D array
- * elements divided by 6). For other surfaces, this field must be
- * zero."
- *
- * When is_rt is true, we treat the texture as a 2D one to avoid the
- * restriction.
- */
- if (is_rt) {
- surface_type = GEN6_SURFTYPE_2D;
- }
- else {
- assert(num_layers % 6 == 0);
- depth = num_layers / 6;
- }
- }
-
- /* sanity check the size */
- assert(width >= 1 && height >= 1 && depth >= 1 && pitch >= 1);
- switch (surface_type) {
- case GEN6_SURFTYPE_1D:
- assert(width <= 8192 && height == 1 && depth <= 512);
- assert(first_layer < 512 && num_layers <= 512);
- break;
- case GEN6_SURFTYPE_2D:
- assert(width <= 8192 && height <= 8192 && depth <= 512);
- assert(first_layer < 512 && num_layers <= 512);
- break;
- case GEN6_SURFTYPE_3D:
- assert(width <= 2048 && height <= 2048 && depth <= 2048);
- assert(first_layer < 2048 && num_layers <= 512);
- if (!is_rt)
- assert(first_layer == 0);
- break;
- case GEN6_SURFTYPE_CUBE:
- assert(width <= 8192 && height <= 8192 && depth <= 85);
- assert(width == height);
- assert(first_layer < 512 && num_layers <= 512);
- if (is_rt)
- assert(first_layer == 0);
- break;
- default:
- assert(!"unexpected surface type");
- break;
- }
-
- /* non-full array spacing is supported only on GEN7+ */
- assert(img->walk != ILO_IMAGE_WALK_LOD);
- /* non-interleaved samples are supported only on GEN7+ */
- if (img->sample_count > 1)
- assert(img->interleaved_samples);
-
- if (is_rt) {
- assert(num_levels == 1);
- lod = first_level;
- }
- else {
- lod = num_levels - 1;
- }
-
- /*
- * From the Sandy Bridge PRM, volume 4 part 1, page 76:
- *
- * "Linear render target surface base addresses must be element-size
- * aligned, for non-YUV surface formats, or a multiple of 2
- * element-sizes for YUV surface formats. Other linear surfaces have
- * no alignment requirements (byte alignment is sufficient.)"
- *
- * From the Sandy Bridge PRM, volume 4 part 1, page 81:
- *
- * "For linear render target surfaces, the pitch must be a multiple
- * of the element size for non-YUV surface formats. Pitch must be a
- * multiple of 2 * element size for YUV surface formats."
- *
- * From the Sandy Bridge PRM, volume 4 part 1, page 86:
- *
- * "For linear surfaces, this field (X Offset) must be zero"
- */
- if (img->tiling == GEN6_TILING_NONE) {
- if (is_rt) {
- const int elem_size = util_format_get_blocksize(format);
- assert(pitch % elem_size == 0);
- }
- }
-
- STATIC_ASSERT(Elements(surf->payload) >= 6);
- dw = surf->payload;
-
- dw[0] = surface_type << GEN6_SURFACE_DW0_TYPE__SHIFT |
- surface_format << GEN6_SURFACE_DW0_FORMAT__SHIFT |
- GEN6_SURFACE_DW0_MIPLAYOUT_BELOW;
-
- if (surface_type == GEN6_SURFTYPE_CUBE && !is_rt) {
- dw[0] |= 1 << 9 |
- GEN6_SURFACE_DW0_CUBE_FACE_ENABLES__MASK;
- }
-
- if (is_rt)
- dw[0] |= GEN6_SURFACE_DW0_RENDER_CACHE_RW;
-
- dw[1] = 0;
-
- dw[2] = (height - 1) << GEN6_SURFACE_DW2_HEIGHT__SHIFT |
- (width - 1) << GEN6_SURFACE_DW2_WIDTH__SHIFT |
- lod << GEN6_SURFACE_DW2_MIP_COUNT_LOD__SHIFT;
-
- assert(img->tiling != GEN8_TILING_W);
- dw[3] = (depth - 1) << GEN6_SURFACE_DW3_DEPTH__SHIFT |
- (pitch - 1) << GEN6_SURFACE_DW3_PITCH__SHIFT |
- img->tiling;
-
- dw[4] = first_level << GEN6_SURFACE_DW4_MIN_LOD__SHIFT |
- first_layer << 17 |
- (num_layers - 1) << 8 |
- ((img->sample_count > 1) ? GEN6_SURFACE_DW4_MULTISAMPLECOUNT_4 :
- GEN6_SURFACE_DW4_MULTISAMPLECOUNT_1);
-
- dw[5] = 0;
-
- assert(img->align_j == 2 || img->align_j == 4);
- if (img->align_j == 4)
- dw[5] |= GEN6_SURFACE_DW5_VALIGN_4;
-}
-
-static void
-view_init_null_gen7(const struct ilo_dev *dev,
- unsigned width, unsigned height,
- unsigned depth, unsigned level,
- struct ilo_view_surface *surf)
-{
- uint32_t *dw;
-
- ILO_DEV_ASSERT(dev, 7, 8);
-
- assert(width >= 1 && height >= 1 && depth >= 1);
-
- /*
- * From the Ivy Bridge PRM, volume 4 part 1, page 62:
- *
- * "A null surface is used in instances where an actual surface is not
- * bound. When a write message is generated to a null surface, no
- * actual surface is written to. When a read message (including any
- * sampling engine message) is generated to a null surface, the result
- * is all zeros. Note that a null surface type is allowed to be used
- * with all messages, even if it is not specificially indicated as
- * supported. All of the remaining fields in surface state are ignored
- * for null surfaces, with the following exceptions:
- *
- * * Width, Height, Depth, LOD, and Render Target View Extent fields
- * must match the depth buffer's corresponding state for all render
- * target surfaces, including null.
- * * All sampling engine and data port messages support null surfaces
- * with the above behavior, even if not mentioned as specifically
- * supported, except for the following:
- * * Data Port Media Block Read/Write messages.
- * * The Surface Type of a surface used as a render target (accessed
- * via the Data Port's Render Target Write message) must be the same
- * as the Surface Type of all other render targets and of the depth
- * buffer (defined in 3DSTATE_DEPTH_BUFFER), unless either the depth
- * buffer or render targets are SURFTYPE_NULL."
- *
- * From the Ivy Bridge PRM, volume 4 part 1, page 65:
- *
- * "If Surface Type is SURFTYPE_NULL, this field (Tiled Surface) must be
- * true"
- */
-
- STATIC_ASSERT(Elements(surf->payload) >= 13);
- dw = surf->payload;
-
- dw[0] = GEN6_SURFTYPE_NULL << GEN7_SURFACE_DW0_TYPE__SHIFT |
- GEN6_FORMAT_B8G8R8A8_UNORM << GEN7_SURFACE_DW0_FORMAT__SHIFT;
-
- if (ilo_dev_gen(dev) >= ILO_GEN(8))
- dw[0] |= GEN6_TILING_X << GEN8_SURFACE_DW0_TILING__SHIFT;
- else
- dw[0] |= GEN6_TILING_X << GEN7_SURFACE_DW0_TILING__SHIFT;
-
- dw[1] = 0;
-
- dw[2] = GEN_SHIFT32(height - 1, GEN7_SURFACE_DW2_HEIGHT) |
- GEN_SHIFT32(width - 1, GEN7_SURFACE_DW2_WIDTH);
-
- dw[3] = GEN_SHIFT32(depth - 1, GEN7_SURFACE_DW3_DEPTH);
-
- dw[4] = 0;
- dw[5] = level;
-
- dw[6] = 0;
- dw[7] = 0;
-
- if (ilo_dev_gen(dev) >= ILO_GEN(8))
- memset(&dw[8], 0, sizeof(*dw) * (13 - 8));
-}
-
-static void
-view_init_for_buffer_gen7(const struct ilo_dev *dev,
- const struct ilo_buffer *buf,
- unsigned offset, unsigned size,
- unsigned struct_size,
- enum pipe_format elem_format,
- bool is_rt, bool render_cache_rw,
- struct ilo_view_surface *surf)
-{
- const bool typed = (elem_format != PIPE_FORMAT_NONE);
- const bool structured = (!typed && struct_size > 1);
- const int elem_size = (typed) ?
- util_format_get_blocksize(elem_format) : 1;
- int width, height, depth, pitch;
- int surface_type, surface_format, num_entries;
- uint32_t *dw;
-
- ILO_DEV_ASSERT(dev, 7, 8);
-
- surface_type = (structured) ? GEN7_SURFTYPE_STRBUF : GEN6_SURFTYPE_BUFFER;
-
- surface_format = (typed) ?
- ilo_format_translate_color(dev, elem_format) : GEN6_FORMAT_RAW;
-
- num_entries = size / struct_size;
- /* see if there is enough space to fit another element */
- if (size % struct_size >= elem_size && !structured)
- num_entries++;
-
- /*
- * From the Ivy Bridge PRM, volume 4 part 1, page 67:
- *
- * "For SURFTYPE_BUFFER render targets, this field (Surface Base
- * Address) specifies the base address of first element of the
- * surface. The surface is interpreted as a simple array of that
- * single element type. The address must be naturally-aligned to the
- * element size (e.g., a buffer containing R32G32B32A32_FLOAT elements
- * must be 16-byte aligned)
- *
- * For SURFTYPE_BUFFER non-rendertarget surfaces, this field specifies
- * the base address of the first element of the surface, computed in
- * software by adding the surface base address to the byte offset of
- * the element in the buffer."
- */
- if (is_rt)
- assert(offset % elem_size == 0);
-
- /*
- * From the Ivy Bridge PRM, volume 4 part 1, page 68:
- *
- * "For typed buffer and structured buffer surfaces, the number of
- * entries in the buffer ranges from 1 to 2^27. For raw buffer
- * surfaces, the number of entries in the buffer is the number of
- * bytes which can range from 1 to 2^30."
- */
- assert(num_entries >= 1 &&
- num_entries <= 1 << ((typed || structured) ? 27 : 30));
-
- /*
- * From the Ivy Bridge PRM, volume 4 part 1, page 69:
- *
- * "For SURFTYPE_BUFFER: The low two bits of this field (Width) must be
- * 11 if the Surface Format is RAW (the size of the buffer must be a
- * multiple of 4 bytes)."
- *
- * From the Ivy Bridge PRM, volume 4 part 1, page 70:
- *
- * "For surfaces of type SURFTYPE_BUFFER and SURFTYPE_STRBUF, this
- * field (Surface Pitch) indicates the size of the structure."
- *
- * "For linear surfaces with Surface Type of SURFTYPE_STRBUF, the pitch
- * must be a multiple of 4 bytes."
- */
- if (structured)
- assert(struct_size % 4 == 0);
- else if (!typed)
- assert(num_entries % 4 == 0);
-
- pitch = struct_size;
-
- pitch--;
- num_entries--;
- /* bits [6:0] */
- width = (num_entries & 0x0000007f);
- /* bits [20:7] */
- height = (num_entries & 0x001fff80) >> 7;
- /* bits [30:21] */
- depth = (num_entries & 0x7fe00000) >> 21;
- /* limit to [26:21] */
- if (typed || structured)
- depth &= 0x3f;
-
- STATIC_ASSERT(Elements(surf->payload) >= 13);
- dw = surf->payload;
-
- dw[0] = surface_type << GEN7_SURFACE_DW0_TYPE__SHIFT |
- surface_format << GEN7_SURFACE_DW0_FORMAT__SHIFT;
- if (render_cache_rw)
- dw[0] |= GEN7_SURFACE_DW0_RENDER_CACHE_RW;
-
- if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
- dw[8] = offset;
- memset(&dw[9], 0, sizeof(*dw) * (13 - 9));
- } else {
- dw[1] = offset;
- }
-
- dw[2] = GEN_SHIFT32(height, GEN7_SURFACE_DW2_HEIGHT) |
- GEN_SHIFT32(width, GEN7_SURFACE_DW2_WIDTH);
-
- dw[3] = GEN_SHIFT32(depth, GEN7_SURFACE_DW3_DEPTH) |
- pitch;
-
- dw[4] = 0;
- dw[5] = 0;
-
- dw[6] = 0;
- dw[7] = 0;
-
- if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) {
- dw[7] |= GEN_SHIFT32(GEN75_SCS_RED, GEN75_SURFACE_DW7_SCS_R) |
- GEN_SHIFT32(GEN75_SCS_GREEN, GEN75_SURFACE_DW7_SCS_G) |
- GEN_SHIFT32(GEN75_SCS_BLUE, GEN75_SURFACE_DW7_SCS_B) |
- GEN_SHIFT32(GEN75_SCS_ALPHA, GEN75_SURFACE_DW7_SCS_A);
- }
-}
-
-static void
-view_init_for_image_gen7(const struct ilo_dev *dev,
- const struct ilo_image *img,
- enum pipe_texture_target target,
- enum pipe_format format,
- unsigned first_level,
- unsigned num_levels,
- unsigned first_layer,
- unsigned num_layers,
- bool is_rt,
- struct ilo_view_surface *surf)
-{
- int surface_type, surface_format;
- int width, height, depth, pitch, lod;
- uint32_t *dw;
-
- ILO_DEV_ASSERT(dev, 7, 8);
-
- surface_type = ilo_gpe_gen6_translate_texture(target);
- assert(surface_type != GEN6_SURFTYPE_BUFFER);
-
- if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT && img->separate_stencil)
- format = PIPE_FORMAT_Z32_FLOAT;
-
- if (is_rt)
- surface_format = ilo_format_translate_render(dev, format);
- else
- surface_format = ilo_format_translate_texture(dev, format);
- assert(surface_format >= 0);
-
- width = img->width0;
- height = img->height0;
- depth = (target == PIPE_TEXTURE_3D) ? img->depth0 : num_layers;
- pitch = img->bo_stride;
-
- if (surface_type == GEN6_SURFTYPE_CUBE) {
- /*
- * From the Ivy Bridge PRM, volume 4 part 1, page 70:
- *
- * "For SURFTYPE_CUBE:For Sampling Engine Surfaces, the range of
- * this field is [0,340], indicating the number of cube array
- * elements (equal to the number of underlying 2D array elements
- * divided by 6). For other surfaces, this field must be zero."
- *
- * When is_rt is true, we treat the texture as a 2D one to avoid the
- * restriction.
- */
- if (is_rt) {
- surface_type = GEN6_SURFTYPE_2D;
- }
- else {
- assert(num_layers % 6 == 0);
- depth = num_layers / 6;
- }
- }
-
- /* sanity check the size */
- assert(width >= 1 && height >= 1 && depth >= 1 && pitch >= 1);
- assert(first_layer < 2048 && num_layers <= 2048);
- switch (surface_type) {
- case GEN6_SURFTYPE_1D:
- assert(width <= 16384 && height == 1 && depth <= 2048);
- break;
- case GEN6_SURFTYPE_2D:
- assert(width <= 16384 && height <= 16384 && depth <= 2048);
- break;
- case GEN6_SURFTYPE_3D:
- assert(width <= 2048 && height <= 2048 && depth <= 2048);
- if (!is_rt)
- assert(first_layer == 0);
- break;
- case GEN6_SURFTYPE_CUBE:
- assert(width <= 16384 && height <= 16384 && depth <= 86);
- assert(width == height);
- if (is_rt)
- assert(first_layer == 0);
- break;
- default:
- assert(!"unexpected surface type");
- break;
- }
-
- if (is_rt) {
- assert(num_levels == 1);
- lod = first_level;
- }
- else {
- lod = num_levels - 1;
- }
-
- /*
- * From the Ivy Bridge PRM, volume 4 part 1, page 68:
- *
- * "The Base Address for linear render target surfaces and surfaces
- * accessed with the typed surface read/write data port messages must
- * be element-size aligned, for non-YUV surface formats, or a multiple
- * of 2 element-sizes for YUV surface formats. Other linear surfaces
- * have no alignment requirements (byte alignment is sufficient)."
- *
- * From the Ivy Bridge PRM, volume 4 part 1, page 70:
- *
- * "For linear render target surfaces and surfaces accessed with the
- * typed data port messages, the pitch must be a multiple of the
- * element size for non-YUV surface formats. Pitch must be a multiple
- * of 2 * element size for YUV surface formats. For linear surfaces
- * with Surface Type of SURFTYPE_STRBUF, the pitch must be a multiple
- * of 4 bytes.For other linear surfaces, the pitch can be any multiple
- * of bytes."
- *
- * From the Ivy Bridge PRM, volume 4 part 1, page 74:
- *
- * "For linear surfaces, this field (X Offset) must be zero."
- */
- if (img->tiling == GEN6_TILING_NONE) {
- if (is_rt) {
- const int elem_size = util_format_get_blocksize(format);
- assert(pitch % elem_size == 0);
- }
- }
-
- STATIC_ASSERT(Elements(surf->payload) >= 13);
- dw = surf->payload;
-
- dw[0] = surface_type << GEN7_SURFACE_DW0_TYPE__SHIFT |
- surface_format << GEN7_SURFACE_DW0_FORMAT__SHIFT;
-
- /*
- * From the Ivy Bridge PRM, volume 4 part 1, page 63:
- *
- * "If this field (Surface Array) is enabled, the Surface Type must be
- * SURFTYPE_1D, SURFTYPE_2D, or SURFTYPE_CUBE. If this field is
- * disabled and Surface Type is SURFTYPE_1D, SURFTYPE_2D, or
- * SURFTYPE_CUBE, the Depth field must be set to zero."
- *
- * For non-3D sampler surfaces, resinfo (the sampler message) always
- * returns zero for the number of layers when this field is not set.
- */
- if (surface_type != GEN6_SURFTYPE_3D) {
- switch (target) {
- case PIPE_TEXTURE_1D_ARRAY:
- case PIPE_TEXTURE_2D_ARRAY:
- case PIPE_TEXTURE_CUBE_ARRAY:
- dw[0] |= GEN7_SURFACE_DW0_IS_ARRAY;
- break;
- default:
- assert(depth == 1);
- break;
- }
- }
-
- if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
- switch (img->align_j) {
- case 4:
- dw[0] |= GEN7_SURFACE_DW0_VALIGN_4;
- break;
- case 8:
- dw[0] |= GEN8_SURFACE_DW0_VALIGN_8;
- break;
- case 16:
- dw[0] |= GEN8_SURFACE_DW0_VALIGN_16;
- break;
- default:
- assert(!"unsupported valign");
- break;
- }
-
- switch (img->align_i) {
- case 4:
- dw[0] |= GEN8_SURFACE_DW0_HALIGN_4;
- break;
- case 8:
- dw[0] |= GEN8_SURFACE_DW0_HALIGN_8;
- break;
- case 16:
- dw[0] |= GEN8_SURFACE_DW0_HALIGN_16;
- break;
- default:
- assert(!"unsupported halign");
- break;
- }
-
- dw[0] |= img->tiling << GEN8_SURFACE_DW0_TILING__SHIFT;
- } else {
- assert(img->align_i == 4 || img->align_i == 8);
- assert(img->align_j == 2 || img->align_j == 4);
-
- if (img->align_j == 4)
- dw[0] |= GEN7_SURFACE_DW0_VALIGN_4;
-
- if (img->align_i == 8)
- dw[0] |= GEN7_SURFACE_DW0_HALIGN_8;
-
- assert(img->tiling != GEN8_TILING_W);
- dw[0] |= img->tiling << GEN7_SURFACE_DW0_TILING__SHIFT;
-
- if (img->walk == ILO_IMAGE_WALK_LOD)
- dw[0] |= GEN7_SURFACE_DW0_ARYSPC_LOD0;
- else
- dw[0] |= GEN7_SURFACE_DW0_ARYSPC_FULL;
- }
-
- if (is_rt)
- dw[0] |= GEN7_SURFACE_DW0_RENDER_CACHE_RW;
-
- if (surface_type == GEN6_SURFTYPE_CUBE && !is_rt)
- dw[0] |= GEN7_SURFACE_DW0_CUBE_FACE_ENABLES__MASK;
-
- if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
- assert(img->walk_layer_height % 4 == 0);
- dw[1] = img->walk_layer_height / 4;
- } else {
- dw[1] = 0;
- }
-
- dw[2] = GEN_SHIFT32(height - 1, GEN7_SURFACE_DW2_HEIGHT) |
- GEN_SHIFT32(width - 1, GEN7_SURFACE_DW2_WIDTH);
-
- dw[3] = GEN_SHIFT32(depth - 1, GEN7_SURFACE_DW3_DEPTH) |
- (pitch - 1);
-
- dw[4] = first_layer << 18 |
- (num_layers - 1) << 7;
-
- /*
- * MSFMT_MSS means the samples are not interleaved and MSFMT_DEPTH_STENCIL
- * means the samples are interleaved. The layouts are the same when the
- * number of samples is 1.
- */
- if (img->interleaved_samples && img->sample_count > 1) {
- assert(!is_rt);
- dw[4] |= GEN7_SURFACE_DW4_MSFMT_DEPTH_STENCIL;
- }
- else {
- dw[4] |= GEN7_SURFACE_DW4_MSFMT_MSS;
- }
-
- switch (img->sample_count) {
- case 0:
- case 1:
- default:
- dw[4] |= GEN7_SURFACE_DW4_MULTISAMPLECOUNT_1;
- break;
- case 2:
- dw[4] |= GEN8_SURFACE_DW4_MULTISAMPLECOUNT_2;
- break;
- case 4:
- dw[4] |= GEN7_SURFACE_DW4_MULTISAMPLECOUNT_4;
- break;
- case 8:
- dw[4] |= GEN7_SURFACE_DW4_MULTISAMPLECOUNT_8;
- break;
- case 16:
- dw[4] |= GEN8_SURFACE_DW4_MULTISAMPLECOUNT_16;
- break;
- }
-
- dw[5] = GEN_SHIFT32(first_level, GEN7_SURFACE_DW5_MIN_LOD) |
- lod;
-
- dw[6] = 0;
- dw[7] = 0;
-
- if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) {
- dw[7] |= GEN_SHIFT32(GEN75_SCS_RED, GEN75_SURFACE_DW7_SCS_R) |
- GEN_SHIFT32(GEN75_SCS_GREEN, GEN75_SURFACE_DW7_SCS_G) |
- GEN_SHIFT32(GEN75_SCS_BLUE, GEN75_SURFACE_DW7_SCS_B) |
- GEN_SHIFT32(GEN75_SCS_ALPHA, GEN75_SURFACE_DW7_SCS_A);
- }
-
- if (ilo_dev_gen(dev) >= ILO_GEN(8))
- memset(&dw[8], 0, sizeof(*dw) * (13 - 8));
-}
-
-void
-ilo_gpe_init_view_surface_null(const struct ilo_dev *dev,
- unsigned width, unsigned height,
- unsigned depth, unsigned level,
- struct ilo_view_surface *surf)
-{
- if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
- view_init_null_gen7(dev,
- width, height, depth, level, surf);
- } else {
- view_init_null_gen6(dev,
- width, height, depth, level, surf);
- }
-
- surf->bo = NULL;
- surf->scanout = false;
-}
-
-void
-ilo_gpe_init_view_surface_for_buffer(const struct ilo_dev *dev,
- const struct ilo_buffer *buf,
- unsigned offset, unsigned size,
- unsigned struct_size,
- enum pipe_format elem_format,
- bool is_rt, bool render_cache_rw,
- struct ilo_view_surface *surf)
-{
- if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
- view_init_for_buffer_gen7(dev, buf, offset, size,
- struct_size, elem_format, is_rt, render_cache_rw, surf);
- } else {
- view_init_for_buffer_gen6(dev, buf, offset, size,
- struct_size, elem_format, is_rt, render_cache_rw, surf);
- }
-
- /* do not increment reference count */
- surf->bo = buf->bo;
- surf->scanout = false;
-}
-
-void
-ilo_gpe_init_view_surface_for_image(const struct ilo_dev *dev,
- const struct ilo_image *img,
- enum pipe_texture_target target,
- enum pipe_format format,
- unsigned first_level,
- unsigned num_levels,
- unsigned first_layer,
- unsigned num_layers,
- bool is_rt,
- struct ilo_view_surface *surf)
-{
- if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
- view_init_for_image_gen7(dev, img, target, format,
- first_level, num_levels, first_layer, num_layers,
- is_rt, surf);
- } else {
- view_init_for_image_gen6(dev, img, target, format,
- first_level, num_levels, first_layer, num_layers,
- is_rt, surf);
- }
-
- surf->scanout = img->scanout;
- /* do not increment reference count */
- surf->bo = img->bo;
-}
-
-static void
-sampler_init_border_color_gen6(const struct ilo_dev *dev,
- const union pipe_color_union *color,
- uint32_t *dw, int num_dwords)
-{
- float rgba[4] = {
- color->f[0], color->f[1], color->f[2], color->f[3],
- };
-
- ILO_DEV_ASSERT(dev, 6, 6);
-
- assert(num_dwords >= 12);
-
- /*
- * This state is not documented in the Sandy Bridge PRM, but in the
- * Ironlake PRM. SNORM8 seems to be in DW11 instead of DW1.
- */
-
- /* IEEE_FP */
- dw[1] = fui(rgba[0]);
- dw[2] = fui(rgba[1]);
- dw[3] = fui(rgba[2]);
- dw[4] = fui(rgba[3]);
-
- /* FLOAT_16 */
- dw[5] = util_float_to_half(rgba[0]) |
- util_float_to_half(rgba[1]) << 16;
- dw[6] = util_float_to_half(rgba[2]) |
- util_float_to_half(rgba[3]) << 16;
-
- /* clamp to [-1.0f, 1.0f] */
- rgba[0] = CLAMP(rgba[0], -1.0f, 1.0f);
- rgba[1] = CLAMP(rgba[1], -1.0f, 1.0f);
- rgba[2] = CLAMP(rgba[2], -1.0f, 1.0f);
- rgba[3] = CLAMP(rgba[3], -1.0f, 1.0f);
-
- /* SNORM16 */
- dw[9] = (int16_t) util_iround(rgba[0] * 32767.0f) |
- (int16_t) util_iround(rgba[1] * 32767.0f) << 16;
- dw[10] = (int16_t) util_iround(rgba[2] * 32767.0f) |
- (int16_t) util_iround(rgba[3] * 32767.0f) << 16;
-
- /* SNORM8 */
- dw[11] = (int8_t) util_iround(rgba[0] * 127.0f) |
- (int8_t) util_iround(rgba[1] * 127.0f) << 8 |
- (int8_t) util_iround(rgba[2] * 127.0f) << 16 |
- (int8_t) util_iround(rgba[3] * 127.0f) << 24;
-
- /* clamp to [0.0f, 1.0f] */
- rgba[0] = CLAMP(rgba[0], 0.0f, 1.0f);
- rgba[1] = CLAMP(rgba[1], 0.0f, 1.0f);
- rgba[2] = CLAMP(rgba[2], 0.0f, 1.0f);
- rgba[3] = CLAMP(rgba[3], 0.0f, 1.0f);
-
- /* UNORM8 */
- dw[0] = (uint8_t) util_iround(rgba[0] * 255.0f) |
- (uint8_t) util_iround(rgba[1] * 255.0f) << 8 |
- (uint8_t) util_iround(rgba[2] * 255.0f) << 16 |
- (uint8_t) util_iround(rgba[3] * 255.0f) << 24;
-
- /* UNORM16 */
- dw[7] = (uint16_t) util_iround(rgba[0] * 65535.0f) |
- (uint16_t) util_iround(rgba[1] * 65535.0f) << 16;
- dw[8] = (uint16_t) util_iround(rgba[2] * 65535.0f) |
- (uint16_t) util_iround(rgba[3] * 65535.0f) << 16;
-}
-
-/**
- * Translate a pipe texture mipfilter to the matching hardware mipfilter.
- */
-static int
-gen6_translate_tex_mipfilter(unsigned filter)
-{
- switch (filter) {
- case PIPE_TEX_MIPFILTER_NEAREST: return GEN6_MIPFILTER_NEAREST;
- case PIPE_TEX_MIPFILTER_LINEAR: return GEN6_MIPFILTER_LINEAR;
- case PIPE_TEX_MIPFILTER_NONE: return GEN6_MIPFILTER_NONE;
- default:
- assert(!"unknown mipfilter");
- return GEN6_MIPFILTER_NONE;
- }
-}
-
-/**
- * Translate a pipe texture filter to the matching hardware mapfilter.
- */
-static int
-gen6_translate_tex_filter(unsigned filter)
-{
- switch (filter) {
- case PIPE_TEX_FILTER_NEAREST: return GEN6_MAPFILTER_NEAREST;
- case PIPE_TEX_FILTER_LINEAR: return GEN6_MAPFILTER_LINEAR;
- default:
- assert(!"unknown sampler filter");
- return GEN6_MAPFILTER_NEAREST;
- }
-}
-
-/**
- * Translate a pipe texture coordinate wrapping mode to the matching hardware
- * wrapping mode.
- */
-static int
-gen6_translate_tex_wrap(unsigned wrap)
-{
- switch (wrap) {
- case PIPE_TEX_WRAP_CLAMP: return GEN8_TEXCOORDMODE_HALF_BORDER;
- case PIPE_TEX_WRAP_REPEAT: return GEN6_TEXCOORDMODE_WRAP;
- case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return GEN6_TEXCOORDMODE_CLAMP;
- case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return GEN6_TEXCOORDMODE_CLAMP_BORDER;
- case PIPE_TEX_WRAP_MIRROR_REPEAT: return GEN6_TEXCOORDMODE_MIRROR;
- case PIPE_TEX_WRAP_MIRROR_CLAMP:
- case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
- case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
- default:
- assert(!"unknown sampler wrap mode");
- return GEN6_TEXCOORDMODE_WRAP;
- }
-}
-
-/**
- * Translate a pipe shadow compare function to the matching hardware shadow
- * function.
- */
-static int
-gen6_translate_shadow_func(unsigned func)
-{
- /*
- * For PIPE_FUNC_x, the reference value is on the left-hand side of the
- * comparison, and 1.0 is returned when the comparison is true.
- *
- * For GEN6_COMPAREFUNCTION_x, the reference value is on the right-hand side of
- * the comparison, and 0.0 is returned when the comparison is true.
- */
- switch (func) {
- case PIPE_FUNC_NEVER: return GEN6_COMPAREFUNCTION_ALWAYS;
- case PIPE_FUNC_LESS: return GEN6_COMPAREFUNCTION_LEQUAL;
- case PIPE_FUNC_EQUAL: return GEN6_COMPAREFUNCTION_NOTEQUAL;
- case PIPE_FUNC_LEQUAL: return GEN6_COMPAREFUNCTION_LESS;
- case PIPE_FUNC_GREATER: return GEN6_COMPAREFUNCTION_GEQUAL;
- case PIPE_FUNC_NOTEQUAL: return GEN6_COMPAREFUNCTION_EQUAL;
- case PIPE_FUNC_GEQUAL: return GEN6_COMPAREFUNCTION_GREATER;
- case PIPE_FUNC_ALWAYS: return GEN6_COMPAREFUNCTION_NEVER;
- default:
- assert(!"unknown shadow compare function");
- return GEN6_COMPAREFUNCTION_NEVER;
- }
-}
-
-void
-ilo_gpe_init_sampler_cso(const struct ilo_dev *dev,
- const struct pipe_sampler_state *state,
- struct ilo_sampler_cso *sampler)
-{
- int mip_filter, min_filter, mag_filter, max_aniso;
- int lod_bias, max_lod, min_lod;
- int wrap_s, wrap_t, wrap_r, wrap_cube;
- uint32_t dw0, dw1, dw3;
-
- ILO_DEV_ASSERT(dev, 6, 8);
-
- memset(sampler, 0, sizeof(*sampler));
-
- mip_filter = gen6_translate_tex_mipfilter(state->min_mip_filter);
- min_filter = gen6_translate_tex_filter(state->min_img_filter);
- mag_filter = gen6_translate_tex_filter(state->mag_img_filter);
-
- sampler->anisotropic = state->max_anisotropy;
-
- if (state->max_anisotropy >= 2 && state->max_anisotropy <= 16)
- max_aniso = state->max_anisotropy / 2 - 1;
- else if (state->max_anisotropy > 16)
- max_aniso = GEN6_ANISORATIO_16;
- else
- max_aniso = GEN6_ANISORATIO_2;
-
- /*
- *
- * Here is how the hardware calculate per-pixel LOD, from my reading of the
- * PRMs:
- *
- * 1) LOD is set to log2(ratio of texels to pixels) if not specified in
- * other ways. The number of texels is measured using level
- * SurfMinLod.
- * 2) Bias is added to LOD.
- * 3) LOD is clamped to [MinLod, MaxLod], and the clamped value is
- * compared with Base to determine whether magnification or
- * minification is needed. (if preclamp is disabled, LOD is compared
- * with Base before clamping)
- * 4) If magnification is needed, or no mipmapping is requested, LOD is
- * set to floor(MinLod).
- * 5) LOD is clamped to [0, MIPCnt], and SurfMinLod is added to LOD.
- *
- * With Gallium interface, Base is always zero and
- * pipe_sampler_view::u.tex.first_level specifies SurfMinLod.
- */
- if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
- const float scale = 256.0f;
-
- /* [-16.0, 16.0) in S4.8 */
- lod_bias = (int)
- (CLAMP(state->lod_bias, -16.0f, 15.9f) * scale);
- lod_bias &= 0x1fff;
-
- /* [0.0, 14.0] in U4.8 */
- max_lod = (int) (CLAMP(state->max_lod, 0.0f, 14.0f) * scale);
- min_lod = (int) (CLAMP(state->min_lod, 0.0f, 14.0f) * scale);
- }
- else {
- const float scale = 64.0f;
-
- /* [-16.0, 16.0) in S4.6 */
- lod_bias = (int)
- (CLAMP(state->lod_bias, -16.0f, 15.9f) * scale);
- lod_bias &= 0x7ff;
-
- /* [0.0, 13.0] in U4.6 */
- max_lod = (int) (CLAMP(state->max_lod, 0.0f, 13.0f) * scale);
- min_lod = (int) (CLAMP(state->min_lod, 0.0f, 13.0f) * scale);
- }
-
- /*
- * We want LOD to be clamped to determine magnification/minification, and
- * get set to zero when it is magnification or when mipmapping is disabled.
- * The hardware would set LOD to floor(MinLod) and that is a problem when
- * MinLod is greater than or equal to 1.0f.
- *
- * With Base being zero, it is always minification when MinLod is non-zero.
- * To achieve our goal, we just need to set MinLod to zero and set
- * MagFilter to MinFilter when mipmapping is disabled.
- */
- if (state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE && min_lod) {
- min_lod = 0;
- mag_filter = min_filter;
- }
-
- /* determine wrap s/t/r */
- wrap_s = gen6_translate_tex_wrap(state->wrap_s);
- wrap_t = gen6_translate_tex_wrap(state->wrap_t);
- wrap_r = gen6_translate_tex_wrap(state->wrap_r);
- if (ilo_dev_gen(dev) < ILO_GEN(8)) {
- /*
- * For nearest filtering, PIPE_TEX_WRAP_CLAMP means
- * PIPE_TEX_WRAP_CLAMP_TO_EDGE; for linear filtering,
- * PIPE_TEX_WRAP_CLAMP means PIPE_TEX_WRAP_CLAMP_TO_BORDER while
- * additionally clamping the texture coordinates to [0.0, 1.0].
- *
- * PIPE_TEX_WRAP_CLAMP is not supported natively until Gen8. The
- * clamping has to be taken care of in the shaders. There are two
-       * filters here, but let the minification one have a say.
- */
- const bool clamp_is_to_edge =
- (state->min_img_filter == PIPE_TEX_FILTER_NEAREST);
-
- if (clamp_is_to_edge) {
- if (wrap_s == GEN8_TEXCOORDMODE_HALF_BORDER)
- wrap_s = GEN6_TEXCOORDMODE_CLAMP;
- if (wrap_t == GEN8_TEXCOORDMODE_HALF_BORDER)
- wrap_t = GEN6_TEXCOORDMODE_CLAMP;
- if (wrap_r == GEN8_TEXCOORDMODE_HALF_BORDER)
- wrap_r = GEN6_TEXCOORDMODE_CLAMP;
- } else {
- if (wrap_s == GEN8_TEXCOORDMODE_HALF_BORDER) {
- wrap_s = GEN6_TEXCOORDMODE_CLAMP_BORDER;
- sampler->saturate_s = true;
- }
- if (wrap_t == GEN8_TEXCOORDMODE_HALF_BORDER) {
- wrap_t = GEN6_TEXCOORDMODE_CLAMP_BORDER;
- sampler->saturate_t = true;
- }
- if (wrap_r == GEN8_TEXCOORDMODE_HALF_BORDER) {
- wrap_r = GEN6_TEXCOORDMODE_CLAMP_BORDER;
- sampler->saturate_r = true;
- }
- }
- }
-
- /*
- * From the Sandy Bridge PRM, volume 4 part 1, page 107:
- *
- * "When using cube map texture coordinates, only TEXCOORDMODE_CLAMP
- * and TEXCOORDMODE_CUBE settings are valid, and each TC component
- * must have the same Address Control mode."
- *
- * From the Ivy Bridge PRM, volume 4 part 1, page 96:
- *
- * "This field (Cube Surface Control Mode) must be set to
- * CUBECTRLMODE_PROGRAMMED"
- *
-    * Therefore, we cannot use "Cube Surface Control Mode" for seamless cube
- * map filtering.
- */
- if (state->seamless_cube_map &&
- (state->min_img_filter != PIPE_TEX_FILTER_NEAREST ||
- state->mag_img_filter != PIPE_TEX_FILTER_NEAREST)) {
- wrap_cube = GEN6_TEXCOORDMODE_CUBE;
- }
- else {
- wrap_cube = GEN6_TEXCOORDMODE_CLAMP;
- }
-
- if (!state->normalized_coords) {
- /*
- * From the Ivy Bridge PRM, volume 4 part 1, page 98:
- *
- * "The following state must be set as indicated if this field
- * (Non-normalized Coordinate Enable) is enabled:
- *
- * - TCX/Y/Z Address Control Mode must be TEXCOORDMODE_CLAMP,
- * TEXCOORDMODE_HALF_BORDER, or TEXCOORDMODE_CLAMP_BORDER.
- * - Surface Type must be SURFTYPE_2D or SURFTYPE_3D.
- * - Mag Mode Filter must be MAPFILTER_NEAREST or
- * MAPFILTER_LINEAR.
- * - Min Mode Filter must be MAPFILTER_NEAREST or
- * MAPFILTER_LINEAR.
- * - Mip Mode Filter must be MIPFILTER_NONE.
- * - Min LOD must be 0.
- * - Max LOD must be 0.
- * - MIP Count must be 0.
- * - Surface Min LOD must be 0.
- * - Texture LOD Bias must be 0."
- */
- assert(wrap_s == GEN6_TEXCOORDMODE_CLAMP ||
- wrap_s == GEN6_TEXCOORDMODE_CLAMP_BORDER);
- assert(wrap_t == GEN6_TEXCOORDMODE_CLAMP ||
- wrap_t == GEN6_TEXCOORDMODE_CLAMP_BORDER);
- assert(wrap_r == GEN6_TEXCOORDMODE_CLAMP ||
- wrap_r == GEN6_TEXCOORDMODE_CLAMP_BORDER);
-
- assert(mag_filter == GEN6_MAPFILTER_NEAREST ||
- mag_filter == GEN6_MAPFILTER_LINEAR);
- assert(min_filter == GEN6_MAPFILTER_NEAREST ||
- min_filter == GEN6_MAPFILTER_LINEAR);
-
- /* work around a bug in util_blitter */
- mip_filter = GEN6_MIPFILTER_NONE;
-
- assert(mip_filter == GEN6_MIPFILTER_NONE);
- }
-
- if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
- dw0 = 1 << 28 |
- mip_filter << 20 |
- lod_bias << 1;
-
- sampler->dw_filter = mag_filter << 17 |
- min_filter << 14;
-
- sampler->dw_filter_aniso = GEN6_MAPFILTER_ANISOTROPIC << 17 |
- GEN6_MAPFILTER_ANISOTROPIC << 14 |
- 1;
-
- dw1 = min_lod << 20 |
- max_lod << 8;
-
- if (state->compare_mode != PIPE_TEX_COMPARE_NONE)
- dw1 |= gen6_translate_shadow_func(state->compare_func) << 1;
-
- dw3 = max_aniso << 19;
-
- /* round the coordinates for linear filtering */
- if (min_filter != GEN6_MAPFILTER_NEAREST) {
- dw3 |= (GEN6_SAMPLER_DW3_U_MIN_ROUND |
- GEN6_SAMPLER_DW3_V_MIN_ROUND |
- GEN6_SAMPLER_DW3_R_MIN_ROUND);
- }
- if (mag_filter != GEN6_MAPFILTER_NEAREST) {
- dw3 |= (GEN6_SAMPLER_DW3_U_MAG_ROUND |
- GEN6_SAMPLER_DW3_V_MAG_ROUND |
- GEN6_SAMPLER_DW3_R_MAG_ROUND);
- }
-
- if (!state->normalized_coords)
- dw3 |= 1 << 10;
-
- sampler->dw_wrap = wrap_s << 6 |
- wrap_t << 3 |
- wrap_r;
-
- /*
- * As noted in the classic i965 driver, the HW may still reference
- * wrap_t and wrap_r for 1D textures. We need to set them to a safe
- * mode
- */
- sampler->dw_wrap_1d = wrap_s << 6 |
- GEN6_TEXCOORDMODE_WRAP << 3 |
- GEN6_TEXCOORDMODE_WRAP;
-
- sampler->dw_wrap_cube = wrap_cube << 6 |
- wrap_cube << 3 |
- wrap_cube;
-
- STATIC_ASSERT(Elements(sampler->payload) >= 7);
-
- sampler->payload[0] = dw0;
- sampler->payload[1] = dw1;
- sampler->payload[2] = dw3;
-
- memcpy(&sampler->payload[3],
- state->border_color.ui, sizeof(state->border_color.ui));
- }
- else {
- dw0 = 1 << 28 |
- mip_filter << 20 |
- lod_bias << 3;
-
- if (state->compare_mode != PIPE_TEX_COMPARE_NONE)
- dw0 |= gen6_translate_shadow_func(state->compare_func);
-
- sampler->dw_filter = (min_filter != mag_filter) << 27 |
- mag_filter << 17 |
- min_filter << 14;
-
- sampler->dw_filter_aniso = GEN6_MAPFILTER_ANISOTROPIC << 17 |
- GEN6_MAPFILTER_ANISOTROPIC << 14;
-
- dw1 = min_lod << 22 |
- max_lod << 12;
-
- sampler->dw_wrap = wrap_s << 6 |
- wrap_t << 3 |
- wrap_r;
-
- sampler->dw_wrap_1d = wrap_s << 6 |
- GEN6_TEXCOORDMODE_WRAP << 3 |
- GEN6_TEXCOORDMODE_WRAP;
-
- sampler->dw_wrap_cube = wrap_cube << 6 |
- wrap_cube << 3 |
- wrap_cube;
-
- dw3 = max_aniso << 19;
-
- /* round the coordinates for linear filtering */
- if (min_filter != GEN6_MAPFILTER_NEAREST) {
- dw3 |= (GEN6_SAMPLER_DW3_U_MIN_ROUND |
- GEN6_SAMPLER_DW3_V_MIN_ROUND |
- GEN6_SAMPLER_DW3_R_MIN_ROUND);
- }
- if (mag_filter != GEN6_MAPFILTER_NEAREST) {
- dw3 |= (GEN6_SAMPLER_DW3_U_MAG_ROUND |
- GEN6_SAMPLER_DW3_V_MAG_ROUND |
- GEN6_SAMPLER_DW3_R_MAG_ROUND);
- }
-
- if (!state->normalized_coords)
- dw3 |= 1;
-
- STATIC_ASSERT(Elements(sampler->payload) >= 15);
-
- sampler->payload[0] = dw0;
- sampler->payload[1] = dw1;
- sampler->payload[2] = dw3;
-
- sampler_init_border_color_gen6(dev,
- &state->border_color, &sampler->payload[3], 12);
- }
-}
diff --git a/src/gallium/drivers/ilo/core/ilo_state_cc.c b/src/gallium/drivers/ilo/core/ilo_state_cc.c
new file mode 100644
index 00000000000..83ee8de979c
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_cc.c
@@ -0,0 +1,890 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2012-2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Chia-I Wu <[email protected]>
+ */
+
+#include "ilo_debug.h"
+#include "ilo_state_cc.h"
+
+static bool
+cc_validate_gen6_stencil(const struct ilo_dev *dev,
+ const struct ilo_state_cc_info *info)
+{
+ const struct ilo_state_cc_stencil_info *stencil = &info->stencil;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ /*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 359:
+ *
+ * "If the Depth Buffer is either undefined or does not have a surface
+ * format of D32_FLOAT_S8X24_UINT or D24_UNORM_S8_UINT and separate
+ * stencil buffer is disabled, Stencil Test Enable must be DISABLED"
+ *
+ * From the Sandy Bridge PRM, volume 2 part 1, page 370:
+ *
+ * "This field (Stencil Test Enable) cannot be enabled if Surface
+ * Format in 3DSTATE_DEPTH_BUFFER is set to D16_UNORM."
+ */
+ if (stencil->test_enable)
+ assert(stencil->cv_has_buffer);
+
+ return true;
+}
+
+static bool
+cc_validate_gen6_depth(const struct ilo_dev *dev,
+ const struct ilo_state_cc_info *info)
+{
+ const struct ilo_state_cc_depth_info *depth = &info->depth;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ /*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 360:
+ *
+ * "Enabling the Depth Test function without defining a Depth Buffer is
+ * UNDEFINED."
+ *
+ * From the Sandy Bridge PRM, volume 2 part 1, page 375:
+ *
+ * "A Depth Buffer must be defined before enabling writes to it, or
+ * operation is UNDEFINED."
+ */
+ if (depth->test_enable || depth->write_enable)
+ assert(depth->cv_has_buffer);
+
+ return true;
+}
+
+static bool
+cc_set_gen6_DEPTH_STENCIL_STATE(struct ilo_state_cc *cc,
+ const struct ilo_dev *dev,
+ const struct ilo_state_cc_info *info)
+{
+ const struct ilo_state_cc_stencil_info *stencil = &info->stencil;
+ const struct ilo_state_cc_depth_info *depth = &info->depth;
+ const struct ilo_state_cc_params_info *params = &info->params;
+ uint32_t dw0, dw1, dw2;
+
+ ILO_DEV_ASSERT(dev, 6, 7.5);
+
+ if (!cc_validate_gen6_stencil(dev, info) ||
+ !cc_validate_gen6_depth(dev, info))
+ return false;
+
+ dw0 = 0;
+ dw1 = 0;
+ if (stencil->test_enable) {
+ const struct ilo_state_cc_stencil_op_info *front = &stencil->front;
+ const struct ilo_state_cc_stencil_params_info *front_p =
+ &params->stencil_front;
+ const struct ilo_state_cc_stencil_op_info *back;
+ const struct ilo_state_cc_stencil_params_info *back_p;
+
+ dw0 |= GEN6_ZS_DW0_STENCIL_TEST_ENABLE;
+
+ if (stencil->twosided_enable) {
+ dw0 |= GEN6_ZS_DW0_STENCIL1_ENABLE;
+
+ back = &stencil->back;
+ back_p = &params->stencil_back;
+ } else {
+ back = &stencil->front;
+ back_p = &params->stencil_front;
+ }
+
+ dw0 |= front->test_func << GEN6_ZS_DW0_STENCIL_FUNC__SHIFT |
+ front->fail_op << GEN6_ZS_DW0_STENCIL_FAIL_OP__SHIFT |
+ front->zfail_op << GEN6_ZS_DW0_STENCIL_ZFAIL_OP__SHIFT |
+ front->zpass_op << GEN6_ZS_DW0_STENCIL_ZPASS_OP__SHIFT |
+ back->test_func << GEN6_ZS_DW0_STENCIL1_FUNC__SHIFT |
+ back->fail_op << GEN6_ZS_DW0_STENCIL1_FAIL_OP__SHIFT |
+ back->zfail_op << GEN6_ZS_DW0_STENCIL1_ZFAIL_OP__SHIFT |
+ back->zpass_op << GEN6_ZS_DW0_STENCIL1_ZPASS_OP__SHIFT;
+
+ /*
+ * From the Ivy Bridge PRM, volume 2 part 1, page 363:
+ *
+ * "If this field (Stencil Buffer Write Enable) is enabled, Stencil
+ * Test Enable must also be enabled."
+ *
+ * This is different from depth write enable, which is independent from
+ * depth test enable.
+ */
+ if (front_p->write_mask || back_p->write_mask)
+ dw0 |= GEN6_ZS_DW0_STENCIL_WRITE_ENABLE;
+
+ dw1 |= front_p->test_mask << GEN6_ZS_DW1_STENCIL_TEST_MASK__SHIFT |
+ front_p->write_mask << GEN6_ZS_DW1_STENCIL_WRITE_MASK__SHIFT |
+ back_p->test_mask << GEN6_ZS_DW1_STENCIL1_TEST_MASK__SHIFT |
+ back_p->write_mask << GEN6_ZS_DW1_STENCIL1_WRITE_MASK__SHIFT;
+ }
+
+ dw2 = 0;
+ if (depth->test_enable) {
+ dw2 |= GEN6_ZS_DW2_DEPTH_TEST_ENABLE |
+ depth->test_func << GEN6_ZS_DW2_DEPTH_FUNC__SHIFT;
+ } else {
+ dw2 |= GEN6_COMPAREFUNCTION_ALWAYS << GEN6_ZS_DW2_DEPTH_FUNC__SHIFT;
+ }
+
+ /* independent from depth->test_enable */
+ if (depth->write_enable)
+ dw2 |= GEN6_ZS_DW2_DEPTH_WRITE_ENABLE;
+
+ STATIC_ASSERT(ARRAY_SIZE(cc->ds) >= 3);
+ cc->ds[0] = dw0;
+ cc->ds[1] = dw1;
+ cc->ds[2] = dw2;
+
+ return true;
+}
+
+static bool
+cc_set_gen8_3DSTATE_WM_DEPTH_STENCIL(struct ilo_state_cc *cc,
+ const struct ilo_dev *dev,
+ const struct ilo_state_cc_info *info)
+{
+ const struct ilo_state_cc_stencil_info *stencil = &info->stencil;
+ const struct ilo_state_cc_depth_info *depth = &info->depth;
+ const struct ilo_state_cc_params_info *params = &info->params;
+ uint32_t dw1, dw2;
+
+ ILO_DEV_ASSERT(dev, 8, 8);
+
+ if (!cc_validate_gen6_stencil(dev, info) ||
+ !cc_validate_gen6_depth(dev, info))
+ return false;
+
+ dw1 = 0;
+ dw2 = 0;
+ if (stencil->test_enable) {
+ const struct ilo_state_cc_stencil_op_info *front = &stencil->front;
+ const struct ilo_state_cc_stencil_params_info *front_p =
+ &params->stencil_front;
+ const struct ilo_state_cc_stencil_op_info *back;
+ const struct ilo_state_cc_stencil_params_info *back_p;
+
+ dw1 |= GEN8_ZS_DW1_STENCIL_TEST_ENABLE;
+
+ if (stencil->twosided_enable) {
+ dw1 |= GEN8_ZS_DW1_STENCIL1_ENABLE;
+
+ back = &stencil->back;
+ back_p = &params->stencil_back;
+ } else {
+ back = &stencil->front;
+ back_p = &params->stencil_front;
+ }
+
+ dw1 |= front->fail_op << GEN8_ZS_DW1_STENCIL_FAIL_OP__SHIFT |
+ front->zfail_op << GEN8_ZS_DW1_STENCIL_ZFAIL_OP__SHIFT |
+ front->zpass_op << GEN8_ZS_DW1_STENCIL_ZPASS_OP__SHIFT |
+ back->test_func << GEN8_ZS_DW1_STENCIL1_FUNC__SHIFT |
+ back->fail_op << GEN8_ZS_DW1_STENCIL1_FAIL_OP__SHIFT |
+ back->zfail_op << GEN8_ZS_DW1_STENCIL1_ZFAIL_OP__SHIFT |
+ back->zpass_op << GEN8_ZS_DW1_STENCIL1_ZPASS_OP__SHIFT |
+ front->test_func << GEN8_ZS_DW1_STENCIL_FUNC__SHIFT;
+
+ if (front_p->write_mask || back_p->write_mask)
+ dw1 |= GEN8_ZS_DW1_STENCIL_WRITE_ENABLE;
+
+ dw2 |= front_p->test_mask << GEN8_ZS_DW2_STENCIL_TEST_MASK__SHIFT |
+ front_p->write_mask << GEN8_ZS_DW2_STENCIL_WRITE_MASK__SHIFT |
+ back_p->test_mask << GEN8_ZS_DW2_STENCIL1_TEST_MASK__SHIFT |
+ back_p->write_mask << GEN8_ZS_DW2_STENCIL1_WRITE_MASK__SHIFT;
+ }
+
+ if (depth->test_enable) {
+ dw1 |= GEN8_ZS_DW1_DEPTH_TEST_ENABLE |
+ depth->test_func << GEN8_ZS_DW1_DEPTH_FUNC__SHIFT;
+ } else {
+ dw1 |= GEN6_COMPAREFUNCTION_ALWAYS << GEN8_ZS_DW1_DEPTH_FUNC__SHIFT;
+ }
+
+ if (depth->write_enable)
+ dw1 |= GEN8_ZS_DW1_DEPTH_WRITE_ENABLE;
+
+ STATIC_ASSERT(ARRAY_SIZE(cc->ds) >= 2);
+ cc->ds[0] = dw1;
+ cc->ds[1] = dw2;
+
+ return true;
+}
+
+static bool
+is_dual_source_blend_factor(enum gen_blend_factor factor)
+{
+ switch (factor) {
+ case GEN6_BLENDFACTOR_SRC1_COLOR:
+ case GEN6_BLENDFACTOR_SRC1_ALPHA:
+ case GEN6_BLENDFACTOR_INV_SRC1_COLOR:
+ case GEN6_BLENDFACTOR_INV_SRC1_ALPHA:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static bool
+cc_get_gen6_dual_source_blending(const struct ilo_dev *dev,
+ const struct ilo_state_cc_info *info)
+{
+ const struct ilo_state_cc_blend_info *blend = &info->blend;
+ bool dual_source_blending;
+ uint8_t i;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ dual_source_blending = (blend->rt_count &&
+ (is_dual_source_blend_factor(blend->rt[0].rgb_src) ||
+ is_dual_source_blend_factor(blend->rt[0].rgb_dst) ||
+ is_dual_source_blend_factor(blend->rt[0].a_src) ||
+ is_dual_source_blend_factor(blend->rt[0].a_dst)));
+
+ /*
+ * From the Ivy Bridge PRM, volume 2 part 1, page 356:
+ *
+ * "Dual Source Blending: When using "Dual Source" Render Target
+ * Write messages, the Source1 pixel color+alpha passed in the
+ * message can be selected as a src/dst blend factor. See Color
+ * Buffer Blending. In single-source mode, those blend factor
+ * selections are invalid. If SRC1 is included in a src/dst blend
+ * factor and a DualSource RT Write message is not utilized,
+ * results are UNDEFINED. (This reflects the same restriction in DX
+ * APIs, where undefined results are produced if "o1" is not
+ * written by a PS - there are no default values defined). If SRC1
+ * is not included in a src/dst blend factor, dual source blending
+ * must be disabled."
+ *
+ * From the Ivy Bridge PRM, volume 4 part 1, page 356:
+ *
+ * "The single source message will not cause a write to the render
+ * target if Dual Source Blend Enable in 3DSTATE_WM is enabled."
+ *
+ * "The dual source message will revert to a single source message
+ * using source 0 if Dual Source Blend Enable in 3DSTATE_WM is
+ * disabled."
+ *
+ * Dual source blending must be enabled or disabled universally.
+ */
+ for (i = 1; i < blend->rt_count; i++) {
+ assert(dual_source_blending ==
+ (is_dual_source_blend_factor(blend->rt[i].rgb_src) ||
+ is_dual_source_blend_factor(blend->rt[i].rgb_dst) ||
+ is_dual_source_blend_factor(blend->rt[i].a_src) ||
+ is_dual_source_blend_factor(blend->rt[i].a_dst)));
+ }
+
+ return dual_source_blending;
+}
+
+static bool
+cc_validate_gen6_alpha(const struct ilo_dev *dev,
+ const struct ilo_state_cc_info *info)
+{
+ const struct ilo_state_cc_alpha_info *alpha = &info->alpha;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ /*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 356:
+ *
+ * "Alpha values from the pixel shader are treated as FLOAT32 format
+ * for computing the AlphaToCoverage Mask."
+ *
+ * From the Sandy Bridge PRM, volume 2 part 1, page 378:
+ *
+ * "If set (AlphaToCoverage Enable), Source0 Alpha is converted to a
+ * temporary 1/2/4-bit coverage mask and the mask bit corresponding to
+ * the sample# ANDed with the sample mask bit. If set, sample coverage
+ * is computed based on src0 alpha value. Value of 0 disables all
+ * samples and value of 1 enables all samples for that pixel. The same
+ * coverage needs to apply to all the RTs in MRT case. Further, any
+ * value of src0 alpha between 0 and 1 monotonically increases the
+ * number of enabled pixels.
+ *
+ * The same coverage needs to be applied to all the RTs in MRT case."
+ *
+ * "If set (AlphaToOne Enable), Source0 Alpha is set to 1.0f after
+ * (possibly) being used to generate the AlphaToCoverage coverage
+ * mask.
+ *
+ * The same coverage needs to be applied to all the RTs in MRT case.
+ *
+ * If Dual Source Blending is enabled, this bit must be disabled."
+ *
+ * From the Sandy Bridge PRM, volume 2 part 1, page 382:
+ *
+ * "Alpha Test can only be enabled if Pixel Shader outputs a float
+ * alpha value.
+ *
+ * Alpha Test is applied independently on each render target by
+ * comparing that render target's alpha value against the alpha
+ * reference value. If the alpha test fails, the corresponding pixel
+    *     write will be suppressed only for that render target. The
+ * depth/stencil update will occur if alpha test passes for any render
+ * target."
+ *
+ * From the Sandy Bridge PRM, volume 4 part 1, page 194:
+ *
+ * "Multiple render targets are supported with the single source and
+ * replicate data messages. Each render target is accessed with a
+ * separate Render Target Write message, each with a different surface
+ * indicated (different binding table index). The depth buffer is
+ * written only by the message(s) to the last render target, indicated
+ * by the Last Render Target Select bit set to clear the pixel
+ * scoreboard bits."
+ *
+ * When AlphaToCoverage/AlphaToOne/AlphaTest is enabled, it is
+ * required/desirable for the RT write messages to set "Source0 Alpha
+ * Present to RenderTarget" in the MRT case. It is also required/desirable
+ * for the alpha values to be FLOAT32.
+ */
+ if (alpha->alpha_to_coverage || alpha->alpha_to_one || alpha->test_enable)
+ assert(alpha->cv_float_source0_alpha);
+
+ /*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 356:
+ *
+ * "[DevSNB]: When NumSamples = 1, AlphaToCoverage and AlphaTo
+ * Coverage Dither both must be disabled."
+ */
+ if (ilo_dev_gen(dev) == ILO_GEN(6) && alpha->alpha_to_coverage)
+ assert(alpha->cv_sample_count_one);
+
+ /*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 378:
+ *
+ * "If Dual Source Blending is enabled, this bit (AlphaToOne Enable)
+ * must be disabled."
+ */
+ if (alpha->alpha_to_one)
+ assert(!cc_get_gen6_dual_source_blending(dev, info));
+
+ return true;
+}
+
+static bool
+cc_validate_gen6_blend(const struct ilo_dev *dev,
+ const struct ilo_state_cc_info *info)
+{
+ const struct ilo_state_cc_blend_info *blend = &info->blend;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ assert(blend->rt_count <= ILO_STATE_CC_BLEND_MAX_RT_COUNT);
+
+ return true;
+}
+
+static enum gen_blend_factor
+get_dst_alpha_one_blend_factor(enum gen_blend_factor factor, bool is_rgb)
+{
+ switch (factor) {
+ case GEN6_BLENDFACTOR_DST_ALPHA:
+ return GEN6_BLENDFACTOR_ONE;
+ case GEN6_BLENDFACTOR_INV_DST_ALPHA:
+ return GEN6_BLENDFACTOR_ZERO;
+ case GEN6_BLENDFACTOR_SRC_ALPHA_SATURATE:
+ return (is_rgb) ? GEN6_BLENDFACTOR_ZERO : GEN6_BLENDFACTOR_ONE;
+ default:
+ return factor;
+ }
+}
+
+static void
+cc_get_gen6_effective_rt(const struct ilo_dev *dev,
+ const struct ilo_state_cc_info *info,
+ uint8_t rt_index,
+ struct ilo_state_cc_blend_rt_info *dst)
+{
+ const struct ilo_state_cc_blend_rt_info *rt = &info->blend.rt[rt_index];
+
+ if (rt->logicop_enable || rt->blend_enable ||
+ rt->argb_write_disables != 0xf)
+ assert(rt->cv_has_buffer);
+
+ /*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 365:
+ *
+ * "Logic Ops are only supported on *_UNORM surfaces (excluding _SRGB
+ * variants), otherwise Logic Ops must be DISABLED."
+ *
+ * From the Broadwell PRM, volume 7, page 671:
+ *
+ * "Logic Ops are supported on all blendable render targets and render
+ * targets with *INT formats."
+ */
+ if (ilo_dev_gen(dev) < ILO_GEN(8) && rt->logicop_enable)
+ assert(rt->cv_is_unorm);
+
+ /*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 361:
+ *
+ * "Only certain surface formats support Color Buffer Blending. Refer
+ * to the Surface Format tables in Sampling Engine. Blending must be
+ * disabled on a RenderTarget if blending is not supported."
+ *
+ * From the Sandy Bridge PRM, volume 2 part 1, page 365:
+ *
+ * "Color Buffer Blending and Logic Ops must not be enabled
+ * simultaneously, or behavior is UNDEFINED."
+ */
+ if (rt->blend_enable)
+ assert(!rt->cv_is_integer && !rt->logicop_enable);
+
+ *dst = *rt;
+ if (rt->blend_enable) {
+ /* 0x0 is reserved in enum gen_blend_factor */
+ assert(rt->rgb_src && rt->rgb_dst && rt->a_src && rt->a_dst);
+
+ if (rt->force_dst_alpha_one) {
+ dst->rgb_src = get_dst_alpha_one_blend_factor(rt->rgb_src, true);
+ dst->rgb_dst = get_dst_alpha_one_blend_factor(rt->rgb_dst, true);
+ dst->a_src = get_dst_alpha_one_blend_factor(rt->a_src, false);
+ dst->a_dst = get_dst_alpha_one_blend_factor(rt->a_dst, false);
+ dst->force_dst_alpha_one = false;
+ }
+ } else {
+ dst->rgb_src = GEN6_BLENDFACTOR_ONE;
+ dst->rgb_dst = GEN6_BLENDFACTOR_ZERO;
+ dst->rgb_func = GEN6_BLENDFUNCTION_ADD;
+ dst->a_src = dst->rgb_src;
+ dst->a_dst = dst->rgb_dst;
+ dst->a_func = dst->rgb_func;
+ }
+}
+
+static bool
+cc_set_gen6_BLEND_STATE(struct ilo_state_cc *cc,
+ const struct ilo_dev *dev,
+ const struct ilo_state_cc_info *info)
+{
+ const struct ilo_state_cc_alpha_info *alpha = &info->alpha;
+ const struct ilo_state_cc_blend_info *blend = &info->blend;
+ uint32_t dw_rt[2 * ILO_STATE_CC_BLEND_MAX_RT_COUNT], dw1_invariant;
+ uint32_t dw0, dw1;
+ uint8_t i;
+
+ ILO_DEV_ASSERT(dev, 6, 7.5);
+
+ if (!cc_validate_gen6_alpha(dev, info) ||
+ !cc_validate_gen6_blend(dev, info))
+ return false;
+
+ /*
+ * According to the Sandy Bridge PRM, volume 2 part 1, page 360, pre-blend
+ * and post-blend color clamps must be enabled in most cases. For the
+ * other cases, they are either desirable or ignored. We can enable them
+ * unconditionally.
+ */
+ dw1 = GEN6_RT_DW1_COLORCLAMP_RTFORMAT |
+ GEN6_RT_DW1_PRE_BLEND_CLAMP |
+ GEN6_RT_DW1_POST_BLEND_CLAMP;
+
+ if (alpha->alpha_to_coverage) {
+ dw1 |= GEN6_RT_DW1_ALPHA_TO_COVERAGE;
+
+ /*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 379:
+ *
+ * "[DevSNB]: This bit (AlphaToCoverage Dither Enable) must be
+ * disabled."
+ */
+ if (ilo_dev_gen(dev) >= ILO_GEN(7))
+ dw1 |= GEN6_RT_DW1_ALPHA_TO_COVERAGE_DITHER;
+ }
+
+ if (alpha->alpha_to_one)
+ dw1 |= GEN6_RT_DW1_ALPHA_TO_ONE;
+
+ if (alpha->test_enable) {
+ dw1 |= GEN6_RT_DW1_ALPHA_TEST_ENABLE |
+ alpha->test_func << GEN6_RT_DW1_ALPHA_TEST_FUNC__SHIFT;
+ } else {
+ /*
+ * From the Ivy Bridge PRM, volume 2 part 1, page 371:
+ *
+ * "When Alpha Test is disabled, Alpha Test Function must be
+ * COMPAREFUNCTION_ALWAYS."
+ */
+ dw1 |= GEN6_COMPAREFUNCTION_ALWAYS <<
+ GEN6_RT_DW1_ALPHA_TEST_FUNC__SHIFT;
+ }
+
+ if (blend->dither_enable)
+ dw1 |= GEN6_RT_DW1_DITHER_ENABLE;
+
+ dw1_invariant = dw1;
+
+ for (i = 0; i < blend->rt_count; i++) {
+ struct ilo_state_cc_blend_rt_info rt;
+
+ cc_get_gen6_effective_rt(dev, info, i, &rt);
+
+ /* 0x0 is reserved for blend factors and we have to set them all */
+ dw0 = rt.a_func << GEN6_RT_DW0_ALPHA_FUNC__SHIFT |
+ rt.a_src << GEN6_RT_DW0_SRC_ALPHA_FACTOR__SHIFT |
+ rt.a_dst << GEN6_RT_DW0_DST_ALPHA_FACTOR__SHIFT |
+ rt.rgb_func << GEN6_RT_DW0_COLOR_FUNC__SHIFT |
+ rt.rgb_src << GEN6_RT_DW0_SRC_COLOR_FACTOR__SHIFT |
+ rt.rgb_dst << GEN6_RT_DW0_DST_COLOR_FACTOR__SHIFT;
+
+ if (rt.blend_enable) {
+ dw0 |= GEN6_RT_DW0_BLEND_ENABLE;
+
+ if (rt.a_src != rt.rgb_src ||
+ rt.a_dst != rt.rgb_dst ||
+ rt.a_func != rt.rgb_func)
+ dw0 |= GEN6_RT_DW0_INDEPENDENT_ALPHA_ENABLE;
+ }
+
+ dw1 = dw1_invariant |
+ rt.argb_write_disables << GEN6_RT_DW1_WRITE_DISABLES__SHIFT;
+
+ if (rt.logicop_enable) {
+ dw1 |= GEN6_RT_DW1_LOGICOP_ENABLE |
+ rt.logicop_func << GEN6_RT_DW1_LOGICOP_FUNC__SHIFT;
+ }
+
+ dw_rt[2 * i + 0] = dw0;
+ dw_rt[2 * i + 1] = dw1;
+ }
+
+
+ STATIC_ASSERT(ARRAY_SIZE(cc->blend) >= ARRAY_SIZE(dw_rt));
+ memcpy(&cc->blend[0], dw_rt, sizeof(uint32_t) * 2 * blend->rt_count);
+ cc->blend_state_count = info->blend.rt_count;
+
+ return true;
+}
+
+static bool
+cc_set_gen8_BLEND_STATE(struct ilo_state_cc *cc,
+ const struct ilo_dev *dev,
+ const struct ilo_state_cc_info *info)
+{
+ const struct ilo_state_cc_alpha_info *alpha = &info->alpha;
+ const struct ilo_state_cc_blend_info *blend = &info->blend;
+ uint32_t dw_rt[2 * ILO_STATE_CC_BLEND_MAX_RT_COUNT], dw0, dw1;
+ bool indep_alpha_enable;
+ uint8_t i;
+
+ ILO_DEV_ASSERT(dev, 8, 8);
+
+ if (!cc_validate_gen6_alpha(dev, info) ||
+ !cc_validate_gen6_blend(dev, info))
+ return false;
+
+ indep_alpha_enable = false;
+ for (i = 0; i < blend->rt_count; i++) {
+ struct ilo_state_cc_blend_rt_info rt;
+
+ cc_get_gen6_effective_rt(dev, info, i, &rt);
+
+ dw0 = rt.rgb_src << GEN8_RT_DW0_SRC_COLOR_FACTOR__SHIFT |
+ rt.rgb_dst << GEN8_RT_DW0_DST_COLOR_FACTOR__SHIFT |
+ rt.rgb_func << GEN8_RT_DW0_COLOR_FUNC__SHIFT |
+ rt.a_src << GEN8_RT_DW0_SRC_ALPHA_FACTOR__SHIFT |
+ rt.a_dst << GEN8_RT_DW0_DST_ALPHA_FACTOR__SHIFT |
+ rt.a_func << GEN8_RT_DW0_ALPHA_FUNC__SHIFT |
+ rt.argb_write_disables << GEN8_RT_DW0_WRITE_DISABLES__SHIFT;
+
+ if (rt.blend_enable) {
+ dw0 |= GEN8_RT_DW0_BLEND_ENABLE;
+
+ if (rt.a_src != rt.rgb_src ||
+ rt.a_dst != rt.rgb_dst ||
+ rt.a_func != rt.rgb_func)
+ indep_alpha_enable = true;
+ }
+
+ dw1 = GEN8_RT_DW1_COLORCLAMP_RTFORMAT |
+ GEN8_RT_DW1_PRE_BLEND_CLAMP |
+ GEN8_RT_DW1_POST_BLEND_CLAMP;
+
+ if (rt.logicop_enable) {
+ dw1 |= GEN8_RT_DW1_LOGICOP_ENABLE |
+ rt.logicop_func << GEN8_RT_DW1_LOGICOP_FUNC__SHIFT;
+ }
+
+ dw_rt[2 * i + 0] = dw0;
+ dw_rt[2 * i + 1] = dw1;
+ }
+
+ dw0 = 0;
+
+ if (alpha->alpha_to_coverage) {
+ dw0 |= GEN8_BLEND_DW0_ALPHA_TO_COVERAGE |
+ GEN8_BLEND_DW0_ALPHA_TO_COVERAGE_DITHER;
+ }
+
+ if (indep_alpha_enable)
+ dw0 |= GEN8_BLEND_DW0_INDEPENDENT_ALPHA_ENABLE;
+
+ if (alpha->alpha_to_one)
+ dw0 |= GEN8_BLEND_DW0_ALPHA_TO_ONE;
+
+ if (alpha->test_enable) {
+ dw0 |= GEN8_BLEND_DW0_ALPHA_TEST_ENABLE |
+ alpha->test_func << GEN8_BLEND_DW0_ALPHA_TEST_FUNC__SHIFT;
+ } else {
+ dw0 |= GEN6_COMPAREFUNCTION_ALWAYS <<
+ GEN8_BLEND_DW0_ALPHA_TEST_FUNC__SHIFT;
+ }
+
+ if (blend->dither_enable)
+ dw0 |= GEN8_BLEND_DW0_DITHER_ENABLE;
+
+ STATIC_ASSERT(ARRAY_SIZE(cc->blend) >= 2 + ARRAY_SIZE(dw_rt));
+ cc->blend[1] = dw0;
+ memcpy(&cc->blend[2], dw_rt, sizeof(uint32_t) * 2 * blend->rt_count);
+ cc->blend_state_count = info->blend.rt_count;
+
+ return true;
+}
+
+static bool
+cc_set_gen8_3DSTATE_PS_BLEND(struct ilo_state_cc *cc,
+ const struct ilo_dev *dev,
+ const struct ilo_state_cc_info *info)
+{
+ const struct ilo_state_cc_alpha_info *alpha = &info->alpha;
+ const struct ilo_state_cc_blend_info *blend = &info->blend;
+ uint32_t dw1;
+
+ ILO_DEV_ASSERT(dev, 8, 8);
+
+ dw1 = 0;
+
+ if (alpha->alpha_to_coverage)
+ dw1 |= GEN8_PS_BLEND_DW1_ALPHA_TO_COVERAGE;
+
+ if (alpha->test_enable)
+ dw1 |= GEN8_PS_BLEND_DW1_ALPHA_TEST_ENABLE;
+
+ if (blend->rt_count) {
+ struct ilo_state_cc_blend_rt_info rt0;
+ uint8_t i;
+
+ cc_get_gen6_effective_rt(dev, info, 0, &rt0);
+
+ /* 0x0 is reserved for blend factors and we have to set them all */
+ dw1 |= rt0.a_src << GEN8_PS_BLEND_DW1_SRC_ALPHA_FACTOR__SHIFT |
+ rt0.a_dst << GEN8_PS_BLEND_DW1_DST_ALPHA_FACTOR__SHIFT |
+ rt0.rgb_src << GEN8_PS_BLEND_DW1_SRC_COLOR_FACTOR__SHIFT |
+ rt0.rgb_dst << GEN8_PS_BLEND_DW1_DST_COLOR_FACTOR__SHIFT;
+
+ for (i = 0; i < blend->rt_count; i++) {
+ if (blend->rt[i].argb_write_disables != 0xf) {
+ dw1 |= GEN8_PS_BLEND_DW1_WRITABLE_RT;
+ break;
+ }
+ }
+
+ if (rt0.blend_enable) {
+ dw1 |= GEN8_PS_BLEND_DW1_BLEND_ENABLE;
+
+ if (rt0.a_src != rt0.rgb_src || rt0.a_dst != rt0.rgb_dst)
+ dw1 |= GEN8_PS_BLEND_DW1_INDEPENDENT_ALPHA_ENABLE;
+ }
+ }
+
+ STATIC_ASSERT(ARRAY_SIZE(cc->blend) >= 1);
+ cc->blend[0] = dw1;
+
+ return true;
+}
+
+static bool
+cc_params_set_gen6_COLOR_CALC_STATE(struct ilo_state_cc *cc,
+ const struct ilo_dev *dev,
+ const struct ilo_state_cc_params_info *params)
+{
+ uint32_t dw0;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ dw0 = params->stencil_front.test_ref << GEN6_CC_DW0_STENCIL_REF__SHIFT |
+ params->stencil_back.test_ref << GEN6_CC_DW0_STENCIL1_REF__SHIFT |
+ GEN6_CC_DW0_ALPHATEST_FLOAT32;
+
+ STATIC_ASSERT(ARRAY_SIZE(cc->cc) >= 6);
+ cc->cc[0] = dw0;
+ cc->cc[1] = fui(params->alpha_ref);
+ cc->cc[2] = fui(params->blend_rgba[0]);
+ cc->cc[3] = fui(params->blend_rgba[1]);
+ cc->cc[4] = fui(params->blend_rgba[2]);
+ cc->cc[5] = fui(params->blend_rgba[3]);
+
+ return true;
+}
+
+bool
+ilo_state_cc_init(struct ilo_state_cc *cc,
+ const struct ilo_dev *dev,
+ const struct ilo_state_cc_info *info)
+{
+ assert(ilo_is_zeroed(cc, sizeof(*cc)));
+ return ilo_state_cc_set_info(cc, dev, info);
+}
+
+bool
+ilo_state_cc_set_info(struct ilo_state_cc *cc,
+ const struct ilo_dev *dev,
+ const struct ilo_state_cc_info *info)
+{
+ bool ret = true;
+
+ if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
+ ret &= cc_set_gen8_3DSTATE_WM_DEPTH_STENCIL(cc, dev, info);
+ ret &= cc_set_gen8_BLEND_STATE(cc, dev, info);
+ ret &= cc_set_gen8_3DSTATE_PS_BLEND(cc, dev, info);
+ } else {
+ ret &= cc_set_gen6_DEPTH_STENCIL_STATE(cc, dev, info);
+ ret &= cc_set_gen6_BLEND_STATE(cc, dev, info);
+ }
+
+ ret &= cc_params_set_gen6_COLOR_CALC_STATE(cc, dev, &info->params);
+
+ assert(ret);
+
+ return ret;
+}
+
+bool
+ilo_state_cc_set_params(struct ilo_state_cc *cc,
+ const struct ilo_dev *dev,
+ const struct ilo_state_cc_params_info *params)
+{
+ /* modify stencil masks */
+ if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
+ uint32_t dw1 = cc->ds[0];
+ uint32_t dw2 = cc->ds[1];
+
+ if (dw1 & GEN8_ZS_DW1_STENCIL_TEST_ENABLE) {
+ const bool twosided_enable = (dw1 & GEN8_ZS_DW1_STENCIL1_ENABLE);
+ const struct ilo_state_cc_stencil_params_info *front_p =
+ &params->stencil_front;
+ const struct ilo_state_cc_stencil_params_info *back_p =
+ (twosided_enable) ? &params->stencil_back :
+ &params->stencil_front;
+
+ if (front_p->write_mask || back_p->write_mask)
+ dw1 |= GEN8_ZS_DW1_STENCIL_WRITE_ENABLE;
+ else
+ dw1 &= ~GEN8_ZS_DW1_STENCIL_WRITE_ENABLE;
+
+ dw2 =
+ front_p->test_mask << GEN8_ZS_DW2_STENCIL_TEST_MASK__SHIFT |
+ front_p->write_mask << GEN8_ZS_DW2_STENCIL_WRITE_MASK__SHIFT |
+ back_p->test_mask << GEN8_ZS_DW2_STENCIL1_TEST_MASK__SHIFT |
+ back_p->write_mask << GEN8_ZS_DW2_STENCIL1_WRITE_MASK__SHIFT;
+ }
+
+ cc->ds[0] = dw1;
+ cc->ds[1] = dw2;
+ } else {
+ uint32_t dw0 = cc->ds[0];
+ uint32_t dw1 = cc->ds[1];
+
+ if (dw0 & GEN6_ZS_DW0_STENCIL_TEST_ENABLE) {
+ const bool twosided_enable = (dw0 & GEN6_ZS_DW0_STENCIL1_ENABLE);
+ const struct ilo_state_cc_stencil_params_info *front_p =
+ &params->stencil_front;
+ const struct ilo_state_cc_stencil_params_info *back_p =
+ (twosided_enable) ? &params->stencil_back :
+ &params->stencil_front;
+
+ if (front_p->write_mask || back_p->write_mask)
+ dw0 |= GEN6_ZS_DW0_STENCIL_WRITE_ENABLE;
+ else
+ dw0 &= ~GEN6_ZS_DW0_STENCIL_WRITE_ENABLE;
+
+ dw1 =
+ front_p->test_mask << GEN6_ZS_DW1_STENCIL_TEST_MASK__SHIFT |
+ front_p->write_mask << GEN6_ZS_DW1_STENCIL_WRITE_MASK__SHIFT |
+ back_p->test_mask << GEN6_ZS_DW1_STENCIL1_TEST_MASK__SHIFT |
+ back_p->write_mask << GEN6_ZS_DW1_STENCIL1_WRITE_MASK__SHIFT;
+ }
+
+ cc->ds[0] = dw0;
+ cc->ds[1] = dw1;
+ }
+
+ /* modify COLOR_CALC_STATE */
+ cc_params_set_gen6_COLOR_CALC_STATE(cc, dev, params);
+
+ return true;
+}
+
+void
+ilo_state_cc_full_delta(const struct ilo_state_cc *cc,
+ const struct ilo_dev *dev,
+ struct ilo_state_cc_delta *delta)
+{
+ delta->dirty = ILO_STATE_CC_BLEND_STATE |
+ ILO_STATE_CC_COLOR_CALC_STATE;
+
+ if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
+ delta->dirty |= ILO_STATE_CC_3DSTATE_WM_DEPTH_STENCIL |
+ ILO_STATE_CC_3DSTATE_PS_BLEND;
+ } else {
+ delta->dirty |= ILO_STATE_CC_DEPTH_STENCIL_STATE;
+ }
+}
+
+void
+ilo_state_cc_get_delta(const struct ilo_state_cc *cc,
+ const struct ilo_dev *dev,
+ const struct ilo_state_cc *old,
+ struct ilo_state_cc_delta *delta)
+{
+ delta->dirty = 0;
+
+ if (memcmp(cc->ds, old->ds, sizeof(cc->ds))) {
+ if (ilo_dev_gen(dev) >= ILO_GEN(8))
+ delta->dirty |= ILO_STATE_CC_3DSTATE_WM_DEPTH_STENCIL;
+ else
+ delta->dirty |= ILO_STATE_CC_DEPTH_STENCIL_STATE;
+ }
+
+ if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
+ if (cc->blend[0] != old->blend[0])
+ delta->dirty |= ILO_STATE_CC_3DSTATE_PS_BLEND;
+
+ if (memcmp(&cc->blend[1], &old->blend[1],
+ sizeof(uint32_t) * (1 + 2 * cc->blend_state_count)))
+ delta->dirty |= ILO_STATE_CC_BLEND_STATE;
+ } else if (memcmp(cc->blend, old->blend,
+ sizeof(uint32_t) * 2 * cc->blend_state_count)) {
+ delta->dirty |= ILO_STATE_CC_BLEND_STATE;
+ }
+
+ if (memcmp(cc->cc, old->cc, sizeof(cc->cc)))
+ delta->dirty |= ILO_STATE_CC_COLOR_CALC_STATE;
+}
diff --git a/src/gallium/drivers/ilo/core/ilo_state_cc.h b/src/gallium/drivers/ilo/core/ilo_state_cc.h
new file mode 100644
index 00000000000..5b96a60f988
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_cc.h
@@ -0,0 +1,199 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Chia-I Wu <[email protected]>
+ */
+
+#ifndef ILO_STATE_CC_H
+#define ILO_STATE_CC_H
+
+#include "genhw/genhw.h"
+
+#include "ilo_core.h"
+#include "ilo_dev.h"
+
+/*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 38:
+ *
+ * "Render Target Index. Specifies the render target index that will be
+ * used to select blend state from BLEND_STATE.
+ * Format = U3"
+ */
+#define ILO_STATE_CC_BLEND_MAX_RT_COUNT 8
+
+/* bits reported via ilo_state_cc_delta::dirty */
+enum ilo_state_cc_dirty_bits {
+ ILO_STATE_CC_3DSTATE_WM_DEPTH_STENCIL = (1 << 0), /* Gen8+ only */
+ ILO_STATE_CC_3DSTATE_PS_BLEND = (1 << 1), /* Gen8+ only */
+ ILO_STATE_CC_DEPTH_STENCIL_STATE = (1 << 2), /* pre-Gen8 only */
+ ILO_STATE_CC_BLEND_STATE = (1 << 3),
+ ILO_STATE_CC_COLOR_CALC_STATE = (1 << 4),
+};
+
+/**
+ * AlphaCoverage and AlphaTest.
+ *
+ * NOTE(review): the cv_* fields appear to carry facts derived from other
+ * state objects for cross-validation — confirm against the setters in the
+ * corresponding .c file.
+ */
+struct ilo_state_cc_alpha_info {
+ bool cv_sample_count_one;
+ bool cv_float_source0_alpha;
+
+ bool alpha_to_coverage;
+ bool alpha_to_one;
+
+ bool test_enable;
+ enum gen_compare_function test_func;
+};
+
+/* one set of stencil test/ops, used for the front or the back face */
+struct ilo_state_cc_stencil_op_info {
+ enum gen_compare_function test_func;
+ enum gen_stencil_op fail_op;
+ enum gen_stencil_op zfail_op;
+ enum gen_stencil_op zpass_op;
+};
+
+/**
+ * StencilTest.
+ */
+struct ilo_state_cc_stencil_info {
+ bool cv_has_buffer;
+
+ bool test_enable;
+ bool twosided_enable; /* use separate ops for back faces */
+
+ struct ilo_state_cc_stencil_op_info front;
+ struct ilo_state_cc_stencil_op_info back;
+};
+
+/**
+ * DepthTest.
+ */
+struct ilo_state_cc_depth_info {
+ bool cv_has_buffer;
+
+ bool test_enable;
+ /* independent from test_enable */
+ bool write_enable;
+
+ enum gen_compare_function test_func;
+};
+
+/* per-render-target blending state */
+struct ilo_state_cc_blend_rt_info {
+ bool cv_has_buffer;
+ bool cv_is_unorm;
+ bool cv_is_integer;
+
+ uint8_t argb_write_disables; /* bitmask of disabled A/R/G/B channels */
+
+ bool logicop_enable;
+ enum gen_logic_op logicop_func;
+
+ bool blend_enable;
+ bool force_dst_alpha_one;
+ enum gen_blend_factor rgb_src;
+ enum gen_blend_factor rgb_dst;
+ enum gen_blend_function rgb_func;
+ enum gen_blend_factor a_src;
+ enum gen_blend_factor a_dst;
+ enum gen_blend_function a_func;
+};
+
+/**
+ * ColorBufferBlending, Dithering, and LogicOps.
+ */
+struct ilo_state_cc_blend_info {
+ const struct ilo_state_cc_blend_rt_info *rt; /* array of rt_count entries */
+ uint8_t rt_count;
+
+ bool dither_enable;
+};
+
+/* stencil reference/masks; split out so they can change without a re-init */
+struct ilo_state_cc_stencil_params_info {
+ uint8_t test_ref;
+ uint8_t test_mask;
+ uint8_t write_mask;
+};
+
+/**
+ * CC parameters.
+ */
+struct ilo_state_cc_params_info {
+ float alpha_ref;
+
+ struct ilo_state_cc_stencil_params_info stencil_front;
+ struct ilo_state_cc_stencil_params_info stencil_back;
+
+ float blend_rgba[4]; /* constant blend color */
+};
+
+/**
+ * Pixel processing.
+ */
+struct ilo_state_cc_info {
+ struct ilo_state_cc_alpha_info alpha;
+ struct ilo_state_cc_stencil_info stencil;
+ struct ilo_state_cc_depth_info depth;
+ struct ilo_state_cc_blend_info blend;
+
+ struct ilo_state_cc_params_info params;
+};
+
+/* pre-packed hardware DWs; see ilo_state_cc_get_delta() for the
+ * gen-specific layout of ds[] and blend[] */
+struct ilo_state_cc {
+ uint32_t ds[3];
+
+ uint8_t blend_state_count; /* number of per-RT entries packed in blend[] */
+ uint32_t blend[1 + 1 + 2 * ILO_STATE_CC_BLEND_MAX_RT_COUNT];
+
+ uint32_t cc[6]; /* COLOR_CALC_STATE */
+};
+
+/* which atoms need re-emission */
+struct ilo_state_cc_delta {
+ uint32_t dirty; /* bitmask of enum ilo_state_cc_dirty_bits */
+};
+
+bool
+ilo_state_cc_init(struct ilo_state_cc *cc,
+ const struct ilo_dev *dev,
+ const struct ilo_state_cc_info *info);
+
+bool
+ilo_state_cc_set_info(struct ilo_state_cc *cc,
+ const struct ilo_dev *dev,
+ const struct ilo_state_cc_info *info);
+
+bool
+ilo_state_cc_set_params(struct ilo_state_cc *cc,
+ const struct ilo_dev *dev,
+ const struct ilo_state_cc_params_info *params);
+
+void
+ilo_state_cc_full_delta(const struct ilo_state_cc *cc,
+ const struct ilo_dev *dev,
+ struct ilo_state_cc_delta *delta);
+
+void
+ilo_state_cc_get_delta(const struct ilo_state_cc *cc,
+ const struct ilo_dev *dev,
+ const struct ilo_state_cc *old,
+ struct ilo_state_cc_delta *delta);
+
+#endif /* ILO_STATE_CC_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_state_compute.c b/src/gallium/drivers/ilo/core/ilo_state_compute.c
new file mode 100644
index 00000000000..a5fe5e1a6b0
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_compute.c
@@ -0,0 +1,435 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2012-2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Chia-I Wu <[email protected]>
+ */
+
+#include "ilo_debug.h"
+#include "ilo_state_compute.h"
+
+/* URB space partitioning for GPGPU; all counts are in 256-bit entries */
+struct compute_urb_configuration {
+ int idrt_entry_count; /* reserved for interface descriptors */
+ int curbe_entry_count; /* reserved for CURBE data */
+
+ int urb_entry_count;
+ /* in 256-bit register increments */
+ int urb_entry_size;
+};
+
+/* Return the ROB capacity, in 256-bit entries, for the given device/GT. */
+static int
+get_gen6_rob_entry_count(const struct ilo_dev *dev)
+{
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ /*
+ * From the Ivy Bridge PRM, volume 2 part 2, page 60:
+ *
+ * "ROB has 64KB of storage; 2048 entries."
+ *
+ * From the valid ranges of "CURBE Allocation Size", we can also conclude
+ * that interface entries and CURBE data must be in ROB. And that ROB
+ * should be 16KB, or 512 entries, on Gen7 GT1.
+ */
+ if (ilo_dev_gen(dev) >= ILO_GEN(7.5))
+ return 2048;
+ else if (ilo_dev_gen(dev) >= ILO_GEN(7))
+ return (dev->gt == 2) ? 2048 : 512;
+ else
+ return (dev->gt == 2) ? 2048 : 1024;
+}
+
+/* Return how many 256-bit URB entries are reserved for the IDRT. */
+static int
+get_gen6_idrt_entry_count(const struct ilo_dev *dev)
+{
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ /*
+ * From the Ivy Bridge PRM, volume 2 part 2, page 21:
+ *
+ * "The first 32 URB entries are reserved for the interface
+ * descriptor..."
+ *
+ * From the Haswell PRM, volume 7, page 836:
+ *
+ * "The first 64 URB entries are reserved for the interface
+ * description..."
+ */
+ return (ilo_dev_gen(dev) >= ILO_GEN(7.5)) ? 64 : 32;
+}
+
+/* Convert a CURBE size in bytes to 256-bit entries, rounding up, and
+ * check that IDRT + CURBE still fit in the ROB. */
+static int
+get_gen6_curbe_entry_count(const struct ilo_dev *dev, uint32_t curbe_size)
+{
+ /*
+ * From the Ivy Bridge PRM, volume 2 part 2, page 21:
+ *
+ * "(CURBE Allocation Size) Specifies the total length allocated for
+ * CURBE, in 256-bit register increments.
+ */
+ const int entry_count = (curbe_size + 31) / 32;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ assert(get_gen6_idrt_entry_count(dev) + entry_count <=
+ get_gen6_rob_entry_count(dev));
+
+ return entry_count;
+}
+
+/* Fill \p urb with the URB partitioning implied by \p info and assert the
+ * total fits in the L3 URB allocation. */
+static bool
+compute_get_gen6_urb_configuration(const struct ilo_dev *dev,
+ const struct ilo_state_compute_info *info,
+ struct compute_urb_configuration *urb)
+{
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ urb->idrt_entry_count = get_gen6_idrt_entry_count(dev);
+ urb->curbe_entry_count =
+ get_gen6_curbe_entry_count(dev, info->curbe_alloc_size);
+
+ /*
+ * From the Broadwell PRM, volume 2b, page 451:
+ *
+ * "Please note that 0 is not allowed for this field (Number of URB
+ * Entries)."
+ */
+ urb->urb_entry_count = (ilo_dev_gen(dev) >= ILO_GEN(8)) ? 1 : 0;
+
+ /*
+ * From the Ivy Bridge PRM, volume 2 part 2, page 52:
+ *
+ * "(URB Entry Allocation Size) Specifies the length of each URB entry
+ * used by the unit, in 256-bit register increments - 1."
+ */
+ urb->urb_entry_size = 1;
+
+ /*
+ * From the Ivy Bridge PRM, volume 2 part 2, page 22:
+ *
+ * MEDIA_VFE_STATE specifies the amount of CURBE space, the URB handle
+ * size and the number of URB handles. The driver must ensure that
+ * ((URB_handle_size * URB_num_handle) - CURBE - 32) <=
+ * URB_allocation_in_L3."
+ */
+ assert(urb->idrt_entry_count + urb->curbe_entry_count +
+ urb->urb_entry_count * urb->urb_entry_size <=
+ info->cv_urb_alloc_size / 32);
+
+ return true;
+}
+
+/* Return the index (in 256-bit entries) one past the last CURBE entry read
+ * by this interface, or 0 when it reads no CURBE data at all. */
+static int
+compute_interface_get_gen6_read_end(const struct ilo_dev *dev,
+ const struct ilo_state_compute_interface_info *interface)
+{
+ const int per_thread_read = (interface->curbe_read_length + 31) / 32;
+ const int cross_thread_read =
+ (interface->cross_thread_curbe_read_length + 31) / 32;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ assert(interface->curbe_read_offset % 32 == 0);
+
+ /*
+ * From the Ivy Bridge PRM, volume 2 part 2, page 60:
+ *
+ * "(Constant URB Entry Read Length) [0,63]"
+ */
+ assert(per_thread_read <= 63);
+
+ /* From the Haswell PRM, volume 2d, page 199:
+ *
+ * "(Cross-Thread Constant Data Read Length) [0,127]"
+ */
+ if (ilo_dev_gen(dev) >= ILO_GEN(7.5))
+ assert(cross_thread_read <= 127);
+ else
+ assert(!cross_thread_read);
+
+ if (per_thread_read || cross_thread_read) {
+ /* each of the thread_group_size threads reads per_thread_read entries */
+ return interface->curbe_read_offset / 32 + cross_thread_read +
+ per_thread_read * interface->thread_group_size;
+ } else {
+ return 0;
+ }
+}
+
+/* Sanity-check \p info against the URB configuration: interface count must
+ * fit the IDRT and the CURBE reads must fit the CURBE allocation. */
+static bool
+compute_validate_gen6(const struct ilo_dev *dev,
+ const struct ilo_state_compute_info *info,
+ const struct compute_urb_configuration *urb)
+{
+ int min_curbe_entry_count;
+ uint8_t i;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ assert(info->interface_count <= urb->idrt_entry_count);
+
+ /* the CURBE allocation must cover the largest read range of any interface */
+ min_curbe_entry_count = 0;
+ for (i = 0; i < info->interface_count; i++) {
+ const int read_end =
+ compute_interface_get_gen6_read_end(dev, &info->interfaces[i]);
+
+ if (min_curbe_entry_count < read_end)
+ min_curbe_entry_count = read_end;
+ }
+
+ assert(min_curbe_entry_count <= urb->curbe_entry_count);
+
+ /*
+ * From the Broadwell PRM, volume 2b, page 452:
+ *
+ * "CURBE Allocation Size should be 0 for GPGPU workloads that uses
+ * indirect instead of CURBE."
+ */
+ if (!min_curbe_entry_count)
+ assert(!urb->curbe_entry_count);
+
+ return true;
+}
+
+/* Encode the largest per-thread scratch size of all interfaces into the
+ * gen-specific "Per Thread Scratch Space" field value. */
+static uint8_t
+compute_get_gen6_scratch_space(const struct ilo_dev *dev,
+ const struct ilo_state_compute_info *info)
+{
+ uint32_t scratch_size = 0;
+ uint8_t i;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ for (i = 0; i < info->interface_count; i++) {
+ if (scratch_size < info->interfaces[i].scratch_size)
+ scratch_size = info->interfaces[i].scratch_size;
+ }
+
+ if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
+ assert(scratch_size <= 2 * 1024 * 1024);
+
+ /* next power of two, starting from 1KB */
+ return (scratch_size > 1024) ?
+ (util_last_bit(scratch_size - 1) - 10): 0;
+ } else if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) {
+ assert(scratch_size <= 2 * 1024 * 1024);
+
+ /* next power of two, starting from 2KB */
+ return (scratch_size > 2048) ?
+ (util_last_bit(scratch_size - 1) - 11): 0;
+ } else {
+ assert(scratch_size <= 12 * 1024);
+
+ /* Gen6 encodes linearly in 1KB steps rather than as a power of two */
+ return (scratch_size > 1024) ?
+ (scratch_size - 1) / 1024 : 0;
+ }
+}
+
+/* Pack the variable DWs of MEDIA_VFE_STATE into compute->vfe[].
+ * NOTE(review): only dw1/dw2/dw4 are stored; the remaining dwords are
+ * presumably filled in at emit time — confirm against the emit path. */
+static bool
+compute_set_gen6_MEDIA_VFE_STATE(struct ilo_state_compute *compute,
+ const struct ilo_dev *dev,
+ const struct ilo_state_compute_info *info)
+{
+ struct compute_urb_configuration urb;
+ uint8_t scratch_space;
+
+ uint32_t dw1, dw2, dw4;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ if (!compute_get_gen6_urb_configuration(dev, info, &urb) ||
+ !compute_validate_gen6(dev, info, &urb))
+ return false;
+
+ scratch_space = compute_get_gen6_scratch_space(dev, info);
+
+ dw1 = scratch_space << GEN6_VFE_DW1_SCRATCH_SPACE_PER_THREAD__SHIFT;
+ dw2 = (dev->thread_count - 1) << GEN6_VFE_DW2_MAX_THREADS__SHIFT |
+ urb.urb_entry_count << GEN6_VFE_DW2_URB_ENTRY_COUNT__SHIFT |
+ GEN6_VFE_DW2_RESET_GATEWAY_TIMER |
+ GEN6_VFE_DW2_BYPASS_GATEWAY_CONTROL;
+
+ /* GPGPU mode bit exists only on Gen7/7.5 */
+ if (ilo_dev_gen(dev) >= ILO_GEN(7) && ilo_dev_gen(dev) <= ILO_GEN(7.5))
+ dw2 |= GEN7_VFE_DW2_GPGPU_MODE;
+
+ assert(urb.urb_entry_size);
+
+ /* hardware expects size minus one */
+ dw4 = (urb.urb_entry_size - 1) << GEN6_VFE_DW4_URB_ENTRY_SIZE__SHIFT |
+ urb.curbe_entry_count << GEN6_VFE_DW4_CURBE_SIZE__SHIFT;
+
+ STATIC_ASSERT(ARRAY_SIZE(compute->vfe) >= 3);
+ compute->vfe[0] = dw1;
+ compute->vfe[1] = dw2;
+ compute->vfe[2] = dw4;
+
+ return true;
+}
+
+/* Encode the sampler count in groups of four, clamped to the field max of 4. */
+static uint8_t
+compute_interface_get_gen6_sampler_count(const struct ilo_dev *dev,
+ const struct ilo_state_compute_interface_info *interface)
+{
+ ILO_DEV_ASSERT(dev, 6, 8);
+ return (interface->sampler_count <= 12) ?
+ (interface->sampler_count + 3) / 4 : 4;
+}
+
+/* Clamp the binding table size to the field maximum of 31 entries. */
+static uint8_t
+compute_interface_get_gen6_surface_count(const struct ilo_dev *dev,
+ const struct ilo_state_compute_interface_info *interface)
+{
+ ILO_DEV_ASSERT(dev, 6, 8);
+ return (interface->surface_count <= 31) ? interface->surface_count : 31;
+}
+
+/* Encode the SLM size as a power-of-two count of 4KB blocks (Gen7+). */
+static uint8_t
+compute_interface_get_gen7_slm_size(const struct ilo_dev *dev,
+ const struct ilo_state_compute_interface_info *interface)
+{
+ ILO_DEV_ASSERT(dev, 7, 8);
+
+ /*
+ * From the Ivy Bridge PRM, volume 2 part 2, page 61:
+ *
+ * "The amount is specified in 4k blocks, but only powers of 2 are
+ * allowed: 0, 4k, 8k, 16k, 32k and 64k per half-slice."
+ */
+ assert(interface->slm_size <= 64 * 1024);
+
+ return util_next_power_of_two((interface->slm_size + 4095) / 4096);
+}
+
+/* Pack one INTERFACE_DESCRIPTOR_DATA entry per interface into compute->idrt.
+ * Each entry stores six dwords (dw0 and dw2..dw6); the descriptor's second
+ * dword is not kept here. */
+static bool
+compute_set_gen6_INTERFACE_DESCRIPTOR_DATA(struct ilo_state_compute *compute,
+ const struct ilo_dev *dev,
+ const struct ilo_state_compute_info *info)
+{
+ uint8_t i;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ for (i = 0; i < info->interface_count; i++) {
+ const struct ilo_state_compute_interface_info *interface =
+ &info->interfaces[i];
+ uint16_t read_offset, per_thread_read_len, cross_thread_read_len;
+ uint8_t sampler_count, surface_count;
+ uint32_t dw0, dw2, dw3, dw4, dw5, dw6;
+
+ assert(interface->kernel_offset % 64 == 0);
+ assert(interface->thread_group_size);
+
+ /* convert byte counts to 256-bit (32-byte) units, rounding up */
+ read_offset = interface->curbe_read_offset / 32;
+ per_thread_read_len = (interface->curbe_read_length + 31) / 32;
+ cross_thread_read_len =
+ (interface->cross_thread_curbe_read_length + 31) / 32;
+
+ sampler_count =
+ compute_interface_get_gen6_sampler_count(dev, interface);
+ surface_count =
+ compute_interface_get_gen6_surface_count(dev, interface);
+
+ dw0 = interface->kernel_offset;
+ dw2 = sampler_count << GEN6_IDRT_DW2_SAMPLER_COUNT__SHIFT;
+ dw3 = surface_count << GEN6_IDRT_DW3_BINDING_TABLE_SIZE__SHIFT;
+ dw4 = per_thread_read_len << GEN6_IDRT_DW4_CURBE_READ_LEN__SHIFT |
+ read_offset << GEN6_IDRT_DW4_CURBE_READ_OFFSET__SHIFT;
+
+ dw5 = 0;
+ dw6 = 0;
+ if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+ const uint8_t slm_size =
+ compute_interface_get_gen7_slm_size(dev, interface);
+
+ dw5 |= GEN7_IDRT_DW5_ROUNDING_MODE_RTNE;
+
+ /* barriers are enabled together with SLM */
+ if (slm_size) {
+ dw5 |= GEN7_IDRT_DW5_BARRIER_ENABLE |
+ slm_size << GEN7_IDRT_DW5_SLM_SIZE__SHIFT;
+ }
+
+ /*
+ * From the Haswell PRM, volume 2d, page 199:
+ *
+ * "(Number of Threads in GPGPU Thread Group) Specifies the
+ * number of threads that are in this thread group. Used to
+ * program the barrier for the number of messages to expect. The
+ * minimum value is 0 (which will disable the barrier), while
+ * the maximum value is the number of threads in a subslice for
+ * local barriers."
+ *
+ * From the Broadwell PRM, volume 2d, page 183:
+ *
+ * "(Number of Threads in GPGPU Thread Group) Specifies the
+ * number of threads that are in this thread group. The minimum
+ * value is 1, while the maximum value is the number of threads
+ * in a subslice for local barriers. See vol1b Configurations
+ * for the number of threads per subslice for different
+ * products. The maximum value for global barriers is limited
+ * by the number of threads in the system, or by 511, whichever
+ * is lower. This field should not be set to 0 even if the
+ * barrier is disabled, since an accurate value is needed for
+ * proper pre-emption."
+ */
+ if (slm_size || ilo_dev_gen(dev) >= ILO_GEN(8)) {
+ dw5 |= interface->thread_group_size <<
+ GEN7_IDRT_DW5_THREAD_GROUP_SIZE__SHIFT;
+ }
+
+ if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) {
+ dw6 |= cross_thread_read_len <<
+ GEN75_IDRT_DW6_CROSS_THREAD_CURBE_READ_LEN__SHIFT;
+ }
+ }
+
+ STATIC_ASSERT(ARRAY_SIZE(compute->idrt[i]) >= 6);
+ compute->idrt[i][0] = dw0;
+ compute->idrt[i][1] = dw2;
+ compute->idrt[i][2] = dw3;
+ compute->idrt[i][3] = dw4;
+ compute->idrt[i][4] = dw5;
+ compute->idrt[i][5] = dw6;
+ }
+
+ return true;
+}
+
+/* Initialize \p compute from \p info.  The IDRT dwords are written into the
+ * caller-provided info->data, which must be zeroed and at least
+ * ilo_state_compute_data_size() bytes.
+ * NOTE(review): compute->idrt_count is never written here — confirm whether
+ * callers are expected to rely on it. */
+bool
+ilo_state_compute_init(struct ilo_state_compute *compute,
+ const struct ilo_dev *dev,
+ const struct ilo_state_compute_info *info)
+{
+ bool ret = true;
+
+ assert(ilo_is_zeroed(compute, sizeof(*compute)));
+ assert(ilo_is_zeroed(info->data, info->data_size));
+
+ assert(ilo_state_compute_data_size(dev, info->interface_count) <=
+ info->data_size);
+ compute->idrt = (uint32_t (*)[6]) info->data;
+
+ ret &= compute_set_gen6_MEDIA_VFE_STATE(compute, dev, info);
+ ret &= compute_set_gen6_INTERFACE_DESCRIPTOR_DATA(compute, dev, info);
+
+ assert(ret);
+
+ return ret;
+}
diff --git a/src/gallium/drivers/ilo/core/ilo_state_compute.h b/src/gallium/drivers/ilo/core/ilo_state_compute.h
new file mode 100644
index 00000000000..346f7b617f4
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_compute.h
@@ -0,0 +1,92 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Chia-I Wu <[email protected]>
+ */
+
+#ifndef ILO_STATE_COMPUTE_H
+#define ILO_STATE_COMPUTE_H
+
+#include "genhw/genhw.h"
+
+#include "ilo_core.h"
+#include "ilo_dev.h"
+
+/*
+ * From the Haswell PRM, volume 7, page 836:
+ *
+ * "The first 64 URB entries are reserved for the interface
+ * description..."
+ */
+#define ILO_STATE_COMPUTE_MAX_INTERFACE_COUNT 64
+
+/* one compute kernel entry point (interface descriptor) */
+struct ilo_state_compute_interface_info {
+ /* usually 0 unless there are multiple interfaces */
+ uint32_t kernel_offset;
+
+ uint32_t scratch_size; /* per-thread scratch, in bytes */
+
+ uint8_t sampler_count;
+ uint8_t surface_count;
+
+ uint16_t thread_group_size;
+ uint32_t slm_size; /* shared local memory, in bytes */
+
+ /* CURBE read window, in bytes */
+ uint16_t curbe_read_offset;
+ uint16_t curbe_read_length;
+ uint16_t cross_thread_curbe_read_length;
+};
+
+struct ilo_state_compute_info {
+ /* caller-owned storage for the packed IDRT; see
+ * ilo_state_compute_data_size() */
+ void *data;
+ size_t data_size;
+
+ const struct ilo_state_compute_interface_info *interfaces;
+ uint8_t interface_count;
+
+ uint32_t cv_urb_alloc_size;
+ uint32_t curbe_alloc_size;
+};
+
+struct ilo_state_compute {
+ uint32_t vfe[3]; /* MEDIA_VFE_STATE dw1/dw2/dw4 */
+
+ uint32_t (*idrt)[6]; /* points into ilo_state_compute_info::data */
+ uint8_t idrt_count;
+};
+
+/* Bytes of caller-provided storage needed for \p interface_count IDRT
+ * entries.  The NULL pointer is only used inside sizeof (unevaluated). */
+static inline size_t
+ilo_state_compute_data_size(const struct ilo_dev *dev,
+ uint8_t interface_count)
+{
+ const struct ilo_state_compute *compute = NULL;
+ return sizeof(compute->idrt[0]) * interface_count;
+}
+
+bool
+ilo_state_compute_init(struct ilo_state_compute *compute,
+ const struct ilo_dev *dev,
+ const struct ilo_state_compute_info *info);
+
+#endif /* ILO_STATE_COMPUTE_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_state_raster.c b/src/gallium/drivers/ilo/core/ilo_state_raster.c
new file mode 100644
index 00000000000..ed64a1f0d3c
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_raster.c
@@ -0,0 +1,1252 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2012-2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Chia-I Wu <[email protected]>
+ */
+
+#include "ilo_debug.h"
+#include "ilo_state_raster.h"
+
+/* Sanity-check the clip info before packing 3DSTATE_CLIP. */
+static bool
+raster_validate_gen6_clip(const struct ilo_dev *dev,
+ const struct ilo_state_raster_info *info)
+{
+ const struct ilo_state_raster_clip_info *clip = &info->clip;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ assert(clip->viewport_count);
+
+ /*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 188:
+ *
+ * ""Clip Distance Cull Test Enable Bitmask" and "Clip Distance Clip
+ * Test Enable Bitmask" should not have overlapping bits in the mask,
+ * else the results are undefined."
+ */
+ assert(!(clip->user_cull_enables & clip->user_clip_enables));
+
+ /* pre-Gen9 has a single Z test enable for both near and far */
+ if (ilo_dev_gen(dev) < ILO_GEN(9))
+ assert(clip->z_near_enable == clip->z_far_enable);
+
+ return true;
+}
+
+/* Pack the variable DWs (dw1..dw3) of 3DSTATE_CLIP into rs->clip[]. */
+static bool
+raster_set_gen6_3DSTATE_CLIP(struct ilo_state_raster *rs,
+ const struct ilo_dev *dev,
+ const struct ilo_state_raster_info *info)
+{
+ const struct ilo_state_raster_clip_info *clip = &info->clip;
+ const struct ilo_state_raster_setup_info *setup = &info->setup;
+ const struct ilo_state_raster_tri_info *tri = &info->tri;
+ const struct ilo_state_raster_scan_info *scan = &info->scan;
+ uint32_t dw1, dw2, dw3;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ if (!raster_validate_gen6_clip(dev, info))
+ return false;
+
+ dw1 = clip->user_cull_enables << GEN6_CLIP_DW1_UCP_CULL_ENABLES__SHIFT;
+
+ if (clip->stats_enable)
+ dw1 |= GEN6_CLIP_DW1_STATISTICS;
+
+ if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+ /*
+ * From the Ivy Bridge PRM, volume 2 part 1, page 219:
+ *
+ * "Workaround : Due to Hardware issue "EarlyCull" needs to be
+ * enabled only for the cases where the incoming primitive topology
+ * into the clipper guaranteed to be Trilist."
+ *
+ * What does this mean?
+ */
+ dw1 |= GEN7_CLIP_DW1_SUBPIXEL_8BITS |
+ GEN7_CLIP_DW1_EARLY_CULL_ENABLE;
+
+ /* winding/cull moved out of 3DSTATE_CLIP on Gen8 */
+ if (ilo_dev_gen(dev) <= ILO_GEN(7.5)) {
+ dw1 |= tri->front_winding << GEN7_CLIP_DW1_FRONT_WINDING__SHIFT |
+ tri->cull_mode << GEN7_CLIP_DW1_CULL_MODE__SHIFT;
+ }
+ }
+
+ dw2 = clip->user_clip_enables << GEN6_CLIP_DW2_UCP_CLIP_ENABLES__SHIFT |
+ GEN6_CLIPMODE_NORMAL << GEN6_CLIP_DW2_CLIP_MODE__SHIFT;
+
+ if (clip->clip_enable)
+ dw2 |= GEN6_CLIP_DW2_CLIP_ENABLE;
+
+ /* D3D clips z in [0, 1], OpenGL in [-1, 1] */
+ if (clip->z_near_zero)
+ dw2 |= GEN6_CLIP_DW2_APIMODE_D3D;
+ else
+ dw2 |= GEN6_CLIP_DW2_APIMODE_OGL;
+
+ if (clip->xy_test_enable)
+ dw2 |= GEN6_CLIP_DW2_XY_TEST_ENABLE;
+
+ if (ilo_dev_gen(dev) < ILO_GEN(8) && clip->z_near_enable)
+ dw2 |= GEN6_CLIP_DW2_Z_TEST_ENABLE;
+
+ if (clip->gb_test_enable)
+ dw2 |= GEN6_CLIP_DW2_GB_TEST_ENABLE;
+
+ /* the clipper must produce noperspective barycentrics if the FS reads them */
+ if (scan->barycentric_interps & (GEN6_INTERP_NONPERSPECTIVE_PIXEL |
+ GEN6_INTERP_NONPERSPECTIVE_CENTROID |
+ GEN6_INTERP_NONPERSPECTIVE_SAMPLE))
+ dw2 |= GEN6_CLIP_DW2_NONPERSPECTIVE_BARYCENTRIC_ENABLE;
+
+ /* provoking vertex index per primitive type */
+ if (setup->first_vertex_provoking) {
+ dw2 |= 0 << GEN6_CLIP_DW2_TRI_PROVOKE__SHIFT |
+ 0 << GEN6_CLIP_DW2_LINE_PROVOKE__SHIFT |
+ 1 << GEN6_CLIP_DW2_TRIFAN_PROVOKE__SHIFT;
+ } else {
+ dw2 |= 2 << GEN6_CLIP_DW2_TRI_PROVOKE__SHIFT |
+ 1 << GEN6_CLIP_DW2_LINE_PROVOKE__SHIFT |
+ 2 << GEN6_CLIP_DW2_TRIFAN_PROVOKE__SHIFT;
+ }
+
+ /* point width limits in U3.7: 0x1 ~ 0.008, 0x7ff ~ 255.875 pixels */
+ dw3 = 0x1 << GEN6_CLIP_DW3_MIN_POINT_WIDTH__SHIFT |
+ 0x7ff << GEN6_CLIP_DW3_MAX_POINT_WIDTH__SHIFT |
+ (clip->viewport_count - 1) << GEN6_CLIP_DW3_MAX_VPINDEX__SHIFT;
+
+ if (clip->force_rtaindex_zero)
+ dw3 |= GEN6_CLIP_DW3_FORCE_RTAINDEX_ZERO;
+
+ STATIC_ASSERT(ARRAY_SIZE(rs->clip) >= 3);
+ rs->clip[0] = dw1;
+ rs->clip[1] = dw2;
+ rs->clip[2] = dw3;
+
+ return true;
+}
+
+/* Return whether antialiased lines are allowed given the current render
+ * targets and HiZ usage. */
+static bool
+raster_params_is_gen6_line_aa_allowed(const struct ilo_dev *dev,
+ const struct ilo_state_raster_params_info *params)
+{
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ /*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 251:
+ *
+ * "This field (Anti-aliasing Enable) must be disabled if any of the
+ * render targets have integer (UINT or SINT) surface format."
+ */
+ if (params->any_integer_rt)
+ return false;
+
+ /*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 321:
+ *
+ * "[DevSNB+]: This field (Hierarchical Depth Buffer Enable) must be
+ * disabled if Anti-aliasing Enable in 3DSTATE_SF is enabled.
+ */
+ if (ilo_dev_gen(dev) == ILO_GEN(6) && params->hiz_enable)
+ return false;
+
+ return true;
+}
+
+/* Copy info->line into \p line and clear the flags that are ineffective or
+ * disallowed under the current rasterization mode, per the PRM quotes below. */
+static void
+raster_get_gen6_effective_line(const struct ilo_dev *dev,
+ const struct ilo_state_raster_info *info,
+ struct ilo_state_raster_line_info *line)
+{
+ const struct ilo_state_raster_setup_info *setup = &info->setup;
+ const struct ilo_state_raster_params_info *params = &info->params;
+
+ *line = info->line;
+
+ /*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 251:
+ *
+ * "This field (Anti-aliasing Enable) is ignored when Multisample
+ * Rasterization Mode is MSRASTMODE_ON_xx."
+ *
+ * From the Sandy Bridge PRM, volume 2 part 1, page 251:
+ *
+ * "Setting a Line Width of 0.0 specifies the rasterization of the
+ * "thinnest" (one-pixel-wide), non-antialiased lines. Note that
+ * this effectively overrides the effect of AAEnable (though the
+ * AAEnable state variable is not modified). Lines rendered with
+ * zero Line Width are rasterized using GIQ (Grid Intersection
+ * Quantization) rules as specified by the GDI and Direct3D APIs."
+ *
+ * "Software must not program a value of 0.0 when running in
+ * MSRASTMODE_ON_xxx modes - zero-width lines are not available
+ * when multisampling rasterization is enabled."
+ *
+ * From the Sandy Bridge PRM, volume 2 part 1, page 294:
+ *
+ * "Line stipple, controlled via the Line Stipple Enable state variable
+ * in WM_STATE, discards certain pixels that are produced by non-AA
+ * line rasterization."
+ */
+ if (setup->line_msaa_enable ||
+ !raster_params_is_gen6_line_aa_allowed(dev, params))
+ line->aa_enable = false;
+ /* stipple and GIQ apply only to non-AA, non-MSAA lines */
+ if (setup->line_msaa_enable || line->aa_enable) {
+ line->stipple_enable = false;
+ line->giq_enable = false;
+ line->giq_last_pixel = false;
+ }
+}
+
+/* Sanity-check tri/setup state; despite the gen8 name this is asserted to
+ * run on Gen6 through Gen8. */
+static bool
+raster_validate_gen8_raster(const struct ilo_dev *dev,
+ const struct ilo_state_raster_info *info)
+{
+ const struct ilo_state_raster_setup_info *setup = &info->setup;
+ const struct ilo_state_raster_tri_info *tri = &info->tri;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ /*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 249:
+ *
+ * "This setting (SOLID) is required when rendering rectangle
+ * (RECTLIST) objects.
+ */
+ if (tri->fill_mode_front != GEN6_FILLMODE_SOLID ||
+ tri->fill_mode_back != GEN6_FILLMODE_SOLID)
+ assert(!setup->cv_is_rectangle);
+
+ return true;
+}
+
+/* Map the (line_msaa_enable, msaa_enable) pair to the hardware
+ * multisample rasterization mode. */
+static enum gen_msrast_mode
+raster_setup_get_gen6_msrast_mode(const struct ilo_dev *dev,
+ const struct ilo_state_raster_setup_info *setup)
+{
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ if (setup->line_msaa_enable) {
+ return (setup->msaa_enable) ? GEN6_MSRASTMODE_ON_PATTERN :
+ GEN6_MSRASTMODE_ON_PIXEL;
+ } else {
+ return (setup->msaa_enable) ? GEN6_MSRASTMODE_OFF_PATTERN :
+ GEN6_MSRASTMODE_OFF_PIXEL;
+ }
+}
+
+/* Convert a float line width to the hardware U3.7 fixed-point field,
+ * widening AA lines by a pixel and mapping 1.0 to 0 for GIQ lines. */
+static int
+get_gen6_line_width(const struct ilo_dev *dev, float fwidth,
+ bool line_aa_enable, bool line_giq_enable)
+{
+ int line_width;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ /* in U3.7 */
+ line_width = (int) (fwidth * 128.0f + 0.5f);
+
+ /*
+ * Smooth lines should intersect ceil(line_width) or (ceil(line_width) + 1)
+ * pixels in the minor direction. We have to make the lines slightly
+ * thicker, 0.5 pixel on both sides, so that they intersect that many
+ * pixels.
+ */
+ if (line_aa_enable)
+ line_width += 128; /* +1.0 in U3.7 */
+
+ line_width = CLAMP(line_width, 1, 1023);
+
+ /* a zero width selects one-pixel-wide GIQ rasterization */
+ if (line_giq_enable && line_width == 128)
+ line_width = 0;
+
+ return line_width;
+}
+
+/* Convert a float point width to the hardware U8.3 fixed-point field. */
+static int
+get_gen6_point_width(const struct ilo_dev *dev, float fwidth)
+{
+ int point_width;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ /* in U8.3 */
+ point_width = (int) (fwidth * 8.0f + 0.5f);
+ point_width = CLAMP(point_width, 1, 2047);
+
+ return point_width;
+}
+
+/*
+ * Pack 3DSTATE_SF, and the depth-offset part of rs->raster[], for Gen6
+ * through Gen7.5.  Despite the gen7 name, the Gen6 layout is handled here
+ * too (note the 6..7.5 dev assert).
+ */
+static bool
+raster_set_gen7_3DSTATE_SF(struct ilo_state_raster *rs,
+                           const struct ilo_dev *dev,
+                           const struct ilo_state_raster_info *info,
+                           const struct ilo_state_raster_line_info *line)
+{
+   const struct ilo_state_raster_clip_info *clip = &info->clip;
+   const struct ilo_state_raster_setup_info *setup = &info->setup;
+   const struct ilo_state_raster_point_info *point = &info->point;
+   const struct ilo_state_raster_tri_info *tri = &info->tri;
+   const struct ilo_state_raster_params_info *params = &info->params;
+   const enum gen_msrast_mode msrast =
+      raster_setup_get_gen6_msrast_mode(dev, setup);
+   const int line_width = get_gen6_line_width(dev, params->line_width,
+                                              line->aa_enable, line->giq_enable);
+   const int point_width = get_gen6_point_width(dev, params->point_width);
+   uint32_t dw1, dw2, dw3;
+
+   ILO_DEV_ASSERT(dev, 6, 7.5);
+
+   if (!raster_validate_gen8_raster(dev, info))
+      return false;
+
+   dw1 = tri->fill_mode_front << GEN7_SF_DW1_FILL_MODE_FRONT__SHIFT |
+         tri->fill_mode_back << GEN7_SF_DW1_FILL_MODE_BACK__SHIFT |
+         tri->front_winding << GEN7_SF_DW1_FRONT_WINDING__SHIFT;
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(7) && ilo_dev_gen(dev) <= ILO_GEN(7.5)) {
+      enum gen_depth_format format;
+
+      /* do it here as we want 0x0 to be valid */
+      /* map combined depth/stencil formats to their depth-only equivalents */
+      switch (tri->depth_offset_format) {
+      case GEN6_ZFORMAT_D32_FLOAT_S8X24_UINT:
+         format = GEN6_ZFORMAT_D32_FLOAT;
+         break;
+      case GEN6_ZFORMAT_D24_UNORM_S8_UINT:
+         format = GEN6_ZFORMAT_D24_UNORM_X8_UINT;
+         break;
+      default:
+         format = tri->depth_offset_format;
+         break;
+      }
+
+      dw1 |= format << GEN7_SF_DW1_DEPTH_FORMAT__SHIFT;
+   }
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 248:
+    *
+    *   "This bit (Statistics Enable) should be set whenever clipping is
+    *    enabled and the Statistics Enable bit is set in CLIP_STATE. It
+    *    should be cleared if clipping is disabled or Statistics Enable in
+    *    CLIP_STATE is clear."
+    */
+   if (clip->stats_enable && clip->clip_enable)
+      dw1 |= GEN7_SF_DW1_STATISTICS;
+
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 1, page 258:
+    *
+    *   "This bit (Legacy Global Depth Bias Enable, Global Depth Offset
+    *    Enable Solid , Global Depth Offset Enable Wireframe, and Global
+    *    Depth Offset Enable Point) should be set whenever non zero depth
+    *    bias (Slope, Bias) values are used. Setting this bit may have some
+    *    degradation of performance for some workloads."
+    *
+    * But it seems fine to ignore that.
+    */
+   if (tri->depth_offset_solid)
+      dw1 |= GEN7_SF_DW1_DEPTH_OFFSET_SOLID;
+   if (tri->depth_offset_wireframe)
+      dw1 |= GEN7_SF_DW1_DEPTH_OFFSET_WIREFRAME;
+   if (tri->depth_offset_point)
+      dw1 |= GEN7_SF_DW1_DEPTH_OFFSET_POINT;
+
+   if (setup->viewport_transform)
+      dw1 |= GEN7_SF_DW1_VIEWPORT_TRANSFORM;
+
+   dw2 = tri->cull_mode << GEN7_SF_DW2_CULL_MODE__SHIFT |
+         line_width << GEN7_SF_DW2_LINE_WIDTH__SHIFT |
+         GEN7_SF_DW2_AA_LINE_CAP_1_0 |
+         msrast << GEN7_SF_DW2_MSRASTMODE__SHIFT;
+
+   if (line->aa_enable)
+      dw2 |= GEN7_SF_DW2_AA_LINE_ENABLE;
+
+   if (ilo_dev_gen(dev) == ILO_GEN(7.5) && line->stipple_enable)
+      dw2 |= GEN75_SF_DW2_LINE_STIPPLE_ENABLE;
+
+   if (setup->scissor_enable)
+      dw2 |= GEN7_SF_DW2_SCISSOR_ENABLE;
+
+   dw3 = GEN7_SF_DW3_TRUE_AA_LINE_DISTANCE |
+         GEN7_SF_DW3_SUBPIXEL_8BITS;
+
+   /* this has no effect when line_width != 0 */
+   if (line->giq_last_pixel)
+      dw3 |= GEN7_SF_DW3_LINE_LAST_PIXEL_ENABLE;
+
+   /* provoking vertex indices per primitive type */
+   if (setup->first_vertex_provoking) {
+      dw3 |= 0 << GEN7_SF_DW3_TRI_PROVOKE__SHIFT |
+             0 << GEN7_SF_DW3_LINE_PROVOKE__SHIFT |
+             1 << GEN7_SF_DW3_TRIFAN_PROVOKE__SHIFT;
+   } else {
+      dw3 |= 2 << GEN7_SF_DW3_TRI_PROVOKE__SHIFT |
+             1 << GEN7_SF_DW3_LINE_PROVOKE__SHIFT |
+             2 << GEN7_SF_DW3_TRIFAN_PROVOKE__SHIFT;
+   }
+
+   /* setup->point_aa_enable is ignored */
+   if (!point->programmable_width) {
+      dw3 |= GEN7_SF_DW3_USE_POINT_WIDTH |
+             point_width << GEN7_SF_DW3_POINT_WIDTH__SHIFT;
+   }
+
+   STATIC_ASSERT(ARRAY_SIZE(rs->sf) >= 3);
+   rs->sf[0] = dw1;
+   rs->sf[1] = dw2;
+   rs->sf[2] = dw3;
+
+   /* pre-Gen8, depth offsets are emitted with SF; raster[0] is unused */
+   STATIC_ASSERT(ARRAY_SIZE(rs->raster) >= 4);
+   rs->raster[0] = 0;
+   rs->raster[1] = fui(params->depth_offset_const);
+   rs->raster[2] = fui(params->depth_offset_scale);
+   rs->raster[3] = fui(params->depth_offset_clamp);
+
+   rs->line_aa_enable = line->aa_enable;
+   rs->line_giq_enable = line->giq_enable;
+
+   return true;
+}
+
+/*
+ * Pack 3DSTATE_SF for Gen8.  On Gen8 only line/point/provoking-vertex state
+ * lives in SF; triangle state moved to 3DSTATE_RASTER (see
+ * raster_set_gen8_3DSTATE_RASTER).  Fills rs->sf[0..2].
+ */
+static bool
+raster_set_gen8_3DSTATE_SF(struct ilo_state_raster *rs,
+                           const struct ilo_dev *dev,
+                           const struct ilo_state_raster_info *info,
+                           const struct ilo_state_raster_line_info *line)
+{
+   const struct ilo_state_raster_clip_info *clip = &info->clip;
+   const struct ilo_state_raster_setup_info *setup = &info->setup;
+   const struct ilo_state_raster_point_info *point = &info->point;
+   const struct ilo_state_raster_params_info *params = &info->params;
+   const int line_width = get_gen6_line_width(dev, params->line_width,
+                                              line->aa_enable, line->giq_enable);
+   const int point_width = get_gen6_point_width(dev, params->point_width);
+   uint32_t dw1, dw2, dw3;
+
+   ILO_DEV_ASSERT(dev, 8, 8);
+
+   dw1 = 0;
+
+   if (clip->stats_enable && clip->clip_enable)
+      dw1 |= GEN7_SF_DW1_STATISTICS;
+
+   if (setup->viewport_transform)
+      dw1 |= GEN7_SF_DW1_VIEWPORT_TRANSFORM;
+
+   dw2 = line_width << GEN7_SF_DW2_LINE_WIDTH__SHIFT |
+         GEN7_SF_DW2_AA_LINE_CAP_1_0;
+
+   dw3 = GEN7_SF_DW3_TRUE_AA_LINE_DISTANCE |
+         GEN7_SF_DW3_SUBPIXEL_8BITS;
+
+   /* this has no effect when line_width != 0 */
+   if (line->giq_last_pixel)
+      dw3 |= GEN7_SF_DW3_LINE_LAST_PIXEL_ENABLE;
+
+   /* provoking vertex indices per primitive type */
+   if (setup->first_vertex_provoking) {
+      dw3 |= 0 << GEN7_SF_DW3_TRI_PROVOKE__SHIFT |
+             0 << GEN7_SF_DW3_LINE_PROVOKE__SHIFT |
+             1 << GEN7_SF_DW3_TRIFAN_PROVOKE__SHIFT;
+   } else {
+      dw3 |= 2 << GEN7_SF_DW3_TRI_PROVOKE__SHIFT |
+             1 << GEN7_SF_DW3_LINE_PROVOKE__SHIFT |
+             2 << GEN7_SF_DW3_TRIFAN_PROVOKE__SHIFT;
+   }
+
+   /* fixed point width from params unless the shader writes it */
+   if (!point->programmable_width) {
+      dw3 |= GEN7_SF_DW3_USE_POINT_WIDTH |
+             point_width << GEN7_SF_DW3_POINT_WIDTH__SHIFT;
+   }
+
+   STATIC_ASSERT(ARRAY_SIZE(rs->sf) >= 3);
+   rs->sf[0] = dw1;
+   rs->sf[1] = dw2;
+   rs->sf[2] = dw3;
+
+   return true;
+}
+
+/*
+ * Pack 3DSTATE_RASTER for Gen8: triangle winding/cull/fill, depth offset
+ * enables, MSAA and scissor enables.  Fills rs->raster[0..3] and caches the
+ * line AA/GIQ enables for later ilo_state_raster_set_params() patching.
+ */
+static bool
+raster_set_gen8_3DSTATE_RASTER(struct ilo_state_raster *rs,
+                               const struct ilo_dev *dev,
+                               const struct ilo_state_raster_info *info,
+                               const struct ilo_state_raster_line_info *line)
+{
+   const struct ilo_state_raster_clip_info *clip = &info->clip;
+   const struct ilo_state_raster_setup_info *setup = &info->setup;
+   const struct ilo_state_raster_point_info *point = &info->point;
+   const struct ilo_state_raster_tri_info *tri = &info->tri;
+   const struct ilo_state_raster_params_info *params = &info->params;
+   uint32_t dw1;
+
+   ILO_DEV_ASSERT(dev, 8, 8);
+
+   if (!raster_validate_gen8_raster(dev, info))
+      return false;
+
+   dw1 = tri->front_winding << GEN8_RASTER_DW1_FRONT_WINDING__SHIFT |
+         tri->cull_mode << GEN8_RASTER_DW1_CULL_MODE__SHIFT |
+         tri->fill_mode_front << GEN8_RASTER_DW1_FILL_MODE_FRONT__SHIFT |
+         tri->fill_mode_back << GEN8_RASTER_DW1_FILL_MODE_BACK__SHIFT;
+
+   if (point->aa_enable)
+      dw1 |= GEN8_RASTER_DW1_SMOOTH_POINT_ENABLE;
+
+   /* where should line_msaa_enable be set? */
+   if (setup->msaa_enable)
+      dw1 |= GEN8_RASTER_DW1_API_MULTISAMPLE_ENABLE;
+
+   if (tri->depth_offset_solid)
+      dw1 |= GEN8_RASTER_DW1_DEPTH_OFFSET_SOLID;
+   if (tri->depth_offset_wireframe)
+      dw1 |= GEN8_RASTER_DW1_DEPTH_OFFSET_WIREFRAME;
+   if (tri->depth_offset_point)
+      dw1 |= GEN8_RASTER_DW1_DEPTH_OFFSET_POINT;
+
+   if (line->aa_enable)
+      dw1 |= GEN8_RASTER_DW1_AA_LINE_ENABLE;
+
+   if (setup->scissor_enable)
+      dw1 |= GEN8_RASTER_DW1_SCISSOR_ENABLE;
+
+   /* NOTE(review): the Gen9 branch is unreachable given the (8, 8) dev
+    * assert above; presumably kept for future Gen9 bring-up -- confirm
+    * before relying on it
+    */
+   if (ilo_dev_gen(dev) >= ILO_GEN(9)) {
+      if (clip->z_far_enable)
+         dw1 |= GEN9_RASTER_DW1_Z_TEST_FAR_ENABLE;
+      if (clip->z_near_enable)
+         dw1 |= GEN9_RASTER_DW1_Z_TEST_NEAR_ENABLE;
+   } else {
+      if (clip->z_near_enable)
+         dw1 |= GEN8_RASTER_DW1_Z_TEST_ENABLE;
+   }
+
+   STATIC_ASSERT(ARRAY_SIZE(rs->raster) >= 4);
+   rs->raster[0] = dw1;
+   rs->raster[1] = fui(params->depth_offset_const);
+   rs->raster[2] = fui(params->depth_offset_scale);
+   rs->raster[3] = fui(params->depth_offset_clamp);
+
+   rs->line_aa_enable = line->aa_enable;
+   rs->line_giq_enable = line->giq_enable;
+
+   return true;
+}
+
+/**
+ * Translate an MSAA sample count into the GEN*_NUMSAMPLES_* encoding,
+ * asserting that the count is supported by the device generation.
+ * Unexpected counts assert and fall back to 1x.
+ */
+static enum gen_sample_count
+get_gen6_sample_count(const struct ilo_dev *dev, uint8_t sample_count)
+{
+   enum gen_sample_count c;
+   int min_gen;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   switch (sample_count) {
+   case 1:
+      c = GEN6_NUMSAMPLES_1;
+      min_gen = ILO_GEN(6);
+      break;
+   case 2:
+      c = GEN8_NUMSAMPLES_2;
+      min_gen = ILO_GEN(8);
+      break;
+   case 4:
+      c = GEN6_NUMSAMPLES_4;
+      min_gen = ILO_GEN(6);
+      break;
+   case 8:
+      c = GEN7_NUMSAMPLES_8;
+      min_gen = ILO_GEN(7);
+      break;
+   case 16:
+      c = GEN8_NUMSAMPLES_16;
+      min_gen = ILO_GEN(8);
+      break;
+   default:
+      assert(!"unexpected sample count");
+      c = GEN6_NUMSAMPLES_1;
+      /* fix: min_gen was left uninitialized on this path, so the assert
+       * below read an indeterminate value (undefined behavior)
+       */
+      min_gen = ILO_GEN(6);
+      break;
+   }
+
+   assert(ilo_dev_gen(dev) >= min_gen);
+
+   return c;
+}
+
+/*
+ * Pack 3DSTATE_MULTISAMPLE into rs->sample[0].  Despite the gen8 name this
+ * covers Gen6 through Gen8 (see the dev assert).
+ */
+static bool
+raster_set_gen8_3DSTATE_MULTISAMPLE(struct ilo_state_raster *rs,
+                                    const struct ilo_dev *dev,
+                                    const struct ilo_state_raster_info *info)
+{
+   const struct ilo_state_raster_setup_info *setup = &info->setup;
+   const struct ilo_state_raster_scan_info *scan = &info->scan;
+   const enum gen_sample_count count =
+      get_gen6_sample_count(dev, scan->sample_count);
+   uint32_t dw1;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 307:
+    *
+    *   "Setting Multisample Rasterization Mode to MSRASTMODE_xxx_PATTERN
+    *    when Number of Multisamples == NUMSAMPLES_1 is UNDEFINED."
+    */
+   if (setup->msaa_enable)
+      assert(scan->sample_count > 1);
+
+   dw1 = scan->pixloc << GEN6_MULTISAMPLE_DW1_PIXEL_LOCATION__SHIFT |
+         count << GEN6_MULTISAMPLE_DW1_NUM_SAMPLES__SHIFT;
+
+   STATIC_ASSERT(ARRAY_SIZE(rs->sample) >= 1);
+   rs->sample[0] = dw1;
+
+   return true;
+}
+
+/*
+ * Pack 3DSTATE_SAMPLE_MASK into rs->sample[1], masking off bits above the
+ * active sample count as the PRM requires.
+ */
+static bool
+raster_set_gen6_3DSTATE_SAMPLE_MASK(struct ilo_state_raster *rs,
+                                    const struct ilo_dev *dev,
+                                    const struct ilo_state_raster_info *info)
+{
+   const struct ilo_state_raster_scan_info *scan = &info->scan;
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 1, page 294:
+    *
+    *   "If Number of Multisamples is NUMSAMPLES_1, bits 7:1 of this field
+    *    (Sample Mask) must be zero.
+    *
+    *    If Number of Multisamples is NUMSAMPLES_4, bits 7:4 of this field
+    *    must be zero."
+    */
+   const uint32_t mask = (1 << scan->sample_count) - 1;
+   uint32_t dw1;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   dw1 = (scan->sample_mask & mask) << GEN6_SAMPLE_MASK_DW1_VAL__SHIFT;
+
+   STATIC_ASSERT(ARRAY_SIZE(rs->sample) >= 2);
+   rs->sample[1] = dw1;
+
+   return true;
+}
+
+/*
+ * Sanity-check the scan-conversion (WM) state against PRM restrictions on
+ * statistics and early-z operations.  Debug-build asserts only; always
+ * returns true in release builds.
+ */
+static bool
+raster_validate_gen6_wm(const struct ilo_dev *dev,
+                        const struct ilo_state_raster_info *info)
+{
+   const struct ilo_state_raster_scan_info *scan = &info->scan;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* early depth-stencil control is a Gen7+ field */
+   if (ilo_dev_gen(dev) == ILO_GEN(6))
+      assert(scan->earlyz_control == GEN7_EDSC_NORMAL);
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 272:
+    *
+    *   "This bit (Statistics Enable) must be disabled if either of these
+    *    bits is set: Depth Buffer Clear , Hierarchical Depth Buffer Resolve
+    *    Enable or Depth Buffer Resolve Enable."
+    */
+   if (scan->earlyz_op != ILO_STATE_RASTER_EARLYZ_NORMAL)
+      assert(!scan->stats_enable);
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 273:
+    *
+    *   "If this field (Depth Buffer Resolve Enable) is enabled, the Depth
+    *    Buffer Clear and Hierarchical Depth Buffer Resolve Enable fields
+    *    must both be disabled."
+    *
+    *   "If this field (Hierarchical Depth Buffer Resolve Enable) is
+    *    enabled, the Depth Buffer Clear and Depth Buffer Resolve Enable
+    *    fields must both be disabled."
+    *
+    * This is guaranteed.
+    */
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 314-315:
+    *
+    *   "Stencil buffer clear can be performed at the same time by enabling
+    *    Stencil Buffer Write Enable."
+    *
+    *   "Note also that stencil buffer clear can be performed without depth
+    *    buffer clear."
+    */
+   if (scan->earlyz_stencil_clear) {
+      assert(scan->earlyz_op == ILO_STATE_RASTER_EARLYZ_NORMAL ||
+             scan->earlyz_op == ILO_STATE_RASTER_EARLYZ_DEPTH_CLEAR);
+   }
+
+   return true;
+}
+
+/*
+ * Pack the scan-conversion portion of Gen6 3DSTATE_WM (dw4-dw6) into
+ * rs->wm[0..2].  Shader-related WM fields are owned elsewhere.
+ */
+static bool
+raster_set_gen6_3dstate_wm(struct ilo_state_raster *rs,
+                           const struct ilo_dev *dev,
+                           const struct ilo_state_raster_info *info,
+                           const struct ilo_state_raster_line_info *line)
+{
+   const struct ilo_state_raster_tri_info *tri = &info->tri;
+   const struct ilo_state_raster_setup_info *setup = &info->setup;
+   const struct ilo_state_raster_scan_info *scan = &info->scan;
+   const enum gen_msrast_mode msrast =
+      raster_setup_get_gen6_msrast_mode(dev, setup);
+   /* only scan conversion states are set, as in Gen8+ */
+   uint32_t dw4, dw5, dw6;
+
+   ILO_DEV_ASSERT(dev, 6, 6);
+
+   if (!raster_validate_gen6_wm(dev, info))
+      return false;
+
+   dw4 = 0;
+
+   if (scan->stats_enable)
+      dw4 |= GEN6_WM_DW4_STATISTICS;
+
+   switch (scan->earlyz_op) {
+   case ILO_STATE_RASTER_EARLYZ_DEPTH_CLEAR:
+      dw4 |= GEN6_WM_DW4_DEPTH_CLEAR;
+      break;
+   case ILO_STATE_RASTER_EARLYZ_DEPTH_RESOLVE:
+      dw4 |= GEN6_WM_DW4_DEPTH_RESOLVE;
+      break;
+   case ILO_STATE_RASTER_EARLYZ_HIZ_RESOLVE:
+      dw4 |= GEN6_WM_DW4_HIZ_RESOLVE;
+      break;
+   default:
+      /* stencil-only clear still rides the depth-clear bit */
+      if (scan->earlyz_stencil_clear)
+         dw4 |= GEN6_WM_DW4_DEPTH_CLEAR;
+      break;
+   }
+
+   dw5 = GEN6_WM_DW5_AA_LINE_CAP_1_0 | /* same as in 3DSTATE_SF */
+         GEN6_WM_DW5_AA_LINE_WIDTH_2_0;
+
+   if (tri->poly_stipple_enable)
+      dw5 |= GEN6_WM_DW5_POLY_STIPPLE_ENABLE;
+   if (line->stipple_enable)
+      dw5 |= GEN6_WM_DW5_LINE_STIPPLE_ENABLE;
+
+   dw6 = scan->zw_interp << GEN6_WM_DW6_ZW_INTERP__SHIFT |
+         scan->barycentric_interps << GEN6_WM_DW6_BARYCENTRIC_INTERP__SHIFT |
+         GEN6_WM_DW6_POINT_RASTRULE_UPPER_RIGHT |
+         msrast << GEN6_WM_DW6_MSRASTMODE__SHIFT;
+
+   STATIC_ASSERT(ARRAY_SIZE(rs->wm) >= 3);
+   rs->wm[0] = dw4;
+   rs->wm[1] = dw5;
+   rs->wm[2] = dw6;
+
+   return true;
+}
+
+/*
+ * Pack the scan-conversion portion of 3DSTATE_WM (dw1) for Gen7-8 into
+ * rs->wm[0].  The early-z op bits and MSRASTMODE only exist before Gen8.
+ */
+static bool
+raster_set_gen8_3DSTATE_WM(struct ilo_state_raster *rs,
+                           const struct ilo_dev *dev,
+                           const struct ilo_state_raster_info *info,
+                           const struct ilo_state_raster_line_info *line)
+{
+   const struct ilo_state_raster_tri_info *tri = &info->tri;
+   const struct ilo_state_raster_setup_info *setup = &info->setup;
+   const struct ilo_state_raster_scan_info *scan = &info->scan;
+   const enum gen_msrast_mode msrast =
+      raster_setup_get_gen6_msrast_mode(dev, setup);
+   uint32_t dw1;
+
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   if (!raster_validate_gen6_wm(dev, info))
+      return false;
+
+   dw1 = scan->earlyz_control << GEN7_WM_DW1_EDSC__SHIFT |
+         scan->zw_interp << GEN7_WM_DW1_ZW_INTERP__SHIFT |
+         scan->barycentric_interps << GEN7_WM_DW1_BARYCENTRIC_INTERP__SHIFT |
+         GEN7_WM_DW1_AA_LINE_CAP_1_0 | /* same as in 3DSTATE_SF */
+         GEN7_WM_DW1_AA_LINE_WIDTH_2_0 |
+         GEN7_WM_DW1_POINT_RASTRULE_UPPER_RIGHT;
+
+   if (scan->stats_enable)
+      dw1 |= GEN7_WM_DW1_STATISTICS;
+
+   /* on Gen8+ the early-z ops moved to 3DSTATE_WM_HZ_OP */
+   if (ilo_dev_gen(dev) < ILO_GEN(8)) {
+      switch (scan->earlyz_op) {
+      case ILO_STATE_RASTER_EARLYZ_DEPTH_CLEAR:
+         dw1 |= GEN7_WM_DW1_DEPTH_CLEAR;
+         break;
+      case ILO_STATE_RASTER_EARLYZ_DEPTH_RESOLVE:
+         dw1 |= GEN7_WM_DW1_DEPTH_RESOLVE;
+         break;
+      case ILO_STATE_RASTER_EARLYZ_HIZ_RESOLVE:
+         dw1 |= GEN7_WM_DW1_HIZ_RESOLVE;
+         break;
+      default:
+         if (scan->earlyz_stencil_clear)
+            dw1 |= GEN7_WM_DW1_DEPTH_CLEAR;
+         break;
+      }
+   }
+
+   if (tri->poly_stipple_enable)
+      dw1 |= GEN7_WM_DW1_POLY_STIPPLE_ENABLE;
+   if (line->stipple_enable)
+      dw1 |= GEN7_WM_DW1_LINE_STIPPLE_ENABLE;
+
+   if (ilo_dev_gen(dev) < ILO_GEN(8))
+      dw1 |= msrast << GEN7_WM_DW1_MSRASTMODE__SHIFT;
+
+   STATIC_ASSERT(ARRAY_SIZE(rs->wm) >= 1);
+   rs->wm[0] = dw1;
+
+   return true;
+}
+
+/*
+ * Pack Gen8 3DSTATE_WM_HZ_OP (early-z clears/resolves).  Its dw1 and dw4
+ * are stored in rs->wm[1] and rs->wm[2] respectively, after the Gen8
+ * 3DSTATE_WM dword in rs->wm[0].
+ */
+static bool
+raster_set_gen8_3dstate_wm_hz_op(struct ilo_state_raster *rs,
+                                 const struct ilo_dev *dev,
+                                 const struct ilo_state_raster_info *info)
+{
+   const struct ilo_state_raster_scan_info *scan = &info->scan;
+   const enum gen_sample_count count =
+      get_gen6_sample_count(dev, scan->sample_count);
+   const uint32_t mask = (1 << scan->sample_count) - 1;
+   uint32_t dw1, dw4;
+
+   ILO_DEV_ASSERT(dev, 8, 8);
+
+   dw1 = count << GEN8_WM_HZ_DW1_NUM_SAMPLES__SHIFT;
+
+   if (scan->earlyz_stencil_clear)
+      dw1 |= GEN8_WM_HZ_DW1_STENCIL_CLEAR;
+
+   switch (scan->earlyz_op) {
+   case ILO_STATE_RASTER_EARLYZ_DEPTH_CLEAR:
+      dw1 |= GEN8_WM_HZ_DW1_DEPTH_CLEAR;
+      break;
+   case ILO_STATE_RASTER_EARLYZ_DEPTH_RESOLVE:
+      dw1 |= GEN8_WM_HZ_DW1_DEPTH_RESOLVE;
+      break;
+   case ILO_STATE_RASTER_EARLYZ_HIZ_RESOLVE:
+      dw1 |= GEN8_WM_HZ_DW1_HIZ_RESOLVE;
+      break;
+   default:
+      break;
+   }
+
+   dw4 = (scan->sample_mask & mask) << GEN8_WM_HZ_DW4_SAMPLE_MASK__SHIFT;
+
+   STATIC_ASSERT(ARRAY_SIZE(rs->wm) >= 3);
+   rs->wm[1] = dw1;
+   rs->wm[2] = dw4;
+
+   return true;
+}
+
+/*
+ * Pack an array of U0.4 (x, y) sample offsets into one byte each
+ * (x in the high nibble, y in the low nibble), asserting that samples are
+ * ordered by increasing distance from the pixel center as the PRM requires.
+ */
+static bool
+sample_pattern_get_gen6_packed_offsets(const struct ilo_dev *dev,
+                                       uint8_t sample_count,
+                                       const struct ilo_state_sample_pattern_offset_info *in,
+                                       uint8_t *out)
+{
+   uint8_t max_dist, i;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   max_dist = 0;
+   for (i = 0; i < sample_count; i++) {
+      /* offsets are in [0, 15]; recenter so dist components are in [-8, 7],
+       * making the squared distance at most 128 -- fits in uint8_t
+       */
+      const int8_t dist_x = (int8_t) in[i].x - 8;
+      const int8_t dist_y = (int8_t) in[i].y - 8;
+      const uint8_t dist = dist_x * dist_x + dist_y * dist_y;
+
+      /*
+       * From the Sandy Bridge PRM, volume 2 part 1, page 305:
+       *
+       *   "Programming Note: When programming the sample offsets (for
+       *    NUMSAMPLES_4 or _8 and MSRASTMODE_xxx_PATTERN), the order of the
+       *    samples 0 to 3 (or 7 for 8X) must have monotonically increasing
+       *    distance from the pixel center. This is required to get the
+       *    correct centroid computation in the device."
+       */
+      assert(dist >= max_dist);
+      max_dist = dist;
+
+      assert(in[i].x < 16);
+      assert(in[i].y < 16);
+
+      out[i] = in[i].x << 4 | in[i].y;
+   }
+
+   return true;
+}
+
+/*
+ * Pack 3DSTATE_LINE_STIPPLE.  The inverse repeat count is a fixed-point
+ * value whose format changed between generations: U1.16 on Gen7+, U1.13 on
+ * Gen6.
+ */
+static bool
+line_stipple_set_gen6_3DSTATE_LINE_STIPPLE(struct ilo_state_line_stipple *stipple,
+                                           const struct ilo_dev *dev,
+                                           const struct ilo_state_line_stipple_info *info)
+{
+   uint32_t dw1, dw2;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   assert(info->repeat_count >= 1 && info->repeat_count <= 256);
+
+   dw1 = info->pattern;
+   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+      /* in U1.16 */
+      const uint32_t inverse = 65536 / info->repeat_count;
+      dw2 = inverse << GEN7_LINE_STIPPLE_DW2_INVERSE_REPEAT_COUNT__SHIFT |
+            info->repeat_count << GEN6_LINE_STIPPLE_DW2_REPEAT_COUNT__SHIFT;
+   } else {
+      /* in U1.13 */
+      const uint16_t inverse = 8192 / info->repeat_count;
+      dw2 = inverse << GEN6_LINE_STIPPLE_DW2_INVERSE_REPEAT_COUNT__SHIFT |
+            info->repeat_count << GEN6_LINE_STIPPLE_DW2_REPEAT_COUNT__SHIFT;
+   }
+
+   STATIC_ASSERT(ARRAY_SIZE(stipple->stipple) >= 2);
+   stipple->stipple[0] = dw1;
+   stipple->stipple[1] = dw2;
+
+   return true;
+}
+
+/*
+ * Pack all five sample patterns (1x through 16x) for 3DSTATE_SAMPLE_PATTERN.
+ * Returns false if any pattern fails validation/packing.
+ */
+static bool
+sample_pattern_set_gen8_3DSTATE_SAMPLE_PATTERN(struct ilo_state_sample_pattern *pattern,
+                                               const struct ilo_dev *dev,
+                                               const struct ilo_state_sample_pattern_info *info)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   STATIC_ASSERT(ARRAY_SIZE(pattern->pattern_1x) >= 1);
+   STATIC_ASSERT(ARRAY_SIZE(pattern->pattern_2x) >= 2);
+   STATIC_ASSERT(ARRAY_SIZE(pattern->pattern_4x) >= 4);
+   STATIC_ASSERT(ARRAY_SIZE(pattern->pattern_8x) >= 8);
+   STATIC_ASSERT(ARRAY_SIZE(pattern->pattern_16x) >= 16);
+
+   return (sample_pattern_get_gen6_packed_offsets(dev, 1,
+              info->pattern_1x, pattern->pattern_1x) &&
+           sample_pattern_get_gen6_packed_offsets(dev, 2,
+              info->pattern_2x, pattern->pattern_2x) &&
+           sample_pattern_get_gen6_packed_offsets(dev, 4,
+              info->pattern_4x, pattern->pattern_4x) &&
+           sample_pattern_get_gen6_packed_offsets(dev, 8,
+              info->pattern_8x, pattern->pattern_8x) &&
+           sample_pattern_get_gen6_packed_offsets(dev, 16,
+              info->pattern_16x, pattern->pattern_16x));
+
+}
+
+/*
+ * Copy the 32x32 polygon stipple pattern (one uint32_t per row) into the
+ * 3DSTATE_POLY_STIPPLE_PATTERN payload.
+ */
+static bool
+poly_stipple_set_gen6_3DSTATE_POLY_STIPPLE_PATTERN(struct ilo_state_poly_stipple *stipple,
+                                                   const struct ilo_dev *dev,
+                                                   const struct ilo_state_poly_stipple_info *info)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   STATIC_ASSERT(ARRAY_SIZE(stipple->stipple) >= 32);
+   memcpy(stipple->stipple, info->pattern, sizeof(info->pattern));
+
+   return true;
+}
+
+/*
+ * Initialize a zeroed ilo_state_raster from \p info.  \p rs must be
+ * zero-initialized by the caller (asserted in debug builds).
+ */
+bool
+ilo_state_raster_init(struct ilo_state_raster *rs,
+                      const struct ilo_dev *dev,
+                      const struct ilo_state_raster_info *info)
+{
+   assert(ilo_is_zeroed(rs, sizeof(*rs)));
+   return ilo_state_raster_set_info(rs, dev, info);
+}
+
+/*
+ * Initialize raster state for internal RECTLIST draws (early-z clears and
+ * resolves): single viewport, no clipping/culling, full sample mask.
+ */
+bool
+ilo_state_raster_init_for_rectlist(struct ilo_state_raster *rs,
+                                   const struct ilo_dev *dev,
+                                   uint8_t sample_count,
+                                   enum ilo_state_raster_earlyz_op earlyz_op,
+                                   bool earlyz_stencil_clear)
+{
+   struct ilo_state_raster_info info;
+
+   memset(&info, 0, sizeof(info));
+
+   info.clip.viewport_count = 1;
+   info.setup.cv_is_rectangle = true;
+   info.setup.msaa_enable = (sample_count > 1);
+   info.scan.sample_count = sample_count;
+   info.scan.sample_mask = ~0u;
+   info.scan.earlyz_op = earlyz_op;
+   info.scan.earlyz_stencil_clear = earlyz_stencil_clear;
+
+   return ilo_state_raster_init(rs, dev, &info);
+}
+
+/*
+ * (Re)pack all raster-related hardware state from \p info, dispatching to
+ * the per-generation packers.  Returns false (after asserting) if any
+ * packer rejects the state.
+ */
+bool
+ilo_state_raster_set_info(struct ilo_state_raster *rs,
+                          const struct ilo_dev *dev,
+                          const struct ilo_state_raster_info *info)
+{
+   struct ilo_state_raster_line_info line;
+   bool ret = true;
+
+   ret &= raster_set_gen6_3DSTATE_CLIP(rs, dev, info);
+
+   /* resolve the effective line state (AA/GIQ interactions) first */
+   raster_get_gen6_effective_line(dev, info, &line);
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
+      ret &= raster_set_gen8_3DSTATE_SF(rs, dev, info, &line);
+      ret &= raster_set_gen8_3DSTATE_RASTER(rs, dev, info, &line);
+   } else {
+      ret &= raster_set_gen7_3DSTATE_SF(rs, dev, info, &line);
+   }
+
+   ret &= raster_set_gen8_3DSTATE_MULTISAMPLE(rs, dev, info);
+   ret &= raster_set_gen6_3DSTATE_SAMPLE_MASK(rs, dev, info);
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+      ret &= raster_set_gen8_3DSTATE_WM(rs, dev, info, &line);
+
+      if (ilo_dev_gen(dev) >= ILO_GEN(8))
+         ret &= raster_set_gen8_3dstate_wm_hz_op(rs, dev, info);
+   } else {
+      ret &= raster_set_gen6_3dstate_wm(rs, dev, info, &line);
+   }
+
+   assert(ret);
+
+   return ret;
+}
+
+/*
+ * Patch the already-packed dwords in place for a params-only change (line
+ * width, point width, depth offsets), avoiding a full repack.  Relies on the
+ * line_aa_enable/line_giq_enable values cached at set_info time.
+ */
+bool
+ilo_state_raster_set_params(struct ilo_state_raster *rs,
+                            const struct ilo_dev *dev,
+                            const struct ilo_state_raster_params_info *params)
+{
+   const bool line_aa_enable = (rs->line_aa_enable &&
+         raster_params_is_gen6_line_aa_allowed(dev, params));
+   const int line_width = get_gen6_line_width(dev, params->line_width,
+                                              line_aa_enable, rs->line_giq_enable);
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* modify line AA enable */
+   if (rs->line_aa_enable) {
+      if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
+         if (line_aa_enable)
+            rs->raster[0] |= GEN8_RASTER_DW1_AA_LINE_ENABLE;
+         else
+            rs->raster[0] &= ~GEN8_RASTER_DW1_AA_LINE_ENABLE;
+      } else {
+         if (line_aa_enable)
+            rs->sf[1] |= GEN7_SF_DW2_AA_LINE_ENABLE;
+         else
+            rs->sf[1] &= ~GEN7_SF_DW2_AA_LINE_ENABLE;
+      }
+   }
+
+   /* modify line width */
+   rs->sf[1] = (rs->sf[1] & ~GEN7_SF_DW2_LINE_WIDTH__MASK) |
+               line_width << GEN7_SF_DW2_LINE_WIDTH__SHIFT;
+
+   /* modify point width, only if SF sources it (not shader-written) */
+   if (rs->sf[2] & GEN7_SF_DW3_USE_POINT_WIDTH) {
+      const int point_width = get_gen6_point_width(dev, params->point_width);
+
+      rs->sf[2] = (rs->sf[2] & ~GEN7_SF_DW3_POINT_WIDTH__MASK) |
+                  point_width << GEN7_SF_DW3_POINT_WIDTH__SHIFT;
+   }
+
+   /* modify depth offset */
+   rs->raster[1] = fui(params->depth_offset_const);
+   rs->raster[2] = fui(params->depth_offset_scale);
+   rs->raster[3] = fui(params->depth_offset_clamp);
+
+   return true;
+}
+
+/*
+ * Mark every raster-related command dirty, forcing a full re-emit.
+ */
+void
+ilo_state_raster_full_delta(const struct ilo_state_raster *rs,
+                            const struct ilo_dev *dev,
+                            struct ilo_state_raster_delta *delta)
+{
+   delta->dirty = ILO_STATE_RASTER_3DSTATE_CLIP |
+                  ILO_STATE_RASTER_3DSTATE_SF |
+                  ILO_STATE_RASTER_3DSTATE_MULTISAMPLE |
+                  ILO_STATE_RASTER_3DSTATE_SAMPLE_MASK |
+                  ILO_STATE_RASTER_3DSTATE_WM |
+                  ILO_STATE_RASTER_3DSTATE_AA_LINE_PARAMETERS;
+
+   /* these commands only exist on Gen8+ */
+   if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
+      delta->dirty |= ILO_STATE_RASTER_3DSTATE_RASTER |
+                      ILO_STATE_RASTER_3DSTATE_WM_HZ_OP;
+   }
+}
+
+/*
+ * Compare \p rs against \p old and mark only the commands whose packed
+ * dwords actually changed.
+ */
+void
+ilo_state_raster_get_delta(const struct ilo_state_raster *rs,
+                           const struct ilo_dev *dev,
+                           const struct ilo_state_raster *old,
+                           struct ilo_state_raster_delta *delta)
+{
+   delta->dirty = 0;
+
+   if (memcmp(rs->clip, old->clip, sizeof(rs->clip)))
+      delta->dirty |= ILO_STATE_RASTER_3DSTATE_CLIP;
+
+   if (memcmp(rs->sf, old->sf, sizeof(rs->sf)))
+      delta->dirty |= ILO_STATE_RASTER_3DSTATE_SF;
+
+   /* before Gen8, raster[] (depth offsets) is emitted as part of SF */
+   if (memcmp(rs->raster, old->raster, sizeof(rs->raster))) {
+      if (ilo_dev_gen(dev) >= ILO_GEN(8))
+         delta->dirty |= ILO_STATE_RASTER_3DSTATE_RASTER;
+      else
+         delta->dirty |= ILO_STATE_RASTER_3DSTATE_SF;
+   }
+
+   if (memcmp(rs->sample, old->sample, sizeof(rs->sample))) {
+      delta->dirty |= ILO_STATE_RASTER_3DSTATE_MULTISAMPLE |
+                      ILO_STATE_RASTER_3DSTATE_SAMPLE_MASK;
+   }
+
+   /* wm[] holds WM_HZ_OP dwords too on Gen8+ (see wm_hz_op packer) */
+   if (memcmp(rs->wm, old->wm, sizeof(rs->wm))) {
+      delta->dirty |= ILO_STATE_RASTER_3DSTATE_WM;
+
+      if (ilo_dev_gen(dev) >= ILO_GEN(8))
+         delta->dirty |= ILO_STATE_RASTER_3DSTATE_WM_HZ_OP;
+   }
+}
+
+/*
+ * Initialize an ilo_state_sample_pattern from \p info by packing all
+ * per-count patterns.
+ */
+bool
+ilo_state_sample_pattern_init(struct ilo_state_sample_pattern *pattern,
+                              const struct ilo_dev *dev,
+                              const struct ilo_state_sample_pattern_info *info)
+{
+   bool ret = true;
+
+   ret &= sample_pattern_set_gen8_3DSTATE_SAMPLE_PATTERN(pattern, dev, info);
+
+   assert(ret);
+
+   return ret;
+}
+
+/*
+ * Initialize with the driver's default sample positions (U0.4 coordinates,
+ * (8, 8) being the pixel center).
+ */
+bool
+ilo_state_sample_pattern_init_default(struct ilo_state_sample_pattern *pattern,
+                                      const struct ilo_dev *dev)
+{
+   static const struct ilo_state_sample_pattern_info default_info = {
+      .pattern_1x = {
+         {  8,  8 },
+      },
+
+      .pattern_2x = {
+         {  4,  4 }, { 12, 12 },
+      },
+
+      .pattern_4x = {
+         {  6,  2 }, { 14,  6 }, {  2, 10 }, { 10, 14 },
+      },
+
+      /* \see brw_multisample_positions_8x */
+      .pattern_8x = {
+         {  7,  9 }, {  9, 13 }, { 11,  3 }, { 13, 11 },
+         {  1,  7 }, {  5,  1 }, { 15,  5 }, {  3, 15 },
+      },
+
+      .pattern_16x = {
+         {  8, 10 }, { 11,  8 }, {  5,  6 }, {  6,  4 },
+         { 12, 11 }, { 13,  9 }, { 14,  7 }, { 10,  2 },
+         {  4, 13 }, {  3,  3 }, {  7,  1 }, { 15,  5 },
+         {  1, 12 }, {  9,  0 }, {  2, 14 }, {  0, 15 },
+      },
+   };
+
+   return ilo_state_sample_pattern_init(pattern, dev, &default_info);
+}
+
+/*
+ * Return the packed offset array for \p sample_count, or NULL (after
+ * asserting) for an unsupported count.
+ */
+const uint8_t *
+ilo_state_sample_pattern_get_packed_offsets(const struct ilo_state_sample_pattern *pattern,
+                                            const struct ilo_dev *dev,
+                                            uint8_t sample_count)
+{
+   switch (sample_count) {
+   case 1:  return pattern->pattern_1x;
+   case 2:  return pattern->pattern_2x;
+   case 4:  return pattern->pattern_4x;
+   case 8:  return pattern->pattern_8x;
+   case 16: return pattern->pattern_16x;
+   default:
+      assert(!"unknown sample count");
+      return NULL;
+   }
+}
+
+/**
+ * Decode the packed sample offset for (sample_count, sample_index) into its
+ * x and y components, each in U0.4 (x is the high nibble, y the low nibble;
+ * see sample_pattern_get_gen6_packed_offsets()).
+ */
+void
+ilo_state_sample_pattern_get_offset(const struct ilo_state_sample_pattern *pattern,
+                                    const struct ilo_dev *dev,
+                                    uint8_t sample_count, uint8_t sample_index,
+                                    uint8_t *x, uint8_t *y)
+{
+   /* fix: drop duplicated "const" qualifier */
+   const uint8_t *packed =
+      ilo_state_sample_pattern_get_packed_offsets(pattern, dev, sample_count);
+
+   assert(sample_index < sample_count);
+
+   *x = (packed[sample_index] >> 4) & 0xf;
+   *y = packed[sample_index] & 0xf;
+}
+
+/**
+ * Pack line stipple state from \p info.  No need to initialize \p stipple
+ * first; it is fully overwritten.
+ */
+bool
+ilo_state_line_stipple_set_info(struct ilo_state_line_stipple *stipple,
+                                const struct ilo_dev *dev,
+                                const struct ilo_state_line_stipple_info *info)
+{
+   bool ret = true;
+
+   ret &= line_stipple_set_gen6_3DSTATE_LINE_STIPPLE(stipple,
+         dev, info);
+
+   assert(ret);
+
+   return ret;
+}
+
+/**
+ * Pack polygon stipple state from \p info.  No need to initialize
+ * \p stipple first; it is fully overwritten.
+ */
+bool
+ilo_state_poly_stipple_set_info(struct ilo_state_poly_stipple *stipple,
+                                const struct ilo_dev *dev,
+                                const struct ilo_state_poly_stipple_info *info)
+{
+   bool ret = true;
+
+   ret &= poly_stipple_set_gen6_3DSTATE_POLY_STIPPLE_PATTERN(stipple,
+         dev, info);
+
+   assert(ret);
+
+   return ret;
+}
diff --git a/src/gallium/drivers/ilo/core/ilo_state_raster.h b/src/gallium/drivers/ilo/core/ilo_state_raster.h
new file mode 100644
index 00000000000..fc90b49cfc3
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_raster.h
@@ -0,0 +1,301 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Chia-I Wu <[email protected]>
+ */
+
+#ifndef ILO_STATE_RASTER_H
+#define ILO_STATE_RASTER_H
+
+#include "genhw/genhw.h"
+
+#include "ilo_core.h"
+#include "ilo_dev.h"
+
+enum ilo_state_raster_dirty_bits {
+ ILO_STATE_RASTER_3DSTATE_CLIP = (1 << 0),
+ ILO_STATE_RASTER_3DSTATE_SF = (1 << 1),
+ ILO_STATE_RASTER_3DSTATE_RASTER = (1 << 2),
+ ILO_STATE_RASTER_3DSTATE_MULTISAMPLE = (1 << 3),
+ ILO_STATE_RASTER_3DSTATE_SAMPLE_MASK = (1 << 4),
+ ILO_STATE_RASTER_3DSTATE_WM = (1 << 5),
+ ILO_STATE_RASTER_3DSTATE_WM_HZ_OP = (1 << 6),
+ ILO_STATE_RASTER_3DSTATE_AA_LINE_PARAMETERS = (1 << 7),
+};
+
+enum ilo_state_raster_earlyz_op {
+ ILO_STATE_RASTER_EARLYZ_NORMAL,
+ ILO_STATE_RASTER_EARLYZ_DEPTH_CLEAR,
+ ILO_STATE_RASTER_EARLYZ_DEPTH_RESOLVE,
+ ILO_STATE_RASTER_EARLYZ_HIZ_RESOLVE,
+};
+
+/**
+ * VUE readback, VertexClipTest, ClipDetermination, and primitive output.
+ */
+struct ilo_state_raster_clip_info {
+ bool clip_enable;
+ /* CL_INVOCATION_COUNT and CL_PRIMITIVES_COUNT */
+ bool stats_enable;
+
+ uint8_t viewport_count;
+ bool force_rtaindex_zero;
+
+ /* these should be mutually exclusive */
+ uint8_t user_cull_enables;
+ uint8_t user_clip_enables;
+
+ bool gb_test_enable;
+ bool xy_test_enable;
+
+ /* far/near must be enabled together prior to Gen9 */
+ bool z_far_enable;
+ bool z_near_enable;
+ bool z_near_zero;
+};
+
+/**
+ * Primitive assembly, viewport transformation, scissoring, MSAA, etc.
+ */
+struct ilo_state_raster_setup_info {
+ bool cv_is_rectangle;
+
+ bool first_vertex_provoking;
+ bool viewport_transform;
+
+ bool scissor_enable;
+
+ /* MSAA enables for lines and non-lines */
+ bool msaa_enable;
+ bool line_msaa_enable;
+};
+
+/**
+ * 3DOBJ_POINT rasterization rules.
+ */
+struct ilo_state_raster_point_info {
+ /* ignored when msaa_enable is set */
+ bool aa_enable;
+
+ bool programmable_width;
+};
+
+/**
+ * 3DOBJ_LINE rasterization rules.
+ */
+struct ilo_state_raster_line_info {
+ /* ignored when line_msaa_enable is set */
+ bool aa_enable;
+
+ /* ignored when line_msaa_enable or aa_enable is set */
+ bool stipple_enable;
+ bool giq_enable;
+ bool giq_last_pixel;
+};
+
+/**
+ * 3DOBJ_TRIANGLE rasterization rules.
+ */
+struct ilo_state_raster_tri_info {
+ enum gen_front_winding front_winding;
+ enum gen_cull_mode cull_mode;
+ enum gen_fill_mode fill_mode_front;
+ enum gen_fill_mode fill_mode_back;
+
+ enum gen_depth_format depth_offset_format;
+ bool depth_offset_solid;
+ bool depth_offset_wireframe;
+ bool depth_offset_point;
+
+ bool poly_stipple_enable;
+};
+
+/**
+ * Scan conversion.
+ */
+struct ilo_state_raster_scan_info {
+ /* PS_DEPTH_COUNT and PS_INVOCATION_COUNT */
+ bool stats_enable;
+
+ uint8_t sample_count;
+
+ /* pixel location for non-MSAA or 1x-MSAA */
+ enum gen_pixel_location pixloc;
+
+ uint32_t sample_mask;
+
+ /* interpolations */
+ enum gen_zw_interp zw_interp;
+ uint8_t barycentric_interps;
+
+ /* Gen7+ only */
+ enum gen_edsc_mode earlyz_control;
+ enum ilo_state_raster_earlyz_op earlyz_op;
+ bool earlyz_stencil_clear;
+};
+
+/**
+ * Raster parameters.
+ */
+struct ilo_state_raster_params_info {
+ bool any_integer_rt;
+ bool hiz_enable;
+
+ float point_width;
+ float line_width;
+
+ /* const term will be scaled by 'r' */
+ float depth_offset_const;
+ float depth_offset_scale;
+ float depth_offset_clamp;
+};
+
+struct ilo_state_raster_info {
+ struct ilo_state_raster_clip_info clip;
+ struct ilo_state_raster_setup_info setup;
+ struct ilo_state_raster_point_info point;
+ struct ilo_state_raster_line_info line;
+ struct ilo_state_raster_tri_info tri;
+ struct ilo_state_raster_scan_info scan;
+
+ struct ilo_state_raster_params_info params;
+};
+
+struct ilo_state_raster {
+   uint32_t clip[3];     /* 3DSTATE_CLIP payload */
+   uint32_t sf[3];       /* 3DSTATE_SF dw1-dw3 */
+   /* 3DSTATE_RASTER dw1 (Gen8) or 0 (pre-Gen8), then the three depth
+    * offset floats (const, scale, clamp) as raw bits
+    */
+   uint32_t raster[4];
+   uint32_t sample[2];   /* 3DSTATE_MULTISAMPLE dw1, 3DSTATE_SAMPLE_MASK dw1 */
+   /* Gen6: WM dw4-dw6; Gen7+: WM dw1, then WM_HZ_OP dw1/dw4 on Gen8 */
+   uint32_t wm[3];
+
+   /* cached so ilo_state_raster_set_params() can repatch without full info */
+   bool line_aa_enable;
+   bool line_giq_enable;
+};
+
+struct ilo_state_raster_delta {
+   /* bitmask of ILO_STATE_RASTER_3DSTATE_* dirty bits */
+   uint32_t dirty;
+};
+
+struct ilo_state_sample_pattern_offset_info {
+ /* in U0.4 */
+ uint8_t x;
+ uint8_t y;
+};
+
+struct ilo_state_sample_pattern_info {
+ struct ilo_state_sample_pattern_offset_info pattern_1x[1];
+ struct ilo_state_sample_pattern_offset_info pattern_2x[2];
+ struct ilo_state_sample_pattern_offset_info pattern_4x[4];
+ struct ilo_state_sample_pattern_offset_info pattern_8x[8];
+ struct ilo_state_sample_pattern_offset_info pattern_16x[16];
+};
+
+struct ilo_state_sample_pattern {
+ uint8_t pattern_1x[1];
+ uint8_t pattern_2x[2];
+ uint8_t pattern_4x[4];
+ uint8_t pattern_8x[8];
+ uint8_t pattern_16x[16];
+};
+
+struct ilo_state_line_stipple_info {
+ uint16_t pattern;
+ uint16_t repeat_count;
+};
+
+struct ilo_state_line_stipple {
+ uint32_t stipple[2];
+};
+
+struct ilo_state_poly_stipple_info {
+ uint32_t pattern[32];
+};
+
+struct ilo_state_poly_stipple {
+ uint32_t stipple[32];
+};
+
+bool
+ilo_state_raster_init(struct ilo_state_raster *rs,
+ const struct ilo_dev *dev,
+ const struct ilo_state_raster_info *info);
+
+bool
+ilo_state_raster_init_for_rectlist(struct ilo_state_raster *rs,
+ const struct ilo_dev *dev,
+ uint8_t sample_count,
+ enum ilo_state_raster_earlyz_op earlyz_op,
+ bool earlyz_stencil_clear);
+
+bool
+ilo_state_raster_set_info(struct ilo_state_raster *rs,
+ const struct ilo_dev *dev,
+ const struct ilo_state_raster_info *info);
+
+bool
+ilo_state_raster_set_params(struct ilo_state_raster *rs,
+ const struct ilo_dev *dev,
+ const struct ilo_state_raster_params_info *params);
+
+void
+ilo_state_raster_full_delta(const struct ilo_state_raster *rs,
+ const struct ilo_dev *dev,
+ struct ilo_state_raster_delta *delta);
+
+void
+ilo_state_raster_get_delta(const struct ilo_state_raster *rs,
+ const struct ilo_dev *dev,
+ const struct ilo_state_raster *old,
+ struct ilo_state_raster_delta *delta);
+
+bool
+ilo_state_sample_pattern_init(struct ilo_state_sample_pattern *pattern,
+ const struct ilo_dev *dev,
+ const struct ilo_state_sample_pattern_info *info);
+
+bool
+ilo_state_sample_pattern_init_default(struct ilo_state_sample_pattern *pattern,
+ const struct ilo_dev *dev);
+
+const uint8_t *
+ilo_state_sample_pattern_get_packed_offsets(const struct ilo_state_sample_pattern *pattern,
+ const struct ilo_dev *dev,
+ uint8_t sample_count);
+
+void
+ilo_state_sample_pattern_get_offset(const struct ilo_state_sample_pattern *pattern,
+ const struct ilo_dev *dev,
+ uint8_t sample_count, uint8_t sample_index,
+ uint8_t *x, uint8_t *y);
+bool
+ilo_state_line_stipple_set_info(struct ilo_state_line_stipple *stipple,
+ const struct ilo_dev *dev,
+ const struct ilo_state_line_stipple_info *info);
+
+bool
+ilo_state_poly_stipple_set_info(struct ilo_state_poly_stipple *stipple,
+ const struct ilo_dev *dev,
+ const struct ilo_state_poly_stipple_info *info);
+
+#endif /* ILO_STATE_RASTER_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_state_sampler.c b/src/gallium/drivers/ilo/core/ilo_state_sampler.c
new file mode 100644
index 00000000000..3787f684fe8
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_sampler.c
@@ -0,0 +1,742 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2012-2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Chia-I Wu <[email protected]>
+ */
+
+#include "util/u_half.h"
+
+#include "ilo_debug.h"
+#include "ilo_state_surface.h"
+#include "ilo_state_sampler.h"
+
+/* Validate the restrictions that apply when non-normalized (texel-space)
+ * texture coordinates are enabled.  All checks are assert-only, so the
+ * function always returns true in release builds.
+ */
+static bool
+sampler_validate_gen6_non_normalized(const struct ilo_dev *dev,
+                                     const struct ilo_state_sampler_info *info)
+{
+   const enum gen_texcoord_mode addr_ctrls[3] = {
+      info->tcx_ctrl, info->tcy_ctrl, info->tcz_ctrl,
+   };
+   int i;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * From the Ivy Bridge PRM, volume 4 part 1, page 98:
+    *
+    *     "The following state must be set as indicated if this field
+    *      (Non-normalized Coordinate Enable) is enabled:
+    *
+    *      - TCX/Y/Z Address Control Mode must be TEXCOORDMODE_CLAMP,
+    *        TEXCOORDMODE_HALF_BORDER, or TEXCOORDMODE_CLAMP_BORDER.
+    *      - Surface Type must be SURFTYPE_2D or SURFTYPE_3D.
+    *      - Mag Mode Filter must be MAPFILTER_NEAREST or
+    *        MAPFILTER_LINEAR.
+    *      - Min Mode Filter must be MAPFILTER_NEAREST or
+    *        MAPFILTER_LINEAR.
+    *      - Mip Mode Filter must be MIPFILTER_NONE.
+    *      - Min LOD must be 0.
+    *      - Max LOD must be 0.
+    *      - MIP Count must be 0.
+    *      - Surface Min LOD must be 0.
+    *      - Texture LOD Bias must be 0."
+    */
+   for (i = 0; i < 3; i++) {
+      switch (addr_ctrls[i]) {
+      case GEN6_TEXCOORDMODE_CLAMP:
+      case GEN6_TEXCOORDMODE_CLAMP_BORDER:
+      case GEN8_TEXCOORDMODE_HALF_BORDER:
+         break;
+      default:
+         assert(!"bad non-normalized coordinate wrap mode");
+         break;
+      }
+   }
+
+   assert(info->mip_filter == GEN6_MIPFILTER_NONE);
+
+   assert((info->min_filter == GEN6_MAPFILTER_NEAREST ||
+           info->min_filter == GEN6_MAPFILTER_LINEAR) &&
+          (info->mag_filter == GEN6_MAPFILTER_NEAREST ||
+           info->mag_filter == GEN6_MAPFILTER_LINEAR));
+
+   /* the surface-type and SurfMinLod restrictions are checked later, in
+    * ilo_state_sampler_set_surface()
+    */
+   assert(info->min_lod == 0.0f &&
+          info->max_lod == 0.0f &&
+          info->lod_bias == 0.0f);
+
+   return true;
+}
+
+/* Validate a sampler info.  Returns false only when the non-normalized
+ * coordinate checks fail; everything else is assert-only.
+ */
+static bool
+sampler_validate_gen6_sampler(const struct ilo_dev *dev,
+                              const struct ilo_state_sampler_info *info)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   if (info->non_normalized &&
+       !sampler_validate_gen6_non_normalized(dev, info))
+      return false;
+
+   if (ilo_dev_gen(dev) < ILO_GEN(8)) {
+      /* TEXCOORDMODE_HALF_BORDER is a Gen8 addition */
+      assert(info->tcx_ctrl != GEN8_TEXCOORDMODE_HALF_BORDER &&
+             info->tcy_ctrl != GEN8_TEXCOORDMODE_HALF_BORDER &&
+             info->tcz_ctrl != GEN8_TEXCOORDMODE_HALF_BORDER);
+   }
+
+   return true;
+}
+
+/* Return replacement DW0 filter bits to be used when sampling from integer
+ * surface formats, which do not support sampling engine filtering.
+ */
+static uint32_t
+sampler_get_gen6_integer_filters(const struct ilo_dev *dev,
+                                 const struct ilo_state_sampler_info *info)
+{
+   /*
+    * From the Sandy Bridge PRM, volume 4 part 1, page 103:
+    *
+    *     "MIPFILTER_LINEAR is not supported for surface formats that do not
+    *      support "Sampling Engine Filtering" as indicated in the Surface
+    *      Formats table unless using the sample_c message type."
+    *
+    *     "Only MAPFILTER_NEAREST is supported for surface formats that do not
+    *      support "Sampling Engine Filtering" as indicated in the Surface
+    *      Formats table unless using the sample_c message type."
+    */
+   const enum gen_mip_filter mip_filter =
+      (info->mip_filter == GEN6_MIPFILTER_LINEAR) ?
+      GEN6_MIPFILTER_NEAREST : info->mip_filter;
+   const enum gen_map_filter min_filter = GEN6_MAPFILTER_NEAREST;
+   const enum gen_map_filter mag_filter = GEN6_MAPFILTER_NEAREST;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   return mip_filter << GEN6_SAMPLER_DW0_MIP_FILTER__SHIFT |
+          mag_filter << GEN6_SAMPLER_DW0_MAG_FILTER__SHIFT |
+          min_filter << GEN6_SAMPLER_DW0_MIN_FILTER__SHIFT;
+}
+
+/* Return replacement DW0 filter bits to be used when sampling from
+ * SURFTYPE_3D surfaces, which only support NEAREST and LINEAR filtering.
+ */
+static uint32_t
+sampler_get_gen6_3d_filters(const struct ilo_dev *dev,
+                            const struct ilo_state_sampler_info *info)
+{
+   const enum gen_mip_filter mip_filter = info->mip_filter;
+   /*
+    * From the Sandy Bridge PRM, volume 4 part 1, page 103:
+    *
+    *     "Only MAPFILTER_NEAREST and MAPFILTER_LINEAR are supported for
+    *      surfaces of type SURFTYPE_3D."
+    */
+   const enum gen_map_filter min_filter =
+      (info->min_filter == GEN6_MAPFILTER_NEAREST ||
+       info->min_filter == GEN6_MAPFILTER_LINEAR) ?
+      info->min_filter : GEN6_MAPFILTER_LINEAR;
+   const enum gen_map_filter mag_filter =
+      (info->mag_filter == GEN6_MAPFILTER_NEAREST ||
+       info->mag_filter == GEN6_MAPFILTER_LINEAR) ?
+      info->mag_filter : GEN6_MAPFILTER_LINEAR;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   return mip_filter << GEN6_SAMPLER_DW0_MIP_FILTER__SHIFT |
+          mag_filter << GEN6_SAMPLER_DW0_MAG_FILTER__SHIFT |
+          min_filter << GEN6_SAMPLER_DW0_MIN_FILTER__SHIFT;
+}
+
+/* Pack the three texcoord address control (wrap) modes into the wrap-mode
+ * fields of the appropriate SAMPLER_STATE dword: DW3 on Gen7+, DW1 on Gen6.
+ */
+static uint32_t
+get_gen6_addr_controls(const struct ilo_dev *dev,
+                       enum gen_texcoord_mode tcx_ctrl,
+                       enum gen_texcoord_mode tcy_ctrl,
+                       enum gen_texcoord_mode tcz_ctrl)
+{
+   uint32_t dw;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+      dw = tcx_ctrl << GEN7_SAMPLER_DW3_U_WRAP__SHIFT;
+      dw |= tcy_ctrl << GEN7_SAMPLER_DW3_V_WRAP__SHIFT;
+      dw |= tcz_ctrl << GEN7_SAMPLER_DW3_R_WRAP__SHIFT;
+   } else {
+      dw = tcx_ctrl << GEN6_SAMPLER_DW1_U_WRAP__SHIFT;
+      dw |= tcy_ctrl << GEN6_SAMPLER_DW1_V_WRAP__SHIFT;
+      dw |= tcz_ctrl << GEN6_SAMPLER_DW1_R_WRAP__SHIFT;
+   }
+
+   return dw;
+}
+
+/* Return wrap-mode bits to be used when sampling from SURFTYPE_1D
+ * surfaces.
+ */
+static uint32_t
+sampler_get_gen6_1d_addr_controls(const struct ilo_dev *dev,
+                                  const struct ilo_state_sampler_info *info)
+{
+   /* CUBE mode is meaningless for a 1D surface; fall back to CLAMP */
+   const enum gen_texcoord_mode tcx_ctrl =
+      (info->tcx_ctrl == GEN6_TEXCOORDMODE_CUBE) ?
+      GEN6_TEXCOORDMODE_CLAMP : info->tcx_ctrl;
+   /*
+    * From the Ivy Bridge PRM, volume 4 part 1, page 100:
+    *
+    *     "If this field (TCY Address Control Mode) is set to
+    *      TEXCOORDMODE_CLAMP_BORDER or TEXCOORDMODE_HALF_BORDER and a 1D
+    *      surface is sampled, incorrect blending with the border color in the
+    *      vertical direction may occur."
+    */
+   const enum gen_texcoord_mode tcy_ctrl = GEN6_TEXCOORDMODE_CLAMP;
+   const enum gen_texcoord_mode tcz_ctrl = GEN6_TEXCOORDMODE_CLAMP;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   return get_gen6_addr_controls(dev, tcx_ctrl, tcy_ctrl, tcz_ctrl);
+}
+
+/* Return wrap-mode bits to be used when sampling from SURFTYPE_2D or
+ * SURFTYPE_3D surfaces.  CUBE mode is replaced by CLAMP on all three axes.
+ */
+static uint32_t
+sampler_get_gen6_2d_3d_addr_controls(const struct ilo_dev *dev,
+                                     const struct ilo_state_sampler_info *info)
+{
+   const enum gen_texcoord_mode tcx_ctrl =
+      (info->tcx_ctrl == GEN6_TEXCOORDMODE_CUBE) ?
+      GEN6_TEXCOORDMODE_CLAMP : info->tcx_ctrl;
+   const enum gen_texcoord_mode tcy_ctrl =
+      (info->tcy_ctrl == GEN6_TEXCOORDMODE_CUBE) ?
+      GEN6_TEXCOORDMODE_CLAMP : info->tcy_ctrl;
+   /*
+    * From the Sandy Bridge PRM, volume 4 part 1, page 108:
+    *
+    *     "[DevSNB]: if this field (TCZ Address Control Mode) is set to
+    *      TEXCOORDMODE_CLAMP_BORDER samples outside the map will clamp to 0
+    *      instead of boarder color"
+    *
+    * From the Ivy Bridge PRM, volume 4 part 1, page 100:
+    *
+    *     "If this field is set to TEXCOORDMODE_CLAMP_BORDER for 3D maps on
+    *      formats without an alpha channel, samples straddling the map in the
+    *      Z direction may have their alpha channels off by 1."
+    *
+    * Do we want to do something here?
+    */
+   const enum gen_texcoord_mode tcz_ctrl =
+      (info->tcz_ctrl == GEN6_TEXCOORDMODE_CUBE) ?
+      GEN6_TEXCOORDMODE_CLAMP : info->tcz_ctrl;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   return get_gen6_addr_controls(dev, tcx_ctrl, tcy_ctrl, tcz_ctrl);
+}
+
+/* Return wrap-mode bits to be used when sampling from SURFTYPE_CUBE
+ * surfaces.  Before Gen7.5, only CLAMP and CUBE are valid, and all three
+ * axes must use the same mode; the TCX mode is propagated to TCY and TCZ.
+ */
+static uint32_t
+sampler_get_gen6_cube_addr_controls(const struct ilo_dev *dev,
+                                    const struct ilo_state_sampler_info *info)
+{
+   /*
+    * From the Ivy Bridge PRM, volume 4 part 1, page 99:
+    *
+    *     "When using cube map texture coordinates, only TEXCOORDMODE_CLAMP
+    *      and TEXCOORDMODE_CUBE settings are valid, and each TC component
+    *      must have the same Address Control mode.
+    *
+    *      When TEXCOORDMODE_CUBE is not used accessing a cube map, the map's
+    *      Cube Face Enable field must be programmed to 111111b (all faces
+    *      enabled)."
+    *
+    * From the Haswell PRM, volume 2d, page 278:
+    *
+    *     "When using cube map texture coordinates, each TC component must
+    *      have the same Address Control Mode.
+    *
+    *      When TEXCOORDMODE_CUBE is not used accessing a cube map, the map's
+    *      Cube Face Enable field must be programmed to 111111b (all faces
+    *      enabled)."
+    *
+    * We always enable all cube faces and only need to make sure all address
+    * control modes are the same.
+    */
+   const enum gen_texcoord_mode tcx_ctrl =
+      (ilo_dev_gen(dev) >= ILO_GEN(7.5) ||
+       info->tcx_ctrl == GEN6_TEXCOORDMODE_CUBE ||
+       info->tcx_ctrl == GEN6_TEXCOORDMODE_CLAMP) ?
+      info->tcx_ctrl : GEN6_TEXCOORDMODE_CLAMP;
+   const enum gen_texcoord_mode tcy_ctrl = tcx_ctrl;
+   const enum gen_texcoord_mode tcz_ctrl = tcx_ctrl;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   return get_gen6_addr_controls(dev, tcx_ctrl, tcy_ctrl, tcz_ctrl);
+}
+
+/* Convert a LOD bias to the signed fixed-point encoding of SAMPLER_STATE:
+ * S4.6 on Gen6, S4.8 on Gen7+, clamped to [-16.0, 16.0).
+ */
+static uint16_t
+get_gen6_lod_bias(const struct ilo_dev *dev, float bias)
+{
+   /* [-16.0, 16.0) in S4.6 or S4.8 */
+   const int fbits = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 8 : 6;
+   const float max = 16.0f;
+   const float scale = (float) (1 << fbits);
+   /* sign bit + 4 integer bits + fbits fraction bits */
+   const int mask = (1 << (1 + 4 + fbits)) - 1;
+   /* +16.0 itself is not representable; the largest value is one ULP below */
+   const int scaled_max = (16 << fbits) - 1;
+   int scaled;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   if (bias > max)
+      bias = max;
+   else if (bias < -max)
+      bias = -max;
+
+   scaled = (int) (bias * scale);
+   if (scaled > scaled_max)
+      scaled = scaled_max;
+
+   /* keep the two's-complement bits of the field width only */
+   return (scaled & mask);
+}
+
+/* Convert a LOD clamp value to the unsigned fixed-point encoding of
+ * SAMPLER_STATE: U4.6 in [0.0, 13.0] on Gen6, U4.8 in [0.0, 14.0] on Gen7+.
+ */
+static uint16_t
+get_gen6_lod_clamp(const struct ilo_dev *dev, float clamp)
+{
+   /* [0.0, 13.0] in U4.6 or [0.0, 14.0] in U4.8 */
+   const int fbits = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 8 : 6;
+   const float max = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 14.0f : 13.0f;
+   const float scale = (float) (1 << fbits);
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   if (clamp > max)
+      clamp = max;
+   else if (clamp < 0.0f)
+      clamp = 0.0f;
+
+   return (int) (clamp * scale);
+}
+
+/* Fill in sampler->sampler[] with SAMPLER_STATE DW0, DW1, and DW3 (DW2, the
+ * border color pointer, is emitted elsewhere), and precompute the
+ * surface-dependent replacement bits consumed by
+ * ilo_state_sampler_set_surface().
+ */
+static bool
+sampler_set_gen6_SAMPLER_STATE(struct ilo_state_sampler *sampler,
+                               const struct ilo_dev *dev,
+                               const struct ilo_state_sampler_info *info)
+{
+   uint16_t lod_bias, max_lod, min_lod;
+   uint32_t dw0, dw1, dw3;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   if (!sampler_validate_gen6_sampler(dev, info))
+      return false;
+
+   /*
+    * From the Ivy Bridge PRM, volume 4 part 1, page 15:
+    *
+    *     "The per-pixel LOD is computed in an implementation-dependent manner
+    *      and approximates the log2 of the texel/pixel ratio at the given
+    *      pixel.  The computation is typically based on the differential
+    *      texel-space distances associated with a one-pixel differential
+    *      distance along the screen x- and y-axes.  These texel-space
+    *      distances are computed by evaluating neighboring pixel texture
+    *      coordinates, these coordinates being in units of texels on the base
+    *      MIP level (multiplied by the corresponding surface size in
+    *      texels)."
+    *
+    * Judging from the LOD computation pseudocode on page 16-18, the "base MIP
+    * level" should be given by SurfMinLod.  To summarize, for the "sample"
+    * message,
+    *
+    *  1) LOD is set to log2(texel/pixel ratio).  The number of texels is
+    *     measured against level SurfMinLod.
+    *  2) Bias is added to LOD.
+    *  3) if pre-clamp is enabled, LOD is clamped to [MinLod, MaxLod] first
+    *  4) LOD is compared with Base to determine whether magnification or
+    *     minification is needed.
+    *  5) If magnification is needed, or no mipmapping is requested, LOD is
+    *     set to floor(MinLod).
+    *  6) LOD is clamped to [0, MIPCnt], and SurfMinLod is added to LOD.
+    *
+    * As an example, we could set SurfMinLod to GL_TEXTURE_BASE_LEVEL and Base
+    * to 0 to match GL.  But GL expects LOD to be set to 0, instead of
+    * floor(MinLod), in 5).  Since this is only an issue when MinLod is
+    * greater than or equal to one, and, with Base being 0, a non-zero MinLod
+    * implies minification, we only need to deal with the case when mipmapping
+    * is disabled.  We can thus do:
+    *
+    *   if (MipFilter == MIPFILTER_NONE && MinLod) {
+    *     MinLod = 0;
+    *     MagFilter = MinFilter;
+    *   }
+    */
+
+   lod_bias = get_gen6_lod_bias(dev, info->lod_bias);
+   min_lod = get_gen6_lod_clamp(dev, info->min_lod);
+   max_lod = get_gen6_lod_clamp(dev, info->max_lod);
+
+   dw0 = GEN6_SAMPLER_DW0_LOD_PRECLAMP_ENABLE |
+         0 << GEN6_SAMPLER_DW0_BASE_LOD__SHIFT |
+         info->mip_filter << GEN6_SAMPLER_DW0_MIP_FILTER__SHIFT |
+         info->mag_filter << GEN6_SAMPLER_DW0_MAG_FILTER__SHIFT |
+         info->min_filter << GEN6_SAMPLER_DW0_MIN_FILTER__SHIFT;
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+      dw0 |= GEN7_SAMPLER_DW0_BORDER_COLOR_MODE_DX10_OGL |
+             lod_bias << GEN7_SAMPLER_DW0_LOD_BIAS__SHIFT;
+
+      /* EWA is the higher-quality anisotropic filtering algorithm */
+      if (info->min_filter == GEN6_MAPFILTER_ANISOTROPIC ||
+          info->mag_filter == GEN6_MAPFILTER_ANISOTROPIC)
+         dw0 |= GEN7_SAMPLER_DW0_ANISO_ALGO_EWA;
+   } else {
+      dw0 |= lod_bias << GEN6_SAMPLER_DW0_LOD_BIAS__SHIFT |
+             info->shadow_func << GEN6_SAMPLER_DW0_SHADOW_FUNC__SHIFT;
+
+      /*
+       * From the Sandy Bridge PRM, volume 4 part 1, page 102:
+       *
+       *     "(Min and Mag State Not Equal) Must be set to 1 if any of the
+       *      following are true:
+       *
+       *      - Mag Mode Filter and Min Mode Filter are not the same
+       *      - Address Rounding Enable: U address mag filter and U address
+       *        min filter are not the same
+       *      - Address Rounding Enable: V address mag filter and V address
+       *        min filter are not the same
+       *      - Address Rounding Enable: R address mag filter and R address
+       *        min filter are not the same"
+       *
+       * We set address rounding for U, V, and R uniformly.  Only need to
+       * check the filters.
+       */
+      if (info->min_filter != info->mag_filter)
+         dw0 |= GEN6_SAMPLER_DW0_MIN_MAG_NOT_EQUAL;
+   }
+
+   dw1 = 0;
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+      /*
+       * From the Ivy Bridge PRM, volume 4 part 1, page 96:
+       *
+       *     "This field (Cube Surface Control Mode) must be set to
+       *      CUBECTRLMODE_PROGRAMMED"
+       */
+      dw1 |= min_lod << GEN7_SAMPLER_DW1_MIN_LOD__SHIFT |
+             max_lod << GEN7_SAMPLER_DW1_MAX_LOD__SHIFT |
+             info->shadow_func << GEN7_SAMPLER_DW1_SHADOW_FUNC__SHIFT |
+             GEN7_SAMPLER_DW1_CUBECTRLMODE_PROGRAMMED;
+   } else {
+      dw1 |= min_lod << GEN6_SAMPLER_DW1_MIN_LOD__SHIFT |
+             max_lod << GEN6_SAMPLER_DW1_MAX_LOD__SHIFT |
+             GEN6_SAMPLER_DW1_CUBECTRLMODE_PROGRAMMED |
+             info->tcx_ctrl << GEN6_SAMPLER_DW1_U_WRAP__SHIFT |
+             info->tcy_ctrl << GEN6_SAMPLER_DW1_V_WRAP__SHIFT |
+             info->tcz_ctrl << GEN6_SAMPLER_DW1_R_WRAP__SHIFT;
+   }
+
+   dw3 = info->max_anisotropy << GEN6_SAMPLER_DW3_MAX_ANISO__SHIFT;
+
+   /* round the coordinates for linear filtering */
+   if (info->min_filter != GEN6_MAPFILTER_NEAREST) {
+      dw3 |= GEN6_SAMPLER_DW3_U_MIN_ROUND |
+             GEN6_SAMPLER_DW3_V_MIN_ROUND |
+             GEN6_SAMPLER_DW3_R_MIN_ROUND;
+   }
+   if (info->mag_filter != GEN6_MAPFILTER_NEAREST) {
+      dw3 |= GEN6_SAMPLER_DW3_U_MAG_ROUND |
+             GEN6_SAMPLER_DW3_V_MAG_ROUND |
+             GEN6_SAMPLER_DW3_R_MAG_ROUND;
+   }
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+      dw3 |= GEN7_SAMPLER_DW3_TRIQUAL_FULL |
+             info->tcx_ctrl << GEN7_SAMPLER_DW3_U_WRAP__SHIFT |
+             info->tcy_ctrl << GEN7_SAMPLER_DW3_V_WRAP__SHIFT |
+             info->tcz_ctrl << GEN7_SAMPLER_DW3_R_WRAP__SHIFT;
+
+      if (info->non_normalized)
+         dw3 |= GEN7_SAMPLER_DW3_NON_NORMALIZED_COORD;
+   } else {
+      if (info->non_normalized)
+         dw3 |= GEN6_SAMPLER_DW3_NON_NORMALIZED_COORD;
+   }
+
+   STATIC_ASSERT(ARRAY_SIZE(sampler->sampler) >= 3);
+   sampler->sampler[0] = dw0;
+   sampler->sampler[1] = dw1;
+   sampler->sampler[2] = dw3;
+
+   /* surface-dependent variants, applied in ilo_state_sampler_set_surface() */
+   sampler->filter_integer = sampler_get_gen6_integer_filters(dev, info);
+   sampler->filter_3d = sampler_get_gen6_3d_filters(dev, info);
+   sampler->addr_ctrl_1d = sampler_get_gen6_1d_addr_controls(dev, info);
+   sampler->addr_ctrl_2d_3d = sampler_get_gen6_2d_3d_addr_controls(dev, info);
+   sampler->addr_ctrl_cube = sampler_get_gen6_cube_addr_controls(dev, info);
+
+   sampler->non_normalized = info->non_normalized;
+
+   /*
+    * From the Sandy Bridge PRM, volume 4 part 1, page 21:
+    *
+    *     "[DevSNB] Errata: Incorrect behavior is observed in cases where the
+    *      min and mag mode filters are different and SurfMinLOD is nonzero.
+    *      The determination of MagMode uses the following equation instead of
+    *      the one in the above pseudocode:
+    *
+    *      MagMode = (LOD + SurfMinLOD - Base <= 0)"
+    *
+    * As a way to work around that, request Base to be set to SurfMinLod.
+    */
+   if (ilo_dev_gen(dev) == ILO_GEN(6) &&
+       info->min_filter != info->mag_filter)
+      sampler->base_to_surf_min_lod = true;
+
+   return true;
+}
+
+/* Pack the border color in every format Gen6 hardware may pick.
+ *
+ * All packing is done in uint32_t with explicit masking.  The previous
+ * approach of casting to a narrow signed type and OR'ing shifted values
+ * was broken: the narrow value is promoted back to int before the shift,
+ * so a negative SNORM channel sign-extends and corrupts the neighboring
+ * packed fields, and left-shifting a promoted value into the sign bit of
+ * int is undefined behavior.
+ */
+static bool
+sampler_border_set_gen6_SAMPLER_BORDER_COLOR_STATE(struct ilo_state_sampler_border *border,
+                                                   const struct ilo_dev *dev,
+                                                   const struct ilo_state_sampler_border_info *info)
+{
+   uint32_t dw[12];
+   float rgba[4];
+
+   /*
+    * From the Ivy Bridge PRM, volume 4 part 1, page 117:
+    *
+    *     "For ([DevSNB]), if border color is used, all formats must be
+    *      provided.  Hardware will choose the appropriate format based on
+    *      Surface Format and Texture Border Color Mode.  The values
+    *      represented by each format should be the same (other than being
+    *      subject to range-based clamping and precision) to avoid unexpected
+    *      behavior."
+    *
+    * XXX We do not honor info->is_integer yet.
+    */
+
+   ILO_DEV_ASSERT(dev, 6, 6);
+
+   /* make a copy so that we can clamp for SNORM and UNORM */
+   memcpy(rgba, info->rgba.f, sizeof(rgba));
+
+   /* IEEE_FP */
+   dw[1] = fui(rgba[0]);
+   dw[2] = fui(rgba[1]);
+   dw[3] = fui(rgba[2]);
+   dw[4] = fui(rgba[3]);
+
+   /* FLOAT_16 */
+   dw[5] = util_float_to_half(rgba[0]) |
+           (uint32_t) util_float_to_half(rgba[1]) << 16;
+   dw[6] = util_float_to_half(rgba[2]) |
+           (uint32_t) util_float_to_half(rgba[3]) << 16;
+
+   /* clamp to [-1.0f, 1.0f] */
+   rgba[0] = CLAMP(rgba[0], -1.0f, 1.0f);
+   rgba[1] = CLAMP(rgba[1], -1.0f, 1.0f);
+   rgba[2] = CLAMP(rgba[2], -1.0f, 1.0f);
+   rgba[3] = CLAMP(rgba[3], -1.0f, 1.0f);
+
+   /* SNORM16; mask keeps the low 16 two's-complement bits of each channel */
+   dw[9]  = ((uint32_t) util_iround(rgba[0] * 32767.0f) & 0xffff) |
+            ((uint32_t) util_iround(rgba[1] * 32767.0f) & 0xffff) << 16;
+   dw[10] = ((uint32_t) util_iround(rgba[2] * 32767.0f) & 0xffff) |
+            ((uint32_t) util_iround(rgba[3] * 32767.0f) & 0xffff) << 16;
+
+   /* SNORM8 */
+   dw[11] = ((uint32_t) util_iround(rgba[0] * 127.0f) & 0xff) |
+            ((uint32_t) util_iround(rgba[1] * 127.0f) & 0xff) << 8 |
+            ((uint32_t) util_iround(rgba[2] * 127.0f) & 0xff) << 16 |
+            ((uint32_t) util_iround(rgba[3] * 127.0f) & 0xff) << 24;
+
+   /* clamp to [0.0f, 1.0f] */
+   rgba[0] = CLAMP(rgba[0], 0.0f, 1.0f);
+   rgba[1] = CLAMP(rgba[1], 0.0f, 1.0f);
+   rgba[2] = CLAMP(rgba[2], 0.0f, 1.0f);
+   rgba[3] = CLAMP(rgba[3], 0.0f, 1.0f);
+
+   /* UNORM8; the clamped values are non-negative, no masking needed */
+   dw[0] = (uint32_t) util_iround(rgba[0] * 255.0f) |
+           (uint32_t) util_iround(rgba[1] * 255.0f) << 8 |
+           (uint32_t) util_iround(rgba[2] * 255.0f) << 16 |
+           (uint32_t) util_iround(rgba[3] * 255.0f) << 24;
+
+   /* UNORM16 */
+   dw[7] = (uint32_t) util_iround(rgba[0] * 65535.0f) |
+           (uint32_t) util_iround(rgba[1] * 65535.0f) << 16;
+   dw[8] = (uint32_t) util_iround(rgba[2] * 65535.0f) |
+           (uint32_t) util_iround(rgba[3] * 65535.0f) << 16;
+
+   STATIC_ASSERT(ARRAY_SIZE(border->color) >= 12);
+   memcpy(border->color, dw, sizeof(dw));
+
+   return true;
+}
+
+/* Store the border color for Gen7+, which in DX10/OGL border color mode
+ * only needs the R32G32B32A32 form; the four floats are copied verbatim.
+ */
+static bool
+sampler_border_set_gen7_SAMPLER_BORDER_COLOR_STATE(struct ilo_state_sampler_border *border,
+                                                   const struct ilo_dev *dev,
+                                                   const struct ilo_state_sampler_border_info *info)
+{
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   /*
+    * From the Ivy Bridge PRM, volume 4 part 1, page 116:
+    *
+    *     "In DX10/OGL mode, the format of the border color is
+    *      R32G32B32A32_FLOAT, regardless of the surface format chosen."
+    *
+    * From the Haswell PRM, volume 2d, page 240:
+    *
+    *     "So, SW will have to program the table in SAMPLER_BORDER_COLOR_STATE
+    *      at offsets DWORD16 to 19, as per the integer surface format type."
+    *
+    * From the Broadwell PRM, volume 2d, page 297:
+    *
+    *     "DX10/OGL mode: the format of the border color depends on the format
+    *      of the surface being sampled.  If the map format is UINT, then the
+    *      border color format is R32G32B32A32_UINT.  If the map format is
+    *      SINT, then the border color format is R32G32B32A32_SINT.  Otherwise,
+    *      the border color format is R32G32B32A32_FLOAT."
+    *
+    * XXX every Gen is different
+    */
+
+   STATIC_ASSERT(ARRAY_SIZE(border->color) >= 4);
+   memcpy(border->color, info->rgba.f, sizeof(info->rgba.f));
+
+   return true;
+}
+
+/* Initialize a sampler from \p info.  \p sampler must be zeroed first. */
+bool
+ilo_state_sampler_init(struct ilo_state_sampler *sampler,
+                       const struct ilo_dev *dev,
+                       const struct ilo_state_sampler_info *info)
+{
+   bool ok;
+
+   assert(ilo_is_zeroed(sampler, sizeof(*sampler)));
+
+   ok = sampler_set_gen6_SAMPLER_STATE(sampler, dev, info);
+   assert(ok);
+
+   return ok;
+}
+
+/* Initialize a disabled sampler.  \p sampler must be zeroed first. */
+bool
+ilo_state_sampler_init_disabled(struct ilo_state_sampler *sampler,
+                                const struct ilo_dev *dev)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   assert(ilo_is_zeroed(sampler, sizeof(*sampler)));
+
+   /* only the disable bit in DW0 matters */
+   sampler->sampler[0] = GEN6_SAMPLER_DW0_DISABLE;
+   sampler->sampler[1] = 0;
+   sampler->sampler[2] = 0;
+
+   return true;
+}
+
+/**
+ * Modify \p sampler to work with \p surf.  There will be loss of information.
+ * Callers should make a copy of the original sampler first.
+ */
+bool
+ilo_state_sampler_set_surface(struct ilo_state_sampler *sampler,
+                              const struct ilo_dev *dev,
+                              const struct ilo_state_surface *surf)
+{
+   uint32_t addr_ctrl;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   if (sampler->non_normalized) {
+      /* see sampler_validate_gen6_non_normalized() */
+      assert(surf->type == GEN6_SURFTYPE_2D ||
+             surf->type == GEN6_SURFTYPE_3D);
+      assert(!surf->min_lod && !surf->mip_count);
+   }
+
+   /* Gen6 errata workaround: point Base at SurfMinLod */
+   if (sampler->base_to_surf_min_lod) {
+      const uint8_t base = surf->min_lod << GEN6_SAMPLER_DW0_BASE_LOD__RADIX;
+
+      sampler->sampler[0] =
+         (sampler->sampler[0] & ~GEN6_SAMPLER_DW0_BASE_LOD__MASK) |
+         base << GEN6_SAMPLER_DW0_BASE_LOD__SHIFT;
+   }
+
+   /* swap in the precomputed filters for integer or 3D surfaces */
+   if (surf->is_integer || surf->type == GEN6_SURFTYPE_3D) {
+      const uint32_t mask = (GEN6_SAMPLER_DW0_MIP_FILTER__MASK |
+                             GEN6_SAMPLER_DW0_MIN_FILTER__MASK |
+                             GEN6_SAMPLER_DW0_MAG_FILTER__MASK);
+      const uint32_t filter = (surf->is_integer) ?
+         sampler->filter_integer : sampler->filter_3d;
+
+      assert((filter & mask) == filter);
+      sampler->sampler[0] = (sampler->sampler[0] & ~mask) |
+                            filter;
+   }
+
+   /* pick the wrap modes precomputed for this surface type */
+   switch (surf->type) {
+   case GEN6_SURFTYPE_1D:
+      addr_ctrl = sampler->addr_ctrl_1d;
+      break;
+   case GEN6_SURFTYPE_2D:
+   case GEN6_SURFTYPE_3D:
+      addr_ctrl = sampler->addr_ctrl_2d_3d;
+      break;
+   case GEN6_SURFTYPE_CUBE:
+      addr_ctrl = sampler->addr_ctrl_cube;
+      break;
+   default:
+      assert(!"unexpected surface type");
+      addr_ctrl = 0;
+      break;
+   }
+
+   /* the wrap-mode fields live in DW3 on Gen7+ and DW1 on Gen6 */
+   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+      const uint32_t mask = (GEN7_SAMPLER_DW3_U_WRAP__MASK |
+                             GEN7_SAMPLER_DW3_V_WRAP__MASK |
+                             GEN7_SAMPLER_DW3_R_WRAP__MASK);
+
+      assert((addr_ctrl & mask) == addr_ctrl);
+      sampler->sampler[2] = (sampler->sampler[2] & ~mask) |
+                            addr_ctrl;
+   } else {
+      const uint32_t mask = (GEN6_SAMPLER_DW1_U_WRAP__MASK |
+                             GEN6_SAMPLER_DW1_V_WRAP__MASK |
+                             GEN6_SAMPLER_DW1_R_WRAP__MASK);
+
+      assert((addr_ctrl & mask) == addr_ctrl);
+      sampler->sampler[1] = (sampler->sampler[1] & ~mask) |
+                            addr_ctrl;
+   }
+
+   return true;
+}
+
+/* Initialize the packed border color for the device's generation. */
+bool
+ilo_state_sampler_border_init(struct ilo_state_sampler_border *border,
+                              const struct ilo_dev *dev,
+                              const struct ilo_state_sampler_border_info *info)
+{
+   bool ok;
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+      ok = sampler_border_set_gen7_SAMPLER_BORDER_COLOR_STATE(border,
+            dev, info);
+   } else {
+      ok = sampler_border_set_gen6_SAMPLER_BORDER_COLOR_STATE(border,
+            dev, info);
+   }
+
+   assert(ok);
+
+   return ok;
+}
diff --git a/src/gallium/drivers/ilo/core/ilo_state_sampler.h b/src/gallium/drivers/ilo/core/ilo_state_sampler.h
new file mode 100644
index 00000000000..75c7620a678
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_sampler.h
@@ -0,0 +1,103 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Chia-I Wu <[email protected]>
+ */
+
+#ifndef ILO_STATE_SAMPLER_H
+#define ILO_STATE_SAMPLER_H
+
+#include "genhw/genhw.h"
+
+#include "ilo_core.h"
+#include "ilo_dev.h"
+
+struct ilo_state_surface;
+
+struct ilo_state_sampler_info {
+   /* use non-normalized (texel-space) texture coordinates */
+   bool non_normalized;
+
+   /* LOD bias and clamp range, in mip levels */
+   float lod_bias;
+   float min_lod;
+   float max_lod;
+
+   enum gen_mip_filter mip_filter;
+   enum gen_map_filter min_filter;
+   enum gen_map_filter mag_filter;
+   enum gen_aniso_ratio max_anisotropy;
+
+   /* per-axis texture coordinate address control (wrap) modes */
+   enum gen_texcoord_mode tcx_ctrl;
+   enum gen_texcoord_mode tcy_ctrl;
+   enum gen_texcoord_mode tcz_ctrl;
+
+   /* comparison function for shadow (sample_c) sampling */
+   enum gen_prefilter_op shadow_func;
+};
+
+struct ilo_state_sampler_border_info {
+   /* border color; f[] and ui[] alias the same 128 bits */
+   union {
+      float f[4];
+      uint32_t ui[4];
+   } rgba;
+
+   /* true when the color is meant for an integer surface format */
+   bool is_integer;
+};
+
+struct ilo_state_sampler {
+   /* SAMPLER_STATE DW0, DW1, and DW3 */
+   uint32_t sampler[3];
+
+   /* replacement filter bits for integer-format and 3D surfaces */
+   uint32_t filter_integer;
+   uint32_t filter_3d;
+
+   /* replacement wrap-mode bits, selected by surface type */
+   uint32_t addr_ctrl_1d;
+   uint32_t addr_ctrl_2d_3d;
+   uint32_t addr_ctrl_cube;
+
+   bool non_normalized;
+   /* Gen6 min!=mag errata: Base LOD must track SurfMinLod */
+   bool base_to_surf_min_lod;
+};
+
+struct ilo_state_sampler_border {
+   /* SAMPLER_BORDER_COLOR_STATE dwords (all 12 used on Gen6) */
+   uint32_t color[12];
+};
+
+bool
+ilo_state_sampler_init(struct ilo_state_sampler *sampler,
+ const struct ilo_dev *dev,
+ const struct ilo_state_sampler_info *info);
+
+bool
+ilo_state_sampler_init_disabled(struct ilo_state_sampler *sampler,
+ const struct ilo_dev *dev);
+
+bool
+ilo_state_sampler_set_surface(struct ilo_state_sampler *sampler,
+ const struct ilo_dev *dev,
+ const struct ilo_state_surface *surf);
+
+bool
+ilo_state_sampler_border_init(struct ilo_state_sampler_border *border,
+ const struct ilo_dev *dev,
+ const struct ilo_state_sampler_border_info *info);
+
+#endif /* ILO_STATE_SAMPLER_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_state_sbe.c b/src/gallium/drivers/ilo/core/ilo_state_sbe.c
new file mode 100644
index 00000000000..5d1d400acdd
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_sbe.c
@@ -0,0 +1,350 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2012-2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Chia-I Wu <[email protected]>
+ */
+
+#include "ilo_debug.h"
+#include "ilo_state_sbe.h"
+
+/*
+ * Validate an ilo_state_sbe_info against the hardware limits quoted in the
+ * PRM excerpts below.  Always returns true; violations are caught by the
+ * asserts in debug builds only.
+ */
+static bool
+sbe_validate_gen8(const struct ilo_dev *dev,
+ const struct ilo_state_sbe_info *info)
+{
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ assert(info->attr_count <= ILO_STATE_SBE_MAX_ATTR_COUNT);
+
+ /* the reads must stay within the VUE attributes actually written */
+ assert(info->vue_read_base + info->vue_read_count <=
+ info->cv_vue_attr_count);
+
+ /*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 248:
+ *
+ * "(Vertex URB Entry Read Length)
+ * Format: U5
+ * Range [1,16]
+ *
+ * Specifies the amount of URB data read for each Vertex URB entry, in
+ * 256-bit register increments.
+ *
+ * Programming Notes
+ * It is UNDEFINED to set this field to 0 indicating no Vertex URB
+ * data to be read."
+ *
+ * "(Vertex URB Entry Read Offset)
+ * Format: U6
+ * Range [0,63]
+ *
+ * Specifies the offset (in 256-bit units) at which Vertex URB data is
+ * to be read from the URB."
+ */
+ /* read base/offset are in attribute pairs: U6 * 2 == 126 max */
+ assert(info->vue_read_base % 2 == 0 && info->vue_read_base <= 126);
+ assert(info->vue_read_count <= 32);
+
+ /*
+ * From the Ivy Bridge PRM, volume 2 part 1, page 268:
+ *
+ * "This field (Point Sprite Texture Coordinate Enable) must be
+ * programmed to 0 when non-point primitives are rendered."
+ */
+ if (ilo_dev_gen(dev) < ILO_GEN(7.5) && info->point_sprite_enables)
+ assert(info->cv_is_point);
+
+ /*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 246:
+ *
+ * "(Number of SF Output Attributes) 33-48: Specifies 17-32 attributes
+ * (# attributes = field value - 16). Swizzling performed on
+ * Attributes 16-31 (as required) only. Attributes 0-15 passed through
+ * unmodified.
+ *
+ * Note :
+ *
+ * Attribute n Component Override and Constant Source states apply to
+ * Attributes 16-31 (as required) instead of Attributes 0-15. E.g.,
+ * this allows an Attribute 16-31 component to be overridden with the
+ * PrimitiveID value.
+ *
+ * Attribute n WrapShortest Enables still apply to Attributes 0-15.
+ *
+ * Attribute n Swizzle Select and Attribute n Source Attribute states
+ * are ignored and none of the swizzling functions available through
+ * these controls are performed."
+ *
+ * From the Sandy Bridge PRM, volume 2 part 1, page 247:
+ *
+ * "This bit (Attribute Swizzle Enable) controls the use of the
+ * Attribute n Swizzle Select and Attribute n Source Attribute fields
+ * only. If ENABLED, those fields are used as described below. If
+ * DISABLED, attributes are copied from their corresponding source
+ * attributes, for the purposes of Swizzle Select only.
+ *
+ * Note that the following fields are unaffected by this bit, and are
+ * therefore always used to control their respective fields:
+ * Attribute n Component Override X/Y/Z/W
+ * Attribute n Constant Source
+ * Attribute n WrapShortest Enables"
+ *
+ * From the Ivy Bridge PRM, volume 2 part 1, page 264:
+ *
+ * "When Attribute Swizzle Enable is ENABLED, this bit (Attribute
+ * Swizzle Control Mode) controls whether attributes 0-15 or 16-31 are
+ * subject to the following swizzle controls:
+ *
+ * - Attribute n Component Override X/Y/Z/W
+ * - Attribute n Constant Source
+ * - Attribute n Swizzle Select
+ * - Attribute n Source Attribute
+ * - Attribute n Wrap Shortest Enables"
+ *
+ * "SWIZ_16_31... Only valid when 16 or more attributes are output."
+ */
+ assert(info->swizzle_count <= ILO_STATE_SBE_MAX_SWIZZLE_COUNT);
+ if (info->swizzle_16_31) {
+ assert(ilo_dev_gen(dev) >= ILO_GEN(7) &&
+ info->swizzle_enable &&
+ info->attr_count > 16);
+ }
+
+ return true;
+}
+
+/*
+ * Return the minimum number of VUE attributes that must be read so that
+ * every SBE output attribute (pass-through or swizzled) has a source.
+ */
+static uint8_t
+sbe_get_gen8_min_read_count(const struct ilo_dev *dev,
+                            const struct ilo_state_sbe_info *info)
+{
+   uint8_t needed = 0;
+   uint8_t i;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* attributes not covered by the swizzle table are passed through */
+   if (!info->swizzle_enable || info->swizzle_count < info->attr_count) {
+      needed = (info->swizzle_16_31 &&
+                info->swizzle_count + 16 == info->attr_count) ?
+         16 : info->attr_count;
+   }
+
+   if (!info->swizzle_enable)
+      return needed;
+
+   /* swizzled attributes can source from anywhere among the reads */
+   for (i = 0; i < info->swizzle_count; i++) {
+      const struct ilo_state_sbe_swizzle_info *swz = &info->swizzles[i];
+      uint8_t last;
+
+      switch (swz->attr_select) {
+      case GEN6_INPUTATTR_FACING:
+      case GEN6_INPUTATTR_FACING_W:
+         /* facing selects also consume the attribute following swz->attr */
+         last = swz->attr + 2;
+         break;
+      default:
+         last = swz->attr + 1;
+         break;
+      }
+
+      if (needed < last)
+         needed = last;
+   }
+
+   return needed;
+}
+
+/*
+ * Return "Vertex URB Entry Read Length" in 256-bit units (attribute
+ * pairs), honoring the PRM erratum quoted below that forbids programming
+ * a length larger than what the maximum source attribute requires.
+ */
+static uint8_t
+sbe_get_gen8_read_length(const struct ilo_dev *dev,
+ const struct ilo_state_sbe_info *info)
+{
+ uint8_t read_len;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ /*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 248:
+ *
+ * "(Vertex URB Entry Read Length)
+ * This field should be set to the minimum length required to read the
+ * maximum source attribute. The maximum source attribute is indicated
+ * by the maximum value of the enabled Attribute # Source Attribute if
+ * Attribute Swizzle Enable is set, Number of Output Attributes -1 if
+ * enable is not set.
+ * read_length = ceiling((max_source_attr+1)/2)
+ *
+ * [errata] Corruption/Hang possible if length programmed larger than
+ * recommended"
+ */
+ if (info->has_min_read_count) {
+ /* the caller promises vue_read_count is already the minimum */
+ read_len = info->vue_read_count;
+ assert(read_len == sbe_get_gen8_min_read_count(dev, info));
+ } else {
+ read_len = sbe_get_gen8_min_read_count(dev, info);
+ assert(read_len <= info->vue_read_count);
+ }
+
+ /*
+ * In pairs. URB entries are aligned to 1024-bits or 512-bits. There is
+ * no need to worry about reading past entries.
+ */
+ read_len = (read_len + 1) / 2;
+ if (!read_len)
+ read_len = 1;
+
+ return read_len;
+}
+
+/*
+ * Pack the device-independent dwords of 3DSTATE_SBE (DW1-DW3 on Gen7+;
+ * the corresponding 3DSTATE_SF fields on Gen6).
+ */
+static bool
+sbe_set_gen8_3DSTATE_SBE(struct ilo_state_sbe *sbe,
+ const struct ilo_dev *dev,
+ const struct ilo_state_sbe_info *info)
+{
+ uint8_t vue_read_offset, vue_read_len;
+ uint8_t attr_count;
+ uint32_t dw1, dw2, dw3;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ if (!sbe_validate_gen8(dev, info))
+ return false;
+
+ /* hardware fields are in attribute pairs (256-bit units) */
+ vue_read_offset = info->vue_read_base / 2;
+ vue_read_len = sbe_get_gen8_read_length(dev, info);
+
+ attr_count = info->attr_count;
+ /*
+ * NOTE(review): sbe_validate_gen8() asserts swizzle_16_31 implies Gen7+,
+ * which makes this Gen6 adjustment look unreachable -- confirm intent.
+ */
+ if (ilo_dev_gen(dev) == ILO_GEN(6) && info->swizzle_16_31)
+ attr_count += 16;
+
+ dw1 = attr_count << GEN7_SBE_DW1_ATTR_COUNT__SHIFT |
+ vue_read_len << GEN7_SBE_DW1_URB_READ_LEN__SHIFT;
+
+ if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
+ dw1 |= GEN8_SBE_DW1_USE_URB_READ_LEN |
+ GEN8_SBE_DW1_USE_URB_READ_OFFSET |
+ vue_read_offset << GEN8_SBE_DW1_URB_READ_OFFSET__SHIFT;
+ } else {
+ dw1 |= vue_read_offset << GEN7_SBE_DW1_URB_READ_OFFSET__SHIFT;
+ }
+
+ if (ilo_dev_gen(dev) >= ILO_GEN(7) && info->swizzle_16_31)
+ dw1 |= GEN7_SBE_DW1_ATTR_SWIZZLE_16_31;
+
+ if (info->swizzle_enable)
+ dw1 |= GEN7_SBE_DW1_ATTR_SWIZZLE_ENABLE;
+
+ dw1 |= (info->point_sprite_origin_lower_left) ?
+ GEN7_SBE_DW1_POINT_SPRITE_TEXCOORD_LOWERLEFT :
+ GEN7_SBE_DW1_POINT_SPRITE_TEXCOORD_UPPERLEFT;
+
+ /* one enable bit per attribute */
+ dw2 = info->point_sprite_enables;
+ dw3 = info->const_interp_enables;
+
+ STATIC_ASSERT(ARRAY_SIZE(sbe->sbe) >= 3);
+ sbe->sbe[0] = dw1;
+ sbe->sbe[1] = dw2;
+ sbe->sbe[2] = dw3;
+
+ return true;
+}
+
+/*
+ * Pack the 3DSTATE_SBE_SWIZ swizzle table.  Entries beyond swizzle_count
+ * are programmed as pass-through (attribute i copied unmodified).
+ */
+static bool
+sbe_set_gen8_3DSTATE_SBE_SWIZ(struct ilo_state_sbe *sbe,
+ const struct ilo_dev *dev,
+ const struct ilo_state_sbe_info *info)
+{
+ uint16_t swiz[ILO_STATE_SBE_MAX_SWIZZLE_COUNT];
+ uint8_t i;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ for (i = 0; i < info->swizzle_count; i++) {
+ const struct ilo_state_sbe_swizzle_info *swizzle = &info->swizzles[i];
+
+ /* U5 */
+ assert(swizzle->attr < 32);
+ swiz[i] = swizzle->attr_select << GEN8_SBE_SWIZ_SWIZZLE_SELECT__SHIFT |
+ swizzle->attr << GEN8_SBE_SWIZ_SRC_ATTR__SHIFT;
+
+ /* override all four components with the constant 0.0 source */
+ if (swizzle->force_zeros) {
+ swiz[i] |= GEN8_SBE_SWIZ_OVERRIDE_W |
+ GEN8_SBE_SWIZ_OVERRIDE_Z |
+ GEN8_SBE_SWIZ_OVERRIDE_Y |
+ GEN8_SBE_SWIZ_OVERRIDE_X |
+ GEN8_SBE_SWIZ_CONST_0000;
+ }
+ }
+
+ /* remaining entries: identity mapping */
+ for (; i < ARRAY_SIZE(swiz); i++) {
+ swiz[i] = GEN6_INPUTATTR_NORMAL << GEN8_SBE_SWIZ_SWIZZLE_SELECT__SHIFT |
+ i << GEN8_SBE_SWIZ_SRC_ATTR__SHIFT;
+ }
+
+ /* 16 uint16_t entries packed into 8 uint32_t dwords */
+ STATIC_ASSERT(sizeof(sbe->swiz) == sizeof(swiz));
+ memcpy(sbe->swiz, swiz, sizeof(swiz));
+
+ return true;
+}
+
+/* initialize an SBE state object; the struct must be zero-filled first */
+bool
+ilo_state_sbe_init(struct ilo_state_sbe *sbe,
+                   const struct ilo_dev *dev,
+                   const struct ilo_state_sbe_info *info)
+{
+   /* a fresh object must start from all-zero */
+   assert(ilo_is_zeroed(sbe, sizeof(*sbe)));
+
+   return ilo_state_sbe_set_info(sbe, dev, info);
+}
+
+/*
+ * Initialize an SBE state for RECTLIST blitter use: attributes map 1:1 to
+ * the VUE reads starting at read_base, with no swizzling or sprite coords.
+ */
+bool
+ilo_state_sbe_init_for_rectlist(struct ilo_state_sbe *sbe,
+                                const struct ilo_dev *dev,
+                                uint8_t read_base,
+                                uint8_t read_count)
+{
+   /* designated initializer zeroes every field we do not name */
+   const struct ilo_state_sbe_info info = {
+      .attr_count = read_count,
+      .cv_vue_attr_count = read_base + read_count,
+      .vue_read_base = read_base,
+      .vue_read_count = read_count,
+      .has_min_read_count = true,
+   };
+
+   return ilo_state_sbe_set_info(sbe, dev, &info);
+}
+
+/*
+ * Re-pack the SBE state from info.  Returns false when the info is invalid
+ * and the packed state must not be used.
+ */
+bool
+ilo_state_sbe_set_info(struct ilo_state_sbe *sbe,
+                       const struct ilo_dev *dev,
+                       const struct ilo_state_sbe_info *info)
+{
+   bool ret = true;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   ret &= sbe_set_gen8_3DSTATE_SBE(sbe, dev, info);
+   ret &= sbe_set_gen8_3DSTATE_SBE_SWIZ(sbe, dev, info);
+
+   assert(ret);
+
+   /*
+    * bug fix: propagate the failure instead of returning true
+    * unconditionally, matching ilo_state_vs_init() and the other packers;
+    * in NDEBUG builds the assert above is compiled out and callers would
+    * otherwise never learn that packing failed
+    */
+   return ret;
+}
diff --git a/src/gallium/drivers/ilo/core/ilo_state_sbe.h b/src/gallium/drivers/ilo/core/ilo_state_sbe.h
new file mode 100644
index 00000000000..122999a9e94
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_sbe.h
@@ -0,0 +1,103 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Chia-I Wu <[email protected]>
+ */
+
+#ifndef ILO_STATE_SBE_H
+#define ILO_STATE_SBE_H
+
+#include "genhw/genhw.h"
+
+#include "ilo_core.h"
+#include "ilo_dev.h"
+
+/*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 264:
+ *
+ * "Number of SF Output Attributes sets the number of attributes that will
+ * be output from the SF stage, not including position. This can be used
+ * to specify up to 32, and may differ from the number of input
+ * attributes."
+ *
+ * "The first or last set of 16 attributes can be swizzled according to
+ * certain state fields."
+ */
+#define ILO_STATE_SBE_MAX_ATTR_COUNT 32
+#define ILO_STATE_SBE_MAX_SWIZZLE_COUNT 16
+
+/* one entry of the 3DSTATE_SBE_SWIZ table */
+struct ilo_state_sbe_swizzle_info {
+ /* select an attribute from read ones */
+ enum gen_inputattr_select attr_select;
+ /* source attribute index, U5 ([0, 31]) */
+ uint8_t attr;
+
+ /* override all four components with constant 0000 */
+ bool force_zeros;
+};
+
+/* CPU-side description used to pack 3DSTATE_SBE; see ilo_state_sbe_set_info() */
+struct ilo_state_sbe_info {
+ /* number of FS input attributes, [0, ILO_STATE_SBE_MAX_ATTR_COUNT] */
+ uint8_t attr_count;
+
+ /* which VUE attributes to read */
+ uint8_t cv_vue_attr_count;
+ uint8_t vue_read_base;
+ uint8_t vue_read_count;
+ /* when set, vue_read_count is exactly the minimum and is used as-is */
+ bool has_min_read_count;
+
+ bool cv_is_point;
+ bool point_sprite_origin_lower_left;
+ /* force sprite coordinates to the four corner vertices of the point */
+ uint32_t point_sprite_enables;
+
+ /* force attr at the provoking vertex to a0 and zero to a1/a2 */
+ uint32_t const_interp_enables;
+
+ bool swizzle_enable;
+ /* swizzle attribute 16 to 31 instead; Gen7+ only */
+ bool swizzle_16_31;
+ uint8_t swizzle_count;
+ const struct ilo_state_sbe_swizzle_info *swizzles;
+};
+
+/* packed 3DSTATE_SBE dwords and the 3DSTATE_SBE_SWIZ table */
+struct ilo_state_sbe {
+ uint32_t sbe[3];
+ /* 16 uint16_t swizzle entries packed two per dword */
+ uint32_t swiz[8];
+};
+
+/* initialize a zero-filled SBE state object from info */
+bool
+ilo_state_sbe_init(struct ilo_state_sbe *sbe,
+ const struct ilo_dev *dev,
+ const struct ilo_state_sbe_info *info);
+
+/* initialize an SBE state for RECTLIST (blitter) use */
+bool
+ilo_state_sbe_init_for_rectlist(struct ilo_state_sbe *sbe,
+ const struct ilo_dev *dev,
+ uint8_t read_base,
+ uint8_t read_count);
+
+/* re-pack an SBE state object from info */
+bool
+ilo_state_sbe_set_info(struct ilo_state_sbe *sbe,
+ const struct ilo_dev *dev,
+ const struct ilo_state_sbe_info *info);
+
+#endif /* ILO_STATE_SBE_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_state_shader.c b/src/gallium/drivers/ilo/core/ilo_state_shader.c
new file mode 100644
index 00000000000..f67326c7f10
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_shader.c
@@ -0,0 +1,737 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2012-2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Chia-I Wu <[email protected]>
+ */
+
+#include "ilo_debug.h"
+#include "ilo_state_shader.h"
+
+/* vertex pipeline stages that share the Gen6 thread-dispatch layout */
+enum vertex_stage {
+ STAGE_VS,
+ STAGE_HS,
+ STAGE_DS,
+ STAGE_GS,
+};
+
+/* fixed-function fields shared by the VS/HS/DS/GS packers, derived from
+ * the shader info by vertex_get_gen6_ff() */
+struct vertex_ff {
+ uint8_t grf_start;
+ /* log2 of per-thread scratch size, in 1KB units (hardware encoding) */
+ uint8_t scratch_space;
+
+ /* prefetch hints, in hardware encodings */
+ uint8_t sampler_count;
+ uint8_t surface_count;
+ bool has_uav;
+
+ /* VUE reads, in attribute pairs (256-bit units) */
+ uint8_t vue_read_offset;
+ uint8_t vue_read_len;
+
+ uint8_t user_clip_enables;
+};
+
+/*
+ * Validate the kernel info against the per-stage GRF-start and scratch
+ * limits.  Always returns true; violations trip debug-build asserts.
+ */
+static bool
+vertex_validate_gen6_kernel(const struct ilo_dev *dev,
+ enum vertex_stage stage,
+ const struct ilo_state_shader_kernel_info *kernel)
+{
+ /*
+ * "Dispatch GRF Start Register for URB Data" is U4 for GS and U5 for
+ * others.
+ */
+ const uint8_t max_grf_start = (stage == STAGE_GS) ? 16 : 32;
+ /*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 134:
+ *
+ * "(Per-Thread Scratch Space)
+ * Range [0,11] indicating [1K Bytes, 2M Bytes]"
+ */
+ const uint32_t max_scratch_size = 2 * 1024 * 1024;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ /* we do not want to save it */
+ assert(!kernel->offset);
+
+ assert(kernel->grf_start < max_grf_start);
+ assert(kernel->scratch_size <= max_scratch_size);
+
+ return true;
+}
+
+/*
+ * Validate the URB read range against the per-stage field widths.
+ * Always returns true; violations trip debug-build asserts.
+ */
+static bool
+vertex_validate_gen6_urb(const struct ilo_dev *dev,
+ enum vertex_stage stage,
+ const struct ilo_state_shader_urb_info *urb)
+{
+ /* "Vertex/Patch URB Entry Read Offset" is U6, in pairs */
+ const uint8_t max_read_base = 63 * 2;
+ /*
+ * "Vertex/Patch URB Entry Read Length" is limited to 64 for DS and U6 for
+ * others, in pairs
+ */
+ const uint8_t max_read_count = ((stage == STAGE_DS) ? 64 : 63) * 2;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ /* the reads must stay within the attributes the previous stage wrote */
+ assert(urb->read_base + urb->read_count <= urb->cv_input_attr_count);
+
+ assert(urb->read_base % 2 == 0 && urb->read_base <= max_read_base);
+
+ /*
+ * There is no need to worry about reading past entries, as URB entries are
+ * aligned to 1024-bits (Gen6) or 512-bits (Gen7+).
+ */
+ assert(urb->read_count <= max_read_count);
+
+ return true;
+}
+
+/*
+ * Derive the shared fixed-function fields (GRF start, scratch encoding,
+ * prefetch counts, VUE read range) from the shader info.  Returns false
+ * when validation fails.
+ */
+static bool
+vertex_get_gen6_ff(const struct ilo_dev *dev,
+ enum vertex_stage stage,
+ const struct ilo_state_shader_kernel_info *kernel,
+ const struct ilo_state_shader_resource_info *resource,
+ const struct ilo_state_shader_urb_info *urb,
+ struct vertex_ff *ff)
+{
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ if (!vertex_validate_gen6_kernel(dev, stage, kernel) ||
+ !vertex_validate_gen6_urb(dev, stage, urb))
+ return false;
+
+ ff->grf_start = kernel->grf_start;
+ /* next power of two, starting from 1KB */
+ ff->scratch_space = (kernel->scratch_size > 1024) ?
+ (util_last_bit(kernel->scratch_size - 1) - 10): 0;
+
+ /* hardware field is in units of 4 samplers, saturating at encoding 4 */
+ ff->sampler_count = (resource->sampler_count <= 12) ?
+ (resource->sampler_count + 3) / 4 : 4;
+ ff->surface_count = resource->surface_count;
+ ff->has_uav = resource->has_uav;
+
+ /* convert attributes to attribute pairs (256-bit units), rounding up */
+ ff->vue_read_offset = urb->read_base / 2;
+ ff->vue_read_len = (urb->read_count + 1) / 2;
+
+ /* need to read something unless VUE handles are included */
+ switch (stage) {
+ case STAGE_VS:
+ if (!ff->vue_read_len)
+ ff->vue_read_len = 1;
+
+ /* one GRF per attribute */
+ assert(kernel->grf_start + urb->read_count * 2 <= 128);
+ break;
+ case STAGE_GS:
+ if (ilo_dev_gen(dev) == ILO_GEN(6) && !ff->vue_read_len)
+ ff->vue_read_len = 1;
+ break;
+ default:
+ break;
+ }
+
+ ff->user_clip_enables = urb->user_clip_enables;
+
+ return true;
+}
+
+/* return the encoded "Maximum Number of Threads" of 3DSTATE_VS */
+static uint16_t
+vs_get_gen6_thread_count(const struct ilo_dev *dev,
+                         const struct ilo_state_vs_info *info)
+{
+   uint16_t max_threads;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   if (ilo_dev_gen(dev) == ILO_GEN(8))
+      max_threads = 504;
+   else if (ilo_dev_gen(dev) == ILO_GEN(7.5))
+      max_threads = (dev->gt >= 2) ? 280 : 70;
+   else
+      max_threads = dev->thread_count;
+
+   /* the hardware field holds the count minus one */
+   return max_threads - 1;
+}
+
+/*
+ * Pack the device-independent dwords of 3DSTATE_VS (DW2-DW5, plus the
+ * Gen8 clip-enable dword).  Returns false when the info is invalid.
+ */
+static bool
+vs_set_gen6_3DSTATE_VS(struct ilo_state_vs *vs,
+ const struct ilo_dev *dev,
+ const struct ilo_state_vs_info *info)
+{
+ struct vertex_ff ff;
+ uint16_t thread_count;
+ uint32_t dw2, dw3, dw4, dw5;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ if (!vertex_get_gen6_ff(dev, STAGE_VS, &info->kernel,
+ &info->resource, &info->urb, &ff))
+ return false;
+
+ thread_count = vs_get_gen6_thread_count(dev, info);
+
+ dw2 = ff.sampler_count << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT |
+ ff.surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT;
+
+ /* NOTE(review): permanently-disabled placeholder for ALT FP mode */
+ if (false)
+ dw2 |= GEN6_THREADDISP_FP_MODE_ALT;
+
+ if (ilo_dev_gen(dev) >= ILO_GEN(7.5) && ff.has_uav)
+ dw2 |= GEN75_THREADDISP_ACCESS_UAV;
+
+ dw3 = ff.scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
+
+ dw4 = ff.grf_start << GEN6_VS_DW4_URB_GRF_START__SHIFT |
+ ff.vue_read_len << GEN6_VS_DW4_URB_READ_LEN__SHIFT |
+ ff.vue_read_offset << GEN6_VS_DW4_URB_READ_OFFSET__SHIFT;
+
+ dw5 = 0;
+
+ if (ilo_dev_gen(dev) >= ILO_GEN(7.5))
+ dw5 |= thread_count << GEN75_VS_DW5_MAX_THREADS__SHIFT;
+ else
+ dw5 |= thread_count << GEN6_VS_DW5_MAX_THREADS__SHIFT;
+
+ if (info->stats_enable)
+ dw5 |= GEN6_VS_DW5_STATISTICS;
+ if (info->dispatch_enable)
+ dw5 |= GEN6_VS_DW5_VS_ENABLE;
+
+ STATIC_ASSERT(ARRAY_SIZE(vs->vs) >= 5);
+ vs->vs[0] = dw2;
+ vs->vs[1] = dw3;
+ vs->vs[2] = dw4;
+ vs->vs[3] = dw5;
+
+ /* Gen8 moved the UCP clip enables into 3DSTATE_VS */
+ if (ilo_dev_gen(dev) >= ILO_GEN(8))
+ vs->vs[4] = ff.user_clip_enables << GEN8_VS_DW8_UCP_CLIP_ENABLES__SHIFT;
+
+ return true;
+}
+
+/* return the encoded "Maximum Number of Threads" of 3DSTATE_HS */
+static uint16_t
+hs_get_gen7_thread_count(const struct ilo_dev *dev,
+                         const struct ilo_state_hs_info *info)
+{
+   uint16_t max_threads;
+
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   if (ilo_dev_gen(dev) == ILO_GEN(8))
+      max_threads = 504;
+   else if (ilo_dev_gen(dev) == ILO_GEN(7.5))
+      max_threads = (dev->gt >= 2) ? 256 : 70;
+   else
+      max_threads = dev->thread_count;
+
+   /* the hardware field holds the count minus one */
+   return max_threads - 1;
+}
+
+/*
+ * Pack the device-independent dwords of 3DSTATE_HS (DW1, DW2, DW4, DW5).
+ * Returns false when the info is invalid.
+ */
+static bool
+hs_set_gen7_3DSTATE_HS(struct ilo_state_hs *hs,
+ const struct ilo_dev *dev,
+ const struct ilo_state_hs_info *info)
+{
+ struct vertex_ff ff;
+ uint16_t thread_count;
+ uint32_t dw1, dw2, dw4, dw5;
+
+ ILO_DEV_ASSERT(dev, 7, 8);
+
+ if (!vertex_get_gen6_ff(dev, STAGE_HS, &info->kernel,
+ &info->resource, &info->urb, &ff))
+ return false;
+
+ thread_count = hs_get_gen7_thread_count(dev, info);
+
+ dw1 = ff.sampler_count << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT |
+ ff.surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT;
+
+ /* the max-threads field moved within DW1 on Gen7.5 */
+ if (ilo_dev_gen(dev) >= ILO_GEN(7.5))
+ dw1 |= thread_count << GEN75_HS_DW1_DISPATCH_MAX_THREADS__SHIFT;
+ else
+ dw1 |= thread_count << GEN7_HS_DW1_DISPATCH_MAX_THREADS__SHIFT;
+
+ /* single-instance dispatch */
+ dw2 = 0 << GEN7_HS_DW2_INSTANCE_COUNT__SHIFT;
+
+ if (info->dispatch_enable)
+ dw2 |= GEN7_HS_DW2_HS_ENABLE;
+ if (info->stats_enable)
+ dw2 |= GEN7_HS_DW2_STATISTICS;
+
+ dw4 = ff.scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
+
+ dw5 = GEN7_HS_DW5_INCLUDE_VERTEX_HANDLES |
+ ff.grf_start << GEN7_HS_DW5_URB_GRF_START__SHIFT |
+ ff.vue_read_len << GEN7_HS_DW5_URB_READ_LEN__SHIFT |
+ ff.vue_read_offset << GEN7_HS_DW5_URB_READ_OFFSET__SHIFT;
+
+ if (ilo_dev_gen(dev) >= ILO_GEN(7.5) && ff.has_uav)
+ dw5 |= GEN75_HS_DW5_ACCESS_UAV;
+
+ STATIC_ASSERT(ARRAY_SIZE(hs->hs) >= 4);
+ hs->hs[0] = dw1;
+ hs->hs[1] = dw2;
+ hs->hs[2] = dw4;
+ hs->hs[3] = dw5;
+
+ return true;
+}
+
+/* pack the device-independent dwords of 3DSTATE_TE */
+static bool
+ds_set_gen7_3DSTATE_TE(struct ilo_state_ds *ds,
+                       const struct ilo_dev *dev,
+                       const struct ilo_state_ds_info *info)
+{
+   uint32_t dw1 = 0;
+
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   if (info->dispatch_enable)
+      dw1 |= GEN7_TE_DW1_MODE_HW | GEN7_TE_DW1_TE_ENABLE;
+
+   STATIC_ASSERT(ARRAY_SIZE(ds->te) >= 3);
+   ds->te[0] = dw1;
+   /* maximum tessellation factors, stored as raw float bit patterns */
+   ds->te[1] = fui(63.0f);
+   ds->te[2] = fui(64.0f);
+
+   return true;
+}
+
+/* return the encoded "Maximum Number of Threads" of 3DSTATE_DS */
+static uint16_t
+ds_get_gen7_thread_count(const struct ilo_dev *dev,
+                         const struct ilo_state_ds_info *info)
+{
+   uint16_t max_threads;
+
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   if (ilo_dev_gen(dev) == ILO_GEN(8))
+      max_threads = 504;
+   else if (ilo_dev_gen(dev) == ILO_GEN(7.5))
+      max_threads = (dev->gt >= 2) ? 280 : 70;
+   else
+      max_threads = dev->thread_count;
+
+   /* the hardware field holds the count minus one */
+   return max_threads - 1;
+}
+
+/*
+ * Pack the device-independent dwords of 3DSTATE_DS (DW2-DW5, plus the
+ * Gen8 clip-enable dword).  Returns false when the info is invalid.
+ */
+static bool
+ds_set_gen7_3DSTATE_DS(struct ilo_state_ds *ds,
+ const struct ilo_dev *dev,
+ const struct ilo_state_ds_info *info)
+{
+ struct vertex_ff ff;
+ uint16_t thread_count;
+ uint32_t dw2, dw3, dw4, dw5;
+
+ ILO_DEV_ASSERT(dev, 7, 8);
+
+ if (!vertex_get_gen6_ff(dev, STAGE_DS, &info->kernel,
+ &info->resource, &info->urb, &ff))
+ return false;
+
+ thread_count = ds_get_gen7_thread_count(dev, info);
+
+ dw2 = ff.sampler_count << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT |
+ ff.surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT;
+
+ if (ilo_dev_gen(dev) >= ILO_GEN(7.5) && ff.has_uav)
+ dw2 |= GEN75_THREADDISP_ACCESS_UAV;
+
+ dw3 = ff.scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
+
+ dw4 = ff.grf_start << GEN7_DS_DW4_URB_GRF_START__SHIFT |
+ ff.vue_read_len << GEN7_DS_DW4_URB_READ_LEN__SHIFT |
+ ff.vue_read_offset << GEN7_DS_DW4_URB_READ_OFFSET__SHIFT;
+
+ dw5 = 0;
+
+ if (ilo_dev_gen(dev) >= ILO_GEN(7.5))
+ dw5 |= thread_count << GEN75_DS_DW5_MAX_THREADS__SHIFT;
+ else
+ dw5 |= thread_count << GEN7_DS_DW5_MAX_THREADS__SHIFT;
+
+ if (info->stats_enable)
+ dw5 |= GEN7_DS_DW5_STATISTICS;
+ if (info->dispatch_enable)
+ dw5 |= GEN7_DS_DW5_DS_ENABLE;
+
+ STATIC_ASSERT(ARRAY_SIZE(ds->ds) >= 5);
+ ds->ds[0] = dw2;
+ ds->ds[1] = dw3;
+ ds->ds[2] = dw4;
+ ds->ds[3] = dw5;
+
+ /* Gen8 moved the UCP clip enables into 3DSTATE_DS */
+ if (ilo_dev_gen(dev) >= ILO_GEN(8))
+ ds->ds[4] = ff.user_clip_enables << GEN8_DS_DW8_UCP_CLIP_ENABLES__SHIFT;
+
+ return true;
+}
+
+/*
+ * Derive the GS fixed-function fields, adding the GS-specific vertex-size
+ * checks quoted below on top of the common vertex_get_gen6_ff() ones.
+ */
+static bool
+gs_get_gen6_ff(const struct ilo_dev *dev,
+ const struct ilo_state_gs_info *info,
+ struct vertex_ff *ff)
+{
+ const struct ilo_state_shader_urb_info *urb = &info->urb;
+ const struct ilo_state_gs_sol_info *sol = &info->sol;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ if (!vertex_get_gen6_ff(dev, STAGE_GS, &info->kernel,
+ &info->resource, &info->urb, ff))
+ return false;
+
+ /*
+ * From the Ivy Bridge PRM, volume 2 part 1, page 168-169:
+ *
+ * "[0,62] indicating [1,63] 16B units"
+ *
+ * "Programming Restrictions: The vertex size must be programmed as a
+ * multiple of 32B units with the following exception: Rendering is
+ * disabled (as per SOL stage state) and the vertex size output by the
+ * GS thread is 16B.
+ *
+ * If rendering is enabled (as per SOL state) the vertex size must be
+ * programmed as a multiple of 32B units. In other words, the only
+ * time software can program a vertex size with an odd number of 16B
+ * units is when rendering is disabled."
+ */
+ assert(urb->output_attr_count <= 63);
+ if (!sol->render_disable)
+ assert(urb->output_attr_count % 2 == 0);
+
+ return true;
+}
+
+/*
+ * Return the encoded "Maximum Number of Threads" of 3DSTATE_GS, which on
+ * Gen6/7 is further limited when SOL rendering is enabled.
+ */
+static uint16_t
+gs_get_gen6_thread_count(const struct ilo_dev *dev,
+ const struct ilo_state_gs_info *info)
+{
+ const struct ilo_state_gs_sol_info *sol = &info->sol;
+ uint16_t thread_count;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ /* Maximum Number of Threads of 3DSTATE_GS */
+ switch (ilo_dev_gen(dev)) {
+ case ILO_GEN(8):
+ thread_count = 504;
+ break;
+ case ILO_GEN(7.5):
+ thread_count = (dev->gt >= 2) ? 256 : 70;
+ break;
+ case ILO_GEN(7):
+ case ILO_GEN(6):
+ default:
+ thread_count = dev->thread_count;
+
+ /*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 154:
+ *
+ * "Maximum Number of Threads valid range is [0,27] when Rendering
+ * Enabled bit is set."
+ *
+ * According to the classic driver, [0, 20] for GT1.
+ */
+ if (!sol->render_disable)
+ thread_count = (dev->gt == 2) ? 27 : 20;
+ break;
+ }
+
+ /* the hardware field holds the count minus one */
+ return thread_count - 1;
+}
+
+/*
+ * Pack the device-independent dwords of the Gen6 3DSTATE_GS (DW2-DW6),
+ * including the SVBI / stream-output controls that only exist on Gen6.
+ */
+static bool
+gs_set_gen6_3DSTATE_GS(struct ilo_state_gs *gs,
+ const struct ilo_dev *dev,
+ const struct ilo_state_gs_info *info)
+{
+ const struct ilo_state_gs_sol_info *sol = &info->sol;
+ struct vertex_ff ff;
+ uint16_t thread_count;
+ uint32_t dw2, dw3, dw4, dw5, dw6;
+
+ ILO_DEV_ASSERT(dev, 6, 6);
+
+ if (!gs_get_gen6_ff(dev, info, &ff))
+ return false;
+
+ thread_count = gs_get_gen6_thread_count(dev, info);
+
+ /* Gen6 GS kernels run in single-program-flow mode */
+ dw2 = GEN6_THREADDISP_SPF |
+ ff.sampler_count << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT |
+ ff.surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT;
+
+ dw3 = ff.scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
+
+ dw4 = ff.vue_read_len << GEN6_GS_DW4_URB_READ_LEN__SHIFT |
+ ff.vue_read_offset << GEN6_GS_DW4_URB_READ_OFFSET__SHIFT |
+ ff.grf_start << GEN6_GS_DW4_URB_GRF_START__SHIFT;
+
+ dw5 = thread_count << GEN6_GS_DW5_MAX_THREADS__SHIFT;
+
+ if (info->stats_enable)
+ dw5 |= GEN6_GS_DW5_STATISTICS;
+ if (sol->stats_enable)
+ dw5 |= GEN6_GS_DW5_SO_STATISTICS;
+ if (!sol->render_disable)
+ dw5 |= GEN6_GS_DW5_RENDER_ENABLE;
+
+ dw6 = 0;
+
+ /* GEN7_REORDER_TRAILING is handled by the kernel */
+ if (sol->tristrip_reorder == GEN7_REORDER_LEADING)
+ dw6 |= GEN6_GS_DW6_REORDER_LEADING_ENABLE;
+
+ if (sol->sol_enable) {
+ dw6 |= GEN6_GS_DW6_SVBI_PAYLOAD_ENABLE;
+
+ if (sol->svbi_post_inc) {
+ dw6 |= GEN6_GS_DW6_SVBI_POST_INC_ENABLE |
+ sol->svbi_post_inc << GEN6_GS_DW6_SVBI_POST_INC_VAL__SHIFT;
+ }
+ }
+
+ if (info->dispatch_enable)
+ dw6 |= GEN6_GS_DW6_GS_ENABLE;
+
+ STATIC_ASSERT(ARRAY_SIZE(gs->gs) >= 5);
+ gs->gs[0] = dw2;
+ gs->gs[1] = dw3;
+ gs->gs[2] = dw4;
+ gs->gs[3] = dw5;
+ gs->gs[4] = dw6;
+
+ return true;
+}
+
+/* return the "Output Vertex Size" field value: size in 16B units, minus one */
+static uint8_t
+gs_get_gen7_vertex_size(const struct ilo_dev *dev,
+                        const struct ilo_state_gs_info *info)
+{
+   const uint8_t attr_count = info->urb.output_attr_count;
+
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   /* clamp the zero-attribute case to the minimum encoding */
+   if (attr_count)
+      return attr_count - 1;
+
+   return 0;
+}
+
+/*
+ * Pack the device-independent dwords of the Gen7+ 3DSTATE_GS (DW2-DW5,
+ * plus the Gen8 clip-enable dword).  Returns false when invalid.
+ */
+static bool
+gs_set_gen7_3DSTATE_GS(struct ilo_state_gs *gs,
+                       const struct ilo_dev *dev,
+                       const struct ilo_state_gs_info *info)
+{
+   struct vertex_ff ff;
+   uint16_t thread_count;
+   uint8_t vertex_size;
+   uint32_t dw2, dw3, dw4, dw5;
+
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   if (!gs_get_gen6_ff(dev, info, &ff))
+      return false;
+
+   thread_count = gs_get_gen6_thread_count(dev, info);
+   vertex_size = gs_get_gen7_vertex_size(dev, info);
+
+   dw2 = ff.sampler_count << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT |
+         ff.surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT;
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(7.5) && ff.has_uav)
+      dw2 |= GEN75_THREADDISP_ACCESS_UAV;
+
+   dw3 = ff.scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
+
+   dw4 = vertex_size << GEN7_GS_DW4_OUTPUT_SIZE__SHIFT |
+         0 << GEN7_GS_DW4_OUTPUT_TOPO__SHIFT |
+         ff.vue_read_len << GEN7_GS_DW4_URB_READ_LEN__SHIFT |
+         GEN7_GS_DW4_INCLUDE_VERTEX_HANDLES |
+         ff.vue_read_offset << GEN7_GS_DW4_URB_READ_OFFSET__SHIFT |
+         ff.grf_start << GEN7_GS_DW4_URB_GRF_START__SHIFT;
+
+   dw5 = 0;
+
+   /*
+    * fix: OR the thread count into dw5 instead of overwriting it, matching
+    * vs_set_gen6_3DSTATE_VS() and ds_set_gen7_3DSTATE_DS(); the original
+    * "dw5 =" made the "dw5 = 0" above a dead store and would silently drop
+    * any bit set between the two statements
+    */
+   if (ilo_dev_gen(dev) >= ILO_GEN(7.5))
+      dw5 |= thread_count << GEN75_GS_DW5_MAX_THREADS__SHIFT;
+   else
+      dw5 |= thread_count << GEN7_GS_DW5_MAX_THREADS__SHIFT;
+
+   if (info->stats_enable)
+      dw5 |= GEN7_GS_DW5_STATISTICS;
+   if (info->dispatch_enable)
+      dw5 |= GEN7_GS_DW5_GS_ENABLE;
+
+   STATIC_ASSERT(ARRAY_SIZE(gs->gs) >= 5);
+   gs->gs[0] = dw2;
+   gs->gs[1] = dw3;
+   gs->gs[2] = dw4;
+   gs->gs[3] = dw5;
+
+   /* Gen8 moved the UCP clip enables into 3DSTATE_GS */
+   if (ilo_dev_gen(dev) >= ILO_GEN(8))
+      gs->gs[4] = ff.user_clip_enables << GEN8_GS_DW9_UCP_CLIP_ENABLES__SHIFT;
+
+   return true;
+}
+
+/* initialize a VS state object; the struct must be zero-filled first */
+bool
+ilo_state_vs_init(struct ilo_state_vs *vs,
+                  const struct ilo_dev *dev,
+                  const struct ilo_state_vs_info *info)
+{
+   bool ok;
+
+   assert(ilo_is_zeroed(vs, sizeof(*vs)));
+
+   ok = vs_set_gen6_3DSTATE_VS(vs, dev, info);
+   assert(ok);
+
+   return ok;
+}
+
+/* initialize a VS state object with the stage disabled */
+bool
+ilo_state_vs_init_disabled(struct ilo_state_vs *vs,
+                           const struct ilo_dev *dev)
+{
+   /* an all-zero info packs a disabled stage */
+   const struct ilo_state_vs_info info = { 0 };
+
+   return ilo_state_vs_init(vs, dev, &info);
+}
+
+/* initialize an HS state object; the struct must be zero-filled first */
+bool
+ilo_state_hs_init(struct ilo_state_hs *hs,
+                  const struct ilo_dev *dev,
+                  const struct ilo_state_hs_info *info)
+{
+   bool ok = true;
+
+   assert(ilo_is_zeroed(hs, sizeof(*hs)));
+
+   /* the HS stage exists only on Gen7+ */
+   if (ilo_dev_gen(dev) >= ILO_GEN(7))
+      ok = hs_set_gen7_3DSTATE_HS(hs, dev, info);
+
+   assert(ok);
+
+   return ok;
+}
+
+/* initialize an HS state object with the stage disabled */
+bool
+ilo_state_hs_init_disabled(struct ilo_state_hs *hs,
+                           const struct ilo_dev *dev)
+{
+   /* an all-zero info packs a disabled stage */
+   const struct ilo_state_hs_info info = { 0 };
+
+   return ilo_state_hs_init(hs, dev, &info);
+}
+
+/* initialize a DS state object; the struct must be zero-filled first */
+bool
+ilo_state_ds_init(struct ilo_state_ds *ds,
+                  const struct ilo_dev *dev,
+                  const struct ilo_state_ds_info *info)
+{
+   bool ok = true;
+
+   assert(ilo_is_zeroed(ds, sizeof(*ds)));
+
+   /* the DS/TE stages exist only on Gen7+; pack both commands */
+   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+      ok = ds_set_gen7_3DSTATE_TE(ds, dev, info);
+      ok = ds_set_gen7_3DSTATE_DS(ds, dev, info) && ok;
+   }
+
+   assert(ok);
+
+   return ok;
+}
+
+/* initialize a DS state object with the stage disabled */
+bool
+ilo_state_ds_init_disabled(struct ilo_state_ds *ds,
+                           const struct ilo_dev *dev)
+{
+   /* an all-zero info packs a disabled stage */
+   const struct ilo_state_ds_info info = { 0 };
+
+   return ilo_state_ds_init(ds, dev, &info);
+}
+
+/* initialize a GS state object; the struct must be zero-filled first */
+bool
+ilo_state_gs_init(struct ilo_state_gs *gs,
+                  const struct ilo_dev *dev,
+                  const struct ilo_state_gs_info *info)
+{
+   bool ok;
+
+   assert(ilo_is_zeroed(gs, sizeof(*gs)));
+
+   /* Gen6 uses a very different 3DSTATE_GS layout */
+   ok = (ilo_dev_gen(dev) >= ILO_GEN(7)) ?
+      gs_set_gen7_3DSTATE_GS(gs, dev, info) :
+      gs_set_gen6_3DSTATE_GS(gs, dev, info);
+
+   assert(ok);
+
+   return ok;
+}
+
+/* initialize a GS state object with the stage disabled */
+bool
+ilo_state_gs_init_disabled(struct ilo_state_gs *gs,
+                           const struct ilo_dev *dev)
+{
+   /* an all-zero info packs a disabled stage */
+   const struct ilo_state_gs_info info = { 0 };
+
+   return ilo_state_gs_init(gs, dev, &info);
+}
diff --git a/src/gallium/drivers/ilo/core/ilo_state_shader.h b/src/gallium/drivers/ilo/core/ilo_state_shader.h
new file mode 100644
index 00000000000..44690c5b0bb
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_shader.h
@@ -0,0 +1,256 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Chia-I Wu <[email protected]>
+ */
+
+#ifndef ILO_STATE_SHADER_H
+#define ILO_STATE_SHADER_H
+
+#include "genhw/genhw.h"
+
+#include "ilo_core.h"
+#include "ilo_dev.h"
+
+/**
+ * Kernel information.
+ */
+struct ilo_state_shader_kernel_info {
+ /* usually 0 unless the shader has multiple kernels */
+ uint32_t offset;
+
+ uint8_t grf_start;
+ uint8_t pcb_attr_count;
+
+ uint32_t scratch_size;
+};
+
+/**
+ * Shader resources.
+ */
+struct ilo_state_shader_resource_info {
+ /* for prefetches */
+ uint8_t sampler_count;
+ uint8_t surface_count;
+
+ bool has_uav;
+};
+
+/**
+ * URB inputs/outputs.
+ */
+struct ilo_state_shader_urb_info {
+ uint8_t cv_input_attr_count;
+
+ uint8_t read_base;
+ uint8_t read_count;
+
+ uint8_t output_attr_count;
+
+ uint8_t user_cull_enables;
+ uint8_t user_clip_enables;
+};
+
+struct ilo_state_vs_info {
+ struct ilo_state_shader_kernel_info kernel;
+ struct ilo_state_shader_resource_info resource;
+ struct ilo_state_shader_urb_info urb;
+
+ bool dispatch_enable;
+ bool stats_enable;
+};
+
+struct ilo_state_hs_info {
+ struct ilo_state_shader_kernel_info kernel;
+ struct ilo_state_shader_resource_info resource;
+ struct ilo_state_shader_urb_info urb;
+
+ bool dispatch_enable;
+ bool stats_enable;
+};
+
+struct ilo_state_ds_info {
+ struct ilo_state_shader_kernel_info kernel;
+ struct ilo_state_shader_resource_info resource;
+ struct ilo_state_shader_urb_info urb;
+
+ bool dispatch_enable;
+ bool stats_enable;
+};
+
+/**
+ * Stream output. Must be consistent with ilo_state_sol_info.
+ */
+struct ilo_state_gs_sol_info {
+ bool sol_enable;
+ bool stats_enable;
+ bool render_disable;
+
+ uint16_t svbi_post_inc;
+
+ enum gen_reorder_mode tristrip_reorder;
+};
+
+struct ilo_state_gs_info {
+ struct ilo_state_shader_kernel_info kernel;
+ struct ilo_state_shader_resource_info resource;
+ struct ilo_state_shader_urb_info urb;
+
+ struct ilo_state_gs_sol_info sol;
+
+ bool dispatch_enable;
+ bool stats_enable;
+};
+
+struct ilo_state_ps_io_info {
+ /* inputs */
+ enum gen_position_offset posoffset;
+ uint8_t attr_count;
+ bool use_z;
+ bool use_w;
+ bool use_coverage_mask;
+
+ /* outputs */
+ enum gen_pscdepth_mode pscdepth;
+ bool has_rt_write;
+ bool write_pixel_mask;
+ bool write_omask;
+};
+
+struct ilo_state_ps_params_info {
+ /* compatibility with raster states */
+ uint32_t sample_mask;
+ bool earlyz_control_psexec;
+
+ /* compatibility with cc states */
+ bool alpha_may_kill;
+ bool dual_source_blending;
+ bool has_writeable_rt;
+};
+
+struct ilo_state_ps_info {
+ struct ilo_state_shader_kernel_info kernel_8;
+ struct ilo_state_shader_kernel_info kernel_16;
+ struct ilo_state_shader_kernel_info kernel_32;
+ struct ilo_state_shader_resource_info resource;
+
+ struct ilo_state_ps_io_info io;
+ struct ilo_state_ps_params_info params;
+
+ /* bitmask of GEN6_PS_DISPATCH_x */
+ uint8_t valid_kernels;
+ bool per_sample_dispatch;
+ bool sample_count_one;
+ bool cv_per_sample_interp;
+ bool cv_has_earlyz_op;
+
+ bool rt_clear_enable;
+ bool rt_resolve_enable;
+
+ bool cv_has_depth_buffer;
+};
+
+struct ilo_state_vs {
+ uint32_t vs[5];
+};
+
+struct ilo_state_hs {
+ uint32_t hs[4];
+};
+
+struct ilo_state_ds {
+ uint32_t te[3];
+ uint32_t ds[5];
+};
+
+struct ilo_state_gs {
+ uint32_t gs[5];
+};
+
+struct ilo_state_ps {
+ uint32_t ps[8];
+
+ struct ilo_state_ps_dispatch_conds {
+ bool ps_valid;
+
+ bool has_rt_write;
+ bool write_odepth;
+ bool write_ostencil;
+ bool has_uav_write;
+ bool ps_may_kill;
+ } conds;
+};
+
+bool
+ilo_state_vs_init(struct ilo_state_vs *vs,
+ const struct ilo_dev *dev,
+ const struct ilo_state_vs_info *info);
+
+bool
+ilo_state_vs_init_disabled(struct ilo_state_vs *vs,
+ const struct ilo_dev *dev);
+
+bool
+ilo_state_hs_init(struct ilo_state_hs *hs,
+ const struct ilo_dev *dev,
+ const struct ilo_state_hs_info *info);
+
+bool
+ilo_state_hs_init_disabled(struct ilo_state_hs *hs,
+ const struct ilo_dev *dev);
+
+
+bool
+ilo_state_ds_init(struct ilo_state_ds *ds,
+ const struct ilo_dev *dev,
+ const struct ilo_state_ds_info *info);
+
+bool
+ilo_state_ds_init_disabled(struct ilo_state_ds *ds,
+ const struct ilo_dev *dev);
+
+bool
+ilo_state_gs_init(struct ilo_state_gs *gs,
+ const struct ilo_dev *dev,
+ const struct ilo_state_gs_info *info);
+
+bool
+ilo_state_gs_init_disabled(struct ilo_state_gs *gs,
+ const struct ilo_dev *dev);
+
+bool
+ilo_state_ps_init(struct ilo_state_ps *ps,
+ const struct ilo_dev *dev,
+ const struct ilo_state_ps_info *info);
+
+bool
+ilo_state_ps_init_disabled(struct ilo_state_ps *ps,
+ const struct ilo_dev *dev);
+
+bool
+ilo_state_ps_set_params(struct ilo_state_ps *ps,
+ const struct ilo_dev *dev,
+ const struct ilo_state_ps_params_info *params);
+
+#endif /* ILO_STATE_SHADER_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_state_shader_ps.c b/src/gallium/drivers/ilo/core/ilo_state_shader_ps.c
new file mode 100644
index 00000000000..f4d801e9b56
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_shader_ps.c
@@ -0,0 +1,771 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2012-2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Chia-I Wu <[email protected]>
+ */
+
+#include "ilo_debug.h"
+#include "ilo_state_shader.h"
+
+struct pixel_ff {
+ uint8_t dispatch_modes;
+
+ uint32_t kernel_offsets[3];
+ uint8_t grf_starts[3];
+ bool pcb_enable;
+ uint8_t scratch_space;
+
+ uint8_t sampler_count;
+ uint8_t surface_count;
+ bool has_uav;
+
+ uint16_t thread_count;
+
+ struct ilo_state_ps_dispatch_conds conds;
+
+ bool kill_pixel;
+ bool dispatch_enable;
+ bool dual_source_blending;
+ uint32_t sample_mask;
+};
+
+static bool
+ps_kernel_validate_gen6(const struct ilo_dev *dev,
+ const struct ilo_state_shader_kernel_info *kernel)
+{
+ /* "Dispatch GRF Start Register for Constant/Setup Data" is U7 */
+ const uint8_t max_grf_start = 128;
+ /*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 271:
+ *
+ * "(Per-Thread Scratch Space)
+ * Range [0,11] indicating [1k bytes, 2M bytes] in powers of two"
+ */
+ const uint32_t max_scratch_size = 2 * 1024 * 1024;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ /* "Kernel Start Pointer" is 64-byte aligned */
+ assert(kernel->offset % 64 == 0);
+
+ assert(kernel->grf_start < max_grf_start);
+ assert(kernel->scratch_size <= max_scratch_size);
+
+ return true;
+}
+
+static bool
+ps_validate_gen6(const struct ilo_dev *dev,
+ const struct ilo_state_ps_info *info)
+{
+ const struct ilo_state_shader_kernel_info *kernel_8 = &info->kernel_8;
+ const struct ilo_state_shader_kernel_info *kernel_16 = &info->kernel_16;
+ const struct ilo_state_shader_kernel_info *kernel_32 = &info->kernel_32;
+ const struct ilo_state_ps_io_info *io = &info->io;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ if (!ps_kernel_validate_gen6(dev, kernel_8) ||
+ !ps_kernel_validate_gen6(dev, kernel_16) ||
+ !ps_kernel_validate_gen6(dev, kernel_32))
+ return false;
+
+ /* unsupported on Gen6 */
+ if (ilo_dev_gen(dev) == ILO_GEN(6))
+ assert(!io->use_coverage_mask);
+
+ /*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 275:
+ *
+ * "If a NULL Depth Buffer is selected, the Pixel Shader Computed Depth
+ * field must be set to disabled."
+ */
+ if (ilo_dev_gen(dev) == ILO_GEN(6) && io->pscdepth != GEN7_PSCDEPTH_OFF)
+ assert(info->cv_has_depth_buffer);
+
+ if (!info->per_sample_dispatch) {
+ /*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 281:
+ *
+ * "MSDISPMODE_PERSAMPLE is required in order to select
+ * POSOFFSET_SAMPLE."
+ */
+ assert(io->posoffset != GEN6_POSOFFSET_SAMPLE);
+
+ /*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 282:
+ *
+ * "MSDISPMODE_PERSAMPLE is required in order to select
+ * INTERP_SAMPLE."
+ *
+ * From the Sandy Bridge PRM, volume 2 part 1, page 283:
+ *
+ * "MSDISPMODE_PERSAMPLE is required in order to select Perspective
+ * Sample or Non-perspective Sample barycentric coordinates."
+ */
+ assert(!info->cv_per_sample_interp);
+ }
+
+ /*
+ *
+ * From the Sandy Bridge PRM, volume 2 part 1, page 314:
+ *
+ * "Pixel Shader Dispatch, Alpha... must all be disabled."
+ *
+ * Simply disallow any valid kernel when there is early-z op. Also, when
+ * there is no valid kernel, io should be zeroed.
+ */
+ if (info->valid_kernels)
+ assert(!info->cv_has_earlyz_op);
+ else
+ assert(ilo_is_zeroed(io, sizeof(*io)));
+
+ return true;
+}
+
+static uint8_t
+ps_get_gen6_dispatch_modes(const struct ilo_dev *dev,
+ const struct ilo_state_ps_info *info)
+{
+ const struct ilo_state_ps_io_info *io = &info->io;
+ uint8_t dispatch_modes = info->valid_kernels;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ if (!dispatch_modes)
+ return 0;
+
+ /*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 334:
+ *
+ * "Not valid on [DevSNB] if 4x PERPIXEL mode with pixel shader
+ * computed depth."
+ *
+ * "Valid on all products, except when in non-1x PERSAMPLE mode
+ * (applies to [DevSNB+] only)"
+ *
+ * From the Sandy Bridge PRM, volume 4 part 1, page 239:
+ *
+ * "[DevSNB]: When Pixel Shader outputs oDepth and PS invocation mode
+ * is PERPIXEL, Message Type for Render Target Write must be SIMD8.
+ *
+ * Errata: [DevSNB+]: When Pixel Shader outputs oMask, this message
+ * type is not supported: SIMD8 (including SIMD8_DUALSRC_xx)."
+ *
+ * It is really hard to follow what combinations are valid on what
+ * platforms. Judging from the restrictions on RT write messages on Gen6,
+ * oDepth and oMask related issues should be Gen6-specific. PERSAMPLE
+ * issue should be universal, and disallows multiple dispatch modes.
+ */
+ if (ilo_dev_gen(dev) == ILO_GEN(6)) {
+ if (io->pscdepth != GEN7_PSCDEPTH_OFF && !info->per_sample_dispatch)
+ dispatch_modes &= GEN6_PS_DISPATCH_8;
+ if (io->write_omask)
+ dispatch_modes &= ~GEN6_PS_DISPATCH_8;
+ }
+ if (info->per_sample_dispatch && !info->sample_count_one) {
+ /* prefer 32 over 16 over 8 */
+ if (dispatch_modes & GEN6_PS_DISPATCH_32)
+ dispatch_modes &= GEN6_PS_DISPATCH_32;
+ else if (dispatch_modes & GEN6_PS_DISPATCH_16)
+ dispatch_modes &= GEN6_PS_DISPATCH_16;
+ else
+ dispatch_modes &= GEN6_PS_DISPATCH_8;
+ }
+
+ /*
+ * From the Broadwell PRM, volume 2b, page 149:
+ *
+ * "When Render Target Fast Clear Enable is ENABLED or Render Target
+ * Resolve Type = RESOLVE_PARTIAL or RESOLVE_FULL, this bit (8 Pixel
+ * Dispatch or Dual-8 Pixel Dispatch Enable) must be DISABLED."
+ */
+ if (info->rt_clear_enable || info->rt_resolve_enable)
+ dispatch_modes &= ~GEN6_PS_DISPATCH_8;
+
+ assert(dispatch_modes);
+
+ return dispatch_modes;
+}
+
+static uint16_t
+ps_get_gen6_thread_count(const struct ilo_dev *dev,
+ const struct ilo_state_ps_info *info)
+{
+ uint16_t thread_count;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ /* Maximum Number of Threads of 3DSTATE_PS */
+ switch (ilo_dev_gen(dev)) {
+ case ILO_GEN(8):
+ /* scaled automatically */
+ thread_count = 64 - 1;
+ break;
+ case ILO_GEN(7.5):
+ thread_count = (dev->gt == 3) ? 408 :
+ (dev->gt == 2) ? 204 : 102;
+ break;
+ case ILO_GEN(7):
+ thread_count = (dev->gt == 2) ? 172 : 48;
+ break;
+ case ILO_GEN(6):
+ default:
+ /* from the classic driver instead of the PRM */
+ thread_count = (dev->gt == 2) ? 80 : 40;
+ break;
+ }
+
+ return thread_count - 1;
+}
+
+static bool
+ps_params_get_gen6_kill_pixel(const struct ilo_dev *dev,
+ const struct ilo_state_ps_params_info *params,
+ const struct ilo_state_ps_dispatch_conds *conds)
+{
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ /*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 275:
+ *
+ * "This bit (Pixel Shader Kill Pixel), if ENABLED, indicates that the
+ * PS kernel or color calculator has the ability to kill (discard)
+ * pixels or samples, other than due to depth or stencil testing.
+ * This bit is required to be ENABLED in the following situations:
+ *
+ * The API pixel shader program contains "killpix" or "discard"
+ * instructions, or other code in the pixel shader kernel that can
+ * cause the final pixel mask to differ from the pixel mask received
+ * on dispatch.
+ *
+ * A sampler with chroma key enabled with kill pixel mode is used by
+ * the pixel shader.
+ *
+ * Any render target has Alpha Test Enable or AlphaToCoverage Enable
+ * enabled.
+ *
+ * The pixel shader kernel generates and outputs oMask.
+ *
+ * Note: As ClipDistance clipping is fully supported in hardware and
+ * therefore not via PS instructions, there should be no need to
+ * ENABLE this bit due to ClipDistance clipping."
+ */
+ return (conds->ps_may_kill || params->alpha_may_kill);
+}
+
+static bool
+ps_params_get_gen6_dispatch_enable(const struct ilo_dev *dev,
+ const struct ilo_state_ps_params_info *params,
+ const struct ilo_state_ps_dispatch_conds *conds)
+{
+ /*
+ * We want to skip dispatching when EarlyZ suffices. The conditions that
+ * require dispatching are
+ *
+ * - PS writes RTs and RTs are writeable
+ * - PS changes depth value and depth test/write is enabled
+ * - PS changes stencil value and stencil test is enabled
+ * - PS writes UAVs
+ * - PS or CC kills pixels
+ * - EDSC is PSEXEC, and depth test/write or stencil test is enabled
+ */
+ bool dispatch_required =
+ ((conds->has_rt_write && params->has_writeable_rt) ||
+ conds->write_odepth ||
+ conds->write_ostencil ||
+ conds->has_uav_write ||
+ ps_params_get_gen6_kill_pixel(dev, params, conds) ||
+ params->earlyz_control_psexec);
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ /*
+ * From the Ivy Bridge PRM, volume 2 part 1, page 280:
+ *
+ * "If EDSC_PSEXEC mode is selected, Thread Dispatch Enable must be
+ * set."
+ */
+ if (ilo_dev_gen(dev) < ILO_GEN(8) && params->earlyz_control_psexec)
+ dispatch_required = true;
+
+ /* assert it is valid to dispatch */
+ if (dispatch_required)
+ assert(conds->ps_valid);
+
+ return dispatch_required;
+}
+
+static bool
+ps_get_gen6_ff_kernels(const struct ilo_dev *dev,
+ const struct ilo_state_ps_info *info,
+ struct pixel_ff *ff)
+{
+ const struct ilo_state_shader_kernel_info *kernel_8 = &info->kernel_8;
+ const struct ilo_state_shader_kernel_info *kernel_16 = &info->kernel_16;
+ const struct ilo_state_shader_kernel_info *kernel_32 = &info->kernel_32;
+ uint32_t scratch_size;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ ff->dispatch_modes = ps_get_gen6_dispatch_modes(dev, info);
+
+ /* initialize kernel offsets and GRF starts */
+ if (util_is_power_of_two(ff->dispatch_modes)) {
+ if (ff->dispatch_modes & GEN6_PS_DISPATCH_8) {
+ ff->kernel_offsets[0] = kernel_8->offset;
+ ff->grf_starts[0] = kernel_8->grf_start;
+ } else if (ff->dispatch_modes & GEN6_PS_DISPATCH_16) {
+ ff->kernel_offsets[0] = kernel_16->offset;
+ ff->grf_starts[0] = kernel_16->grf_start;
+ } else if (ff->dispatch_modes & GEN6_PS_DISPATCH_32) {
+ ff->kernel_offsets[0] = kernel_32->offset;
+ ff->grf_starts[0] = kernel_32->grf_start;
+ }
+ } else {
+ ff->kernel_offsets[0] = kernel_8->offset;
+ ff->kernel_offsets[1] = kernel_32->offset;
+ ff->kernel_offsets[2] = kernel_16->offset;
+
+ ff->grf_starts[0] = kernel_8->grf_start;
+ ff->grf_starts[1] = kernel_32->grf_start;
+ ff->grf_starts[2] = kernel_16->grf_start;
+ }
+
+ /* we do not want to save it */
+ assert(ff->kernel_offsets[0] == 0);
+
+ ff->pcb_enable = (((ff->dispatch_modes & GEN6_PS_DISPATCH_8) &&
+ kernel_8->pcb_attr_count) ||
+ ((ff->dispatch_modes & GEN6_PS_DISPATCH_16) &&
+ kernel_16->pcb_attr_count) ||
+ ((ff->dispatch_modes & GEN6_PS_DISPATCH_32) &&
+ kernel_32->pcb_attr_count));
+
+ scratch_size = 0;
+ if ((ff->dispatch_modes & GEN6_PS_DISPATCH_8) &&
+ scratch_size < kernel_8->scratch_size)
+ scratch_size = kernel_8->scratch_size;
+ if ((ff->dispatch_modes & GEN6_PS_DISPATCH_16) &&
+ scratch_size < kernel_16->scratch_size)
+ scratch_size = kernel_16->scratch_size;
+ if ((ff->dispatch_modes & GEN6_PS_DISPATCH_32) &&
+ scratch_size < kernel_32->scratch_size)
+ scratch_size = kernel_32->scratch_size;
+
+ /* next power of two, starting from 1KB */
+ ff->scratch_space = (scratch_size > 1024) ?
+ (util_last_bit(scratch_size - 1) - 10): 0;
+
+ /* GPU hangs on Haswell if none of the dispatch mode bits is set */
+ if (ilo_dev_gen(dev) == ILO_GEN(7.5) && !ff->dispatch_modes)
+ ff->dispatch_modes |= GEN6_PS_DISPATCH_8;
+
+ return true;
+}
+
+static bool
+ps_get_gen6_ff(const struct ilo_dev *dev,
+ const struct ilo_state_ps_info *info,
+ struct pixel_ff *ff)
+{
+ const struct ilo_state_shader_resource_info *resource = &info->resource;
+ const struct ilo_state_ps_io_info *io = &info->io;
+ const struct ilo_state_ps_params_info *params = &info->params;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ memset(ff, 0, sizeof(*ff));
+
+ if (!ps_validate_gen6(dev, info) || !ps_get_gen6_ff_kernels(dev, info, ff))
+ return false;
+
+ ff->sampler_count = (resource->sampler_count <= 12) ?
+ (resource->sampler_count + 3) / 4 : 4;
+ ff->surface_count = resource->surface_count;
+ ff->has_uav = resource->has_uav;
+
+ ff->thread_count = ps_get_gen6_thread_count(dev, info);
+
+ ff->conds.ps_valid = (info->valid_kernels != 0x0);
+ ff->conds.has_rt_write = io->has_rt_write;
+ ff->conds.write_odepth = (io->pscdepth != GEN7_PSCDEPTH_OFF);
+ ff->conds.write_ostencil = false;
+ ff->conds.has_uav_write = resource->has_uav;
+ ff->conds.ps_may_kill = (io->write_pixel_mask || io->write_omask);
+
+ ff->kill_pixel = ps_params_get_gen6_kill_pixel(dev, params, &ff->conds);
+ ff->dispatch_enable =
+ ps_params_get_gen6_dispatch_enable(dev, params, &ff->conds);
+ ff->dual_source_blending = params->dual_source_blending;
+ ff->sample_mask = params->sample_mask;
+
+ return true;
+}
+
+static bool
+ps_set_gen6_3dstate_wm(struct ilo_state_ps *ps,
+ const struct ilo_dev *dev,
+ const struct ilo_state_ps_info *info,
+ const struct pixel_ff *ff)
+{
+ const struct ilo_state_ps_io_info *io = &info->io;
+ uint32_t dw2, dw3, dw4, dw5, dw6;
+
+ ILO_DEV_ASSERT(dev, 6, 6);
+
+ dw2 = ff->sampler_count << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT |
+ ff->surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT;
+
+ if (false)
+ dw2 |= GEN6_THREADDISP_FP_MODE_ALT;
+
+ dw3 = ff->scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
+
+ dw4 = ff->grf_starts[0] << GEN6_WM_DW4_URB_GRF_START0__SHIFT |
+ ff->grf_starts[1] << GEN6_WM_DW4_URB_GRF_START1__SHIFT |
+ ff->grf_starts[2] << GEN6_WM_DW4_URB_GRF_START2__SHIFT;
+
+ dw5 = ff->thread_count << GEN6_WM_DW5_MAX_THREADS__SHIFT |
+ ff->dispatch_modes << GEN6_WM_DW5_PS_DISPATCH_MODE__SHIFT;
+
+ if (ff->kill_pixel)
+ dw5 |= GEN6_WM_DW5_PS_KILL_PIXEL;
+
+ if (io->pscdepth != GEN7_PSCDEPTH_OFF)
+ dw5 |= GEN6_WM_DW5_PS_COMPUTE_DEPTH;
+ if (io->use_z)
+ dw5 |= GEN6_WM_DW5_PS_USE_DEPTH;
+
+ if (ff->dispatch_enable)
+ dw5 |= GEN6_WM_DW5_PS_DISPATCH_ENABLE;
+
+ if (io->write_omask)
+ dw5 |= GEN6_WM_DW5_PS_COMPUTE_OMASK;
+ if (io->use_w)
+ dw5 |= GEN6_WM_DW5_PS_USE_W;
+
+ if (ff->dual_source_blending)
+ dw5 |= GEN6_WM_DW5_PS_DUAL_SOURCE_BLEND;
+
+ dw6 = io->attr_count << GEN6_WM_DW6_SF_ATTR_COUNT__SHIFT |
+ io->posoffset << GEN6_WM_DW6_PS_POSOFFSET__SHIFT;
+
+ dw6 |= (info->per_sample_dispatch) ?
+ GEN6_WM_DW6_MSDISPMODE_PERSAMPLE : GEN6_WM_DW6_MSDISPMODE_PERPIXEL;
+
+ STATIC_ASSERT(ARRAY_SIZE(ps->ps) >= 7);
+ ps->ps[0] = dw2;
+ ps->ps[1] = dw3;
+ ps->ps[2] = dw4;
+ ps->ps[3] = dw5;
+ ps->ps[4] = dw6;
+ ps->ps[5] = ff->kernel_offsets[1];
+ ps->ps[6] = ff->kernel_offsets[2];
+
+ return true;
+}
+
+static bool
+ps_set_gen7_3dstate_wm(struct ilo_state_ps *ps,
+ const struct ilo_dev *dev,
+ const struct ilo_state_ps_info *info,
+ const struct pixel_ff *ff)
+{
+ const struct ilo_state_ps_io_info *io = &info->io;
+ uint32_t dw1, dw2;
+
+ ILO_DEV_ASSERT(dev, 7, 7.5);
+
+ dw1 = io->pscdepth << GEN7_WM_DW1_PSCDEPTH__SHIFT;
+
+ if (ff->dispatch_enable)
+ dw1 |= GEN7_WM_DW1_PS_DISPATCH_ENABLE;
+ if (ff->kill_pixel)
+ dw1 |= GEN7_WM_DW1_PS_KILL_PIXEL;
+
+ if (io->use_z)
+ dw1 |= GEN7_WM_DW1_PS_USE_DEPTH;
+ if (io->use_w)
+ dw1 |= GEN7_WM_DW1_PS_USE_W;
+ if (io->use_coverage_mask)
+ dw1 |= GEN7_WM_DW1_PS_USE_COVERAGE_MASK;
+
+ dw2 = (info->per_sample_dispatch) ?
+ GEN7_WM_DW2_MSDISPMODE_PERSAMPLE : GEN7_WM_DW2_MSDISPMODE_PERPIXEL;
+
+ STATIC_ASSERT(ARRAY_SIZE(ps->ps) >= 2);
+ ps->ps[0] = dw1;
+ ps->ps[1] = dw2;
+
+ return true;
+}
+
+static bool
+ps_set_gen7_3DSTATE_PS(struct ilo_state_ps *ps,
+ const struct ilo_dev *dev,
+ const struct ilo_state_ps_info *info,
+ const struct pixel_ff *ff)
+{
+ const struct ilo_state_ps_io_info *io = &info->io;
+ uint32_t dw2, dw3, dw4, dw5;
+
+ ILO_DEV_ASSERT(dev, 7, 7.5);
+
+ dw2 = ff->sampler_count << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT |
+ ff->surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT;
+
+ if (false)
+ dw2 |= GEN6_THREADDISP_FP_MODE_ALT;
+
+ dw3 = ff->scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
+
+ dw4 = io->posoffset << GEN7_PS_DW4_POSOFFSET__SHIFT |
+ ff->dispatch_modes << GEN7_PS_DW4_DISPATCH_MODE__SHIFT;
+
+ if (ilo_dev_gen(dev) == ILO_GEN(7.5)) {
+ dw4 |= ff->thread_count << GEN75_PS_DW4_MAX_THREADS__SHIFT |
+ (ff->sample_mask & 0xff) << GEN75_PS_DW4_SAMPLE_MASK__SHIFT;
+ } else {
+ dw4 |= ff->thread_count << GEN7_PS_DW4_MAX_THREADS__SHIFT;
+ }
+
+ if (ff->pcb_enable)
+ dw4 |= GEN7_PS_DW4_PUSH_CONSTANT_ENABLE;
+ if (io->attr_count)
+ dw4 |= GEN7_PS_DW4_ATTR_ENABLE;
+ if (io->write_omask)
+ dw4 |= GEN7_PS_DW4_COMPUTE_OMASK;
+ if (info->rt_clear_enable)
+ dw4 |= GEN7_PS_DW4_RT_FAST_CLEAR;
+ if (ff->dual_source_blending)
+ dw4 |= GEN7_PS_DW4_DUAL_SOURCE_BLEND;
+ if (info->rt_resolve_enable)
+ dw4 |= GEN7_PS_DW4_RT_RESOLVE;
+ if (ilo_dev_gen(dev) >= ILO_GEN(7.5) && ff->has_uav)
+ dw4 |= GEN75_PS_DW4_ACCESS_UAV;
+
+ dw5 = ff->grf_starts[0] << GEN7_PS_DW5_URB_GRF_START0__SHIFT |
+ ff->grf_starts[1] << GEN7_PS_DW5_URB_GRF_START1__SHIFT |
+ ff->grf_starts[2] << GEN7_PS_DW5_URB_GRF_START2__SHIFT;
+
+ STATIC_ASSERT(ARRAY_SIZE(ps->ps) >= 8);
+ ps->ps[2] = dw2;
+ ps->ps[3] = dw3;
+ ps->ps[4] = dw4;
+ ps->ps[5] = dw5;
+ ps->ps[6] = ff->kernel_offsets[1];
+ ps->ps[7] = ff->kernel_offsets[2];
+
+ return true;
+}
+
+static bool
+ps_set_gen8_3DSTATE_PS(struct ilo_state_ps *ps,
+ const struct ilo_dev *dev,
+ const struct ilo_state_ps_info *info,
+ const struct pixel_ff *ff)
+{
+ const struct ilo_state_ps_io_info *io = &info->io;
+ uint32_t dw3, dw4, dw6, dw7;
+
+ ILO_DEV_ASSERT(dev, 8, 8);
+
+ dw3 = ff->sampler_count << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT |
+ ff->surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT;
+
+ if (false)
+ dw3 |= GEN6_THREADDISP_FP_MODE_ALT;
+
+ dw4 = ff->scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
+
+ dw6 = ff->thread_count << GEN8_PS_DW6_MAX_THREADS__SHIFT |
+ io->posoffset << GEN8_PS_DW6_POSOFFSET__SHIFT |
+ ff->dispatch_modes << GEN8_PS_DW6_DISPATCH_MODE__SHIFT;
+
+ if (ff->pcb_enable)
+ dw6 |= GEN8_PS_DW6_PUSH_CONSTANT_ENABLE;
+
+ if (info->rt_clear_enable)
+ dw6 |= GEN8_PS_DW6_RT_FAST_CLEAR;
+ if (info->rt_resolve_enable)
+ dw6 |= GEN8_PS_DW6_RT_RESOLVE;
+
+ dw7 = ff->grf_starts[0] << GEN8_PS_DW7_URB_GRF_START0__SHIFT |
+ ff->grf_starts[1] << GEN8_PS_DW7_URB_GRF_START1__SHIFT |
+ ff->grf_starts[2] << GEN8_PS_DW7_URB_GRF_START2__SHIFT;
+
+ STATIC_ASSERT(ARRAY_SIZE(ps->ps) >= 6);
+ ps->ps[0] = dw3;
+ ps->ps[1] = dw4;
+ ps->ps[2] = dw6;
+ ps->ps[3] = dw7;
+ ps->ps[4] = ff->kernel_offsets[1];
+ ps->ps[5] = ff->kernel_offsets[2];
+
+ return true;
+}
+
+static bool
+ps_set_gen8_3DSTATE_PS_EXTRA(struct ilo_state_ps *ps,
+ const struct ilo_dev *dev,
+ const struct ilo_state_ps_info *info,
+ const struct pixel_ff *ff)
+{
+ const struct ilo_state_ps_io_info *io = &info->io;
+ uint32_t dw1;
+
+ ILO_DEV_ASSERT(dev, 8, 8);
+
+ dw1 = io->pscdepth << GEN8_PSX_DW1_PSCDEPTH__SHIFT;
+
+ if (info->valid_kernels)
+ dw1 |= GEN8_PSX_DW1_VALID;
+ if (!io->has_rt_write)
+ dw1 |= GEN8_PSX_DW1_UAV_ONLY;
+ if (io->write_omask)
+ dw1 |= GEN8_PSX_DW1_COMPUTE_OMASK;
+ if (io->write_pixel_mask)
+ dw1 |= GEN8_PSX_DW1_KILL_PIXEL;
+
+ if (io->use_z)
+ dw1 |= GEN8_PSX_DW1_USE_DEPTH;
+ if (io->use_w)
+ dw1 |= GEN8_PSX_DW1_USE_W;
+ if (io->attr_count)
+ dw1 |= GEN8_PSX_DW1_ATTR_ENABLE;
+
+ if (info->per_sample_dispatch)
+ dw1 |= GEN8_PSX_DW1_PER_SAMPLE;
+ if (ff->has_uav)
+ dw1 |= GEN8_PSX_DW1_ACCESS_UAV;
+ if (io->use_coverage_mask)
+ dw1 |= GEN8_PSX_DW1_USE_COVERAGE_MASK;
+
+ /*
+ * From the Broadwell PRM, volume 2b, page 151:
+ *
+ * "When this bit (Pixel Shader Valid) clear the rest of this command
+ * should also be clear.
+ */
+ if (!info->valid_kernels)
+ dw1 = 0;
+
+ STATIC_ASSERT(ARRAY_SIZE(ps->ps) >= 5);
+ ps->ps[4] = dw1;
+
+ return true;
+}
+
+bool
+ilo_state_ps_init(struct ilo_state_ps *ps,
+ const struct ilo_dev *dev,
+ const struct ilo_state_ps_info *info)
+{
+ struct pixel_ff ff;
+ bool ret = true;
+
+ assert(ilo_is_zeroed(ps, sizeof(*ps)));
+
+ ret &= ps_get_gen6_ff(dev, info, &ff);
+
+ if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
+ ret &= ps_set_gen8_3DSTATE_PS(ps, dev, info, &ff);
+ ret &= ps_set_gen8_3DSTATE_PS_EXTRA(ps, dev, info, &ff);
+ } else if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+ ret &= ps_set_gen7_3dstate_wm(ps, dev, info, &ff);
+ ret &= ps_set_gen7_3DSTATE_PS(ps, dev, info, &ff);
+ } else {
+ ret &= ps_set_gen6_3dstate_wm(ps, dev, info, &ff);
+ }
+
+ /* save conditions */
+ ps->conds = ff.conds;
+
+ assert(ret);
+
+ return ret;
+}
+
+bool
+ilo_state_ps_init_disabled(struct ilo_state_ps *ps,
+ const struct ilo_dev *dev)
+{
+ struct ilo_state_ps_info info;
+
+ memset(&info, 0, sizeof(info));
+
+ return ilo_state_ps_init(ps, dev, &info);
+}
+
+bool
+ilo_state_ps_set_params(struct ilo_state_ps *ps,
+ const struct ilo_dev *dev,
+ const struct ilo_state_ps_params_info *params)
+{
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ /* modify sample mask */
+ if (ilo_dev_gen(dev) == ILO_GEN(7.5)) {
+ ps->ps[4] = (ps->ps[4] & ~GEN75_PS_DW4_SAMPLE_MASK__MASK) |
+ (params->sample_mask & 0xff) << GEN75_PS_DW4_SAMPLE_MASK__SHIFT;
+ }
+
+ /* modify dispatch enable, pixel kill, and dual source blending */
+ if (ilo_dev_gen(dev) < ILO_GEN(8)) {
+ if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+ if (ps_params_get_gen6_dispatch_enable(dev, params, &ps->conds))
+ ps->ps[0] |= GEN7_WM_DW1_PS_DISPATCH_ENABLE;
+ else
+ ps->ps[0] &= ~GEN7_WM_DW1_PS_DISPATCH_ENABLE;
+
+ if (ps_params_get_gen6_kill_pixel(dev, params, &ps->conds))
+ ps->ps[0] |= GEN7_WM_DW1_PS_KILL_PIXEL;
+ else
+ ps->ps[0] &= ~GEN7_WM_DW1_PS_KILL_PIXEL;
+
+ if (params->dual_source_blending)
+ ps->ps[4] |= GEN7_PS_DW4_DUAL_SOURCE_BLEND;
+ else
+ ps->ps[4] &= ~GEN7_PS_DW4_DUAL_SOURCE_BLEND;
+ } else {
+ if (ps_params_get_gen6_dispatch_enable(dev, params, &ps->conds))
+ ps->ps[3] |= GEN6_WM_DW5_PS_DISPATCH_ENABLE;
+ else
+ ps->ps[3] &= ~GEN6_WM_DW5_PS_DISPATCH_ENABLE;
+
+ if (ps_params_get_gen6_kill_pixel(dev, params, &ps->conds))
+ ps->ps[3] |= GEN6_WM_DW5_PS_KILL_PIXEL;
+ else
+ ps->ps[3] &= ~GEN6_WM_DW5_PS_KILL_PIXEL;
+
+ if (params->dual_source_blending)
+ ps->ps[3] |= GEN6_WM_DW5_PS_DUAL_SOURCE_BLEND;
+ else
+ ps->ps[3] &= ~GEN6_WM_DW5_PS_DUAL_SOURCE_BLEND;
+ }
+ }
+
+ return true;
+}
diff --git a/src/gallium/drivers/ilo/core/ilo_state_sol.c b/src/gallium/drivers/ilo/core/ilo_state_sol.c
new file mode 100644
index 00000000000..38c0b719ab3
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_sol.c
@@ -0,0 +1,464 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2012-2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Chia-I Wu <[email protected]>
+ */
+
+#include "ilo_debug.h"
+#include "ilo_buffer.h"
+#include "ilo_state_sol.h"
+
+/**
+ * Validate one SO stream against Gen7+ restrictions: the VUE read window
+ * and every output declaration.  Violations trip asserts; returns true
+ * otherwise.
+ */
+static bool
+sol_stream_validate_gen7(const struct ilo_dev *dev,
+                         const struct ilo_state_sol_stream_info *stream)
+{
+   uint8_t i;
+
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   /* the read window must lie within the attributes the VUE provides */
+   assert(stream->vue_read_base + stream->vue_read_count <=
+         stream->cv_vue_attr_count);
+
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 1, page 200:
+    *
+    *     "(Stream 0 Vertex Read Offset)
+    *      Format: U1 count of 256-bit units
+    *
+    *      Specifies amount of data to skip over before reading back Stream 0
+    *      vertex data.  Must be zero if the GS is enabled and the Output
+    *      Vertex Size field in 3DSTATE_GS is programmed to 0 (i.e., one 16B
+    *      unit)."
+    *
+    *     "(Stream 0 Vertex Read Length)
+    *      Format: U5-1 count of 256-bit units
+    *
+    *      Specifies amount of vertex data to read back for Stream 0 vertices,
+    *      starting at the Stream 0 Vertex Read Offset location.  Maximum
+    *      readback is 17 256-bit units (34 128-bit vertex attributes).  Read
+    *      data past the end of the valid vertex data has undefined contents,
+    *      and therefore shouldn't be used to source stream out data.  Must be
+    *      zero (i.e., read length = 256b) if the GS is enabled and the Output
+    *      Vertex Size field in 3DSTATE_GS is programmed to 0 (i.e., one 16B
+    *      unit)."
+    */
+   /* base is in 128-bit attrs; skip either nothing or one 256-bit unit */
+   assert(stream->vue_read_base == 0 || stream->vue_read_base == 2);
+   assert(stream->vue_read_count <= 34);
+
+   assert(stream->decl_count <= ILO_STATE_SOL_MAX_DECL_COUNT);
+
+   for (i = 0; i < stream->decl_count; i++) {
+      const struct ilo_state_sol_decl_info *decl = &stream->decls[i];
+
+      /* holes do not source an attribute; others must index the read window */
+      assert(decl->is_hole || decl->attr < stream->vue_read_count);
+
+      /*
+       * From the Ivy Bridge PRM, volume 2 part 1, page 205:
+       *
+       *     "There is only enough internal storage for the 128-bit vertex
+       *      header and 32 128-bit vertex attributes."
+       */
+      assert(decl->attr < 33);
+
+      assert(decl->component_base < 4 &&
+             decl->component_base + decl->component_count <= 4);
+      assert(decl->buffer < ILO_STATE_SOL_MAX_BUFFER_COUNT);
+   }
+
+   return true;
+}
+
+/**
+ * Validate SO state creation info: every stream and every buffer stride.
+ */
+static bool
+sol_validate_gen7(const struct ilo_dev *dev,
+                  const struct ilo_state_sol_info *info)
+{
+   uint8_t i;
+
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 1, page 198:
+    *
+    *     "This bit (Render Stream Select) is used even if SO Function Enable
+    *      is DISABLED."
+    *
+    * From the Haswell PRM, volume 2b, page 796:
+    *
+    *     "SO Function Enable must also be ENABLED in order for this field
+    *      (Render Stream Select) to select a stream for rendering.  When SO
+    *      Function Enable is DISABLED and Rendering Disable is cleared (i.e.,
+    *      rendering is enabled), StreamID is ignored downstream of the SO
+    *      stage, allowing any stream to be rendered."
+    *
+    * We want Gen7 behavior, but we have to require users to follow Gen7.5
+    * behavior: info->sol_enable must be set for info->render_stream to work.
+    */
+
+   for (i = 0; i < ARRAY_SIZE(info->streams); i++) {
+      if (!sol_stream_validate_gen7(dev, &info->streams[i]))
+         return false;
+   }
+
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 1, page 208:
+    *
+    *     "(Surface Pitch)
+    *      [0,2048]  Must be 0 or a multiple of 4 Bytes."
+    */
+   for (i = 0; i < ARRAY_SIZE(info->buffer_strides); i++) {
+      assert(info->buffer_strides[i] <= 2048 &&
+             info->buffer_strides[i] % 4 == 0);
+   }
+
+   return true;
+}
+
+/**
+ * Pack 3DSTATE_STREAMOUT DW1 and DW2 into sol->streamout[] and record the
+ * per-buffer strides.
+ */
+static bool
+sol_set_gen7_3DSTATE_STREAMOUT(struct ilo_state_sol *sol,
+                               const struct ilo_dev *dev,
+                               const struct ilo_state_sol_info *info)
+{
+   struct {
+      uint8_t offset;
+      uint8_t len;
+   } vue_read[ILO_STATE_SOL_MAX_STREAM_COUNT];
+   uint8_t i;
+   uint32_t dw1, dw2;
+
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   if (!sol_validate_gen7(dev, info))
+      return false;
+
+   for (i = 0; i < ARRAY_SIZE(info->streams); i++) {
+      const struct ilo_state_sol_stream_info *stream = &info->streams[i];
+
+      /* convert from 128-bit attributes to 256-bit units */
+      vue_read[i].offset = stream->vue_read_base / 2;
+      /*
+       * In pairs minus 1.  URB entries are aligned to 512-bits.  There is no
+       * need to worry about reading past entries.
+       */
+      vue_read[i].len = (stream->vue_read_count + 1) / 2;
+      if (vue_read[i].len)
+         vue_read[i].len--;
+   }
+
+   dw1 = info->render_stream << GEN7_SO_DW1_RENDER_STREAM_SELECT__SHIFT |
+         info->tristrip_reorder << GEN7_SO_DW1_REORDER_MODE__SHIFT;
+
+   if (info->sol_enable)
+      dw1 |= GEN7_SO_DW1_SO_ENABLE;
+
+   if (info->render_disable)
+      dw1 |= GEN7_SO_DW1_RENDER_DISABLE;
+
+   if (info->stats_enable)
+      dw1 |= GEN7_SO_DW1_STATISTICS;
+
+   /* on Gen8 the buffers are enabled in 3DSTATE_SO_BUFFER instead */
+   if (ilo_dev_gen(dev) < ILO_GEN(8)) {
+      const uint8_t buffer_enables = ((bool) info->buffer_strides[3]) << 3 |
+                                     ((bool) info->buffer_strides[2]) << 2 |
+                                     ((bool) info->buffer_strides[1]) << 1 |
+                                     ((bool) info->buffer_strides[0]);
+      dw1 |= buffer_enables << GEN7_SO_DW1_BUFFER_ENABLES__SHIFT;
+   }
+
+   dw2 = vue_read[3].offset << GEN7_SO_DW2_STREAM3_READ_OFFSET__SHIFT |
+         vue_read[3].len << GEN7_SO_DW2_STREAM3_READ_LEN__SHIFT |
+         vue_read[2].offset << GEN7_SO_DW2_STREAM2_READ_OFFSET__SHIFT |
+         vue_read[2].len << GEN7_SO_DW2_STREAM2_READ_LEN__SHIFT |
+         vue_read[1].offset << GEN7_SO_DW2_STREAM1_READ_OFFSET__SHIFT |
+         vue_read[1].len << GEN7_SO_DW2_STREAM1_READ_LEN__SHIFT |
+         vue_read[0].offset << GEN7_SO_DW2_STREAM0_READ_OFFSET__SHIFT |
+         vue_read[0].len << GEN7_SO_DW2_STREAM0_READ_LEN__SHIFT;
+
+   STATIC_ASSERT(ARRAY_SIZE(sol->streamout) >= 2);
+   sol->streamout[0] = dw1;
+   sol->streamout[1] = dw2;
+
+   memcpy(sol->strides, info->buffer_strides, sizeof(sol->strides));
+
+   return true;
+}
+
+/**
+ * Pack 3DSTATE_SO_DECL_LIST.  The four streams' decl entries are
+ * interleaved into 64-bit elements, 16 bits per stream, and stored in the
+ * caller-provided memory that sol->decl points to.
+ */
+static bool
+sol_set_gen7_3DSTATE_SO_DECL_LIST(struct ilo_state_sol *sol,
+                                  const struct ilo_dev *dev,
+                                  const struct ilo_state_sol_info *info,
+                                  uint8_t max_decl_count)
+{
+   uint64_t decl_list[ILO_STATE_SOL_MAX_DECL_COUNT];
+   uint8_t decl_counts[ILO_STATE_SOL_MAX_STREAM_COUNT];
+   uint8_t buffer_selects[ILO_STATE_SOL_MAX_STREAM_COUNT];
+   uint32_t dw1, dw2;
+   uint8_t i, j;
+
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   memset(decl_list, 0, sizeof(decl_list[0]) * max_decl_count);
+
+   for (i = 0; i < ARRAY_SIZE(info->streams); i++) {
+      const struct ilo_state_sol_stream_info *stream = &info->streams[i];
+
+      assert(stream->decl_count <= max_decl_count);
+      decl_counts[i] = stream->decl_count;
+      buffer_selects[i] = 0;
+
+      for (j = 0; j < stream->decl_count; j++) {
+         const struct ilo_state_sol_decl_info *decl = &stream->decls[j];
+         /* bitmask of the components written */
+         const uint8_t mask = ((1 << decl->component_count) - 1) <<
+            decl->component_base;
+         uint16_t val;
+
+         val = decl->buffer << GEN7_SO_DECL_OUTPUT_SLOT__SHIFT |
+               mask << GEN7_SO_DECL_COMPONENT_MASK__SHIFT;
+
+         if (decl->is_hole)
+            val |= GEN7_SO_DECL_HOLE_FLAG;
+         else
+            val |= decl->attr << GEN7_SO_DECL_REG_INDEX__SHIFT;
+
+         /* stream i occupies bits [16*i, 16*i+15] of each 64-bit element */
+         decl_list[j] |= (uint64_t) val << (16 * i);
+         buffer_selects[i] |= 1 << decl->buffer;
+      }
+   }
+
+   dw1 = buffer_selects[3] << GEN7_SO_DECL_DW1_STREAM3_BUFFER_SELECTS__SHIFT |
+         buffer_selects[2] << GEN7_SO_DECL_DW1_STREAM2_BUFFER_SELECTS__SHIFT |
+         buffer_selects[1] << GEN7_SO_DECL_DW1_STREAM1_BUFFER_SELECTS__SHIFT |
+         buffer_selects[0] << GEN7_SO_DECL_DW1_STREAM0_BUFFER_SELECTS__SHIFT;
+   dw2 = decl_counts[3] << GEN7_SO_DECL_DW2_STREAM3_ENTRY_COUNT__SHIFT |
+         decl_counts[2] << GEN7_SO_DECL_DW2_STREAM2_ENTRY_COUNT__SHIFT |
+         decl_counts[1] << GEN7_SO_DECL_DW2_STREAM1_ENTRY_COUNT__SHIFT |
+         decl_counts[0] << GEN7_SO_DECL_DW2_STREAM0_ENTRY_COUNT__SHIFT;
+
+   STATIC_ASSERT(ARRAY_SIZE(sol->so_decl) >= 2);
+   sol->so_decl[0] = dw1;
+   sol->so_decl[1] = dw2;
+
+   /* each 64-bit element is stored as a pair of DWords */
+   STATIC_ASSERT(ARRAY_SIZE(sol->decl[0]) == 2);
+   memcpy(sol->decl, decl_list, sizeof(sol->decl[0]) * max_decl_count);
+   sol->decl_count = max_decl_count;
+
+   return true;
+}
+
+/**
+ * Validate a SO buffer binding: bound range, offset alignment, and the
+ * Gen8-only write offset load/save options.
+ */
+static bool
+sol_buffer_validate_gen7(const struct ilo_dev *dev,
+                         const struct ilo_state_sol_buffer_info *info)
+{
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   if (info->buf)
+      assert(info->offset < info->buf->bo_size && info->size);
+
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 1, page 208:
+    *
+    *     "(Surface Base Address) This field specifies the starting DWord
+    *      address..."
+    */
+   assert(info->offset % 4 == 0);
+
+   /* Gen8+ only */
+   if (info->write_offset_load || info->write_offset_save)
+      assert(ilo_dev_gen(dev) >= ILO_GEN(8));
+
+   /*
+    * From the Broadwell PRM, volume 2b, page 206:
+    *
+    *     "This field (Stream Offset) specifies the Offset in stream output
+    *      buffer to start at, or whether to append to the end of an existing
+    *      buffer.  The Offset must be DWORD aligned."
+    */
+   if (info->write_offset_imm_enable) {
+      /* an immediate offset is only consumed when a load is requested */
+      assert(info->write_offset_load);
+      assert(info->write_offset_imm % 4 == 0);
+   }
+
+   return true;
+}
+
+static uint32_t
+sol_buffer_get_gen6_size(const struct ilo_dev *dev,
+                         const struct ilo_state_sol_buffer_info *info)
+{
+   uint32_t avail;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* no buffer bound */
+   if (!info->buf)
+      return 0;
+
+   /* clamp the requested range to what the bo actually provides */
+   if (info->offset + info->size <= info->buf->bo_size)
+      avail = info->size;
+   else
+      avail = info->buf->bo_size - info->offset;
+
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 1, page 208:
+    *
+    *     "(Surface End Address) This field specifies the ending DWord
+    *      address..."
+    *
+    * Round down to a DWord boundary.
+    */
+   return avail & ~3;
+}
+
+static bool
+sol_buffer_set_gen7_3dstate_so_buffer(struct ilo_state_sol_buffer *sb,
+                                      const struct ilo_dev *dev,
+                                      const struct ilo_state_sol_buffer_info *info)
+{
+   uint32_t bound_size;
+
+   ILO_DEV_ASSERT(dev, 7, 7.5);
+
+   if (!sol_buffer_validate_gen7(dev, info))
+      return false;
+
+   bound_size = sol_buffer_get_gen6_size(dev, info);
+
+   /* DW1 is the start address; DW2 the end address (both zero when unbound) */
+   STATIC_ASSERT(ARRAY_SIZE(sb->so_buf) >= 2);
+   sb->so_buf[0] = info->offset;
+   sb->so_buf[1] = bound_size ? info->offset + bound_size : 0;
+
+   return true;
+}
+
+/**
+ * Pack Gen8 3DSTATE_SO_BUFFER (enable flags, base, size, stream offset)
+ * into sb->so_buf[].
+ */
+static bool
+sol_buffer_set_gen8_3dstate_so_buffer(struct ilo_state_sol_buffer *sb,
+                                      const struct ilo_dev *dev,
+                                      const struct ilo_state_sol_buffer_info *info)
+{
+   const uint32_t size = sol_buffer_get_gen6_size(dev, info);
+   uint32_t dw1;
+
+   ILO_DEV_ASSERT(dev, 8, 8);
+
+   if (!sol_buffer_validate_gen7(dev, info))
+      return false;
+
+   dw1 = 0;
+
+   if (info->buf)
+      dw1 |= GEN8_SO_BUF_DW1_ENABLE;
+   if (info->write_offset_load)
+      dw1 |= GEN8_SO_BUF_DW1_OFFSET_WRITE_ENABLE;
+   if (info->write_offset_save)
+      dw1 |= GEN8_SO_BUF_DW1_OFFSET_ENABLE;
+
+   STATIC_ASSERT(ARRAY_SIZE(sb->so_buf) >= 4);
+   sb->so_buf[0] = dw1;
+   sb->so_buf[1] = info->offset;
+
+   /*
+    * From the Broadwell PRM, volume 2b, page 205:
+    *
+    *     "This field (Surface Size) specifies the size of buffer in number
+    *      DWords minus 1 of the buffer in Graphics Memory."
+    */
+   sb->so_buf[2] = (size) ? size / 4 - 1 : 0;
+
+   /* load from imm or sb->write_offset_bo (~0u selects the bo) */
+   sb->so_buf[3] = (info->write_offset_imm_enable) ?
+      info->write_offset_imm : ~0u;
+
+   return true;
+}
+
+/**
+ * Initialize SO state.  info->data must be zeroed storage of at least
+ * ilo_state_sol_data_size() bytes; sol->decl is made to point into it, so
+ * it must remain valid for the lifetime of the state.
+ */
+bool
+ilo_state_sol_init(struct ilo_state_sol *sol,
+                   const struct ilo_dev *dev,
+                   const struct ilo_state_sol_info *info)
+{
+   bool ret = true;
+
+   assert(ilo_is_zeroed(sol, sizeof(*sol)));
+   assert(ilo_is_zeroed(info->data, info->data_size));
+
+   /* nothing is packed on Gen6; SO_DECL_LIST/STREAMOUT are Gen7+ */
+   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+      uint8_t max_decl_count, i;
+
+      /* one shared decl list; size it for the most demanding stream */
+      max_decl_count = info->streams[0].decl_count;
+      for (i = 1; i < ARRAY_SIZE(info->streams); i++) {
+         if (max_decl_count < info->streams[i].decl_count)
+            max_decl_count = info->streams[i].decl_count;
+      }
+
+      assert(ilo_state_sol_data_size(dev, max_decl_count) <= info->data_size);
+      sol->decl = (uint32_t (*)[2]) info->data;
+
+      ret &= sol_set_gen7_3DSTATE_STREAMOUT(sol, dev, info);
+      ret &= sol_set_gen7_3DSTATE_SO_DECL_LIST(sol, dev, info, max_decl_count);
+   }
+
+   assert(ret);
+
+   return ret;
+}
+
+bool
+ilo_state_sol_init_disabled(struct ilo_state_sol *sol,
+                            const struct ilo_dev *dev,
+                            bool render_disable)
+{
+   /* an all-zero info disables SO; only the rendering switch varies */
+   struct ilo_state_sol_info disabled_info;
+
+   memset(&disabled_info, 0, sizeof(disabled_info));
+   disabled_info.render_disable = render_disable;
+
+   return ilo_state_sol_init(sol, dev, &disabled_info);
+}
+
+bool
+ilo_state_sol_buffer_init(struct ilo_state_sol_buffer *sb,
+                          const struct ilo_dev *dev,
+                          const struct ilo_state_sol_buffer_info *info)
+{
+   bool ok;
+
+   assert(ilo_is_zeroed(sb, sizeof(*sb)));
+
+   /* pick the packing routine for the target gen */
+   ok = (ilo_dev_gen(dev) >= ILO_GEN(8)) ?
+      sol_buffer_set_gen8_3dstate_so_buffer(sb, dev, info) :
+      sol_buffer_set_gen7_3dstate_so_buffer(sb, dev, info);
+
+   sb->need_bo = (info->size > 0);
+   /* a write offset bo is needed to save, or to load without an immediate */
+   sb->need_write_offset_bo = (info->write_offset_save ||
+         (info->write_offset_load && !info->write_offset_imm_enable));
+
+   assert(ok);
+
+   return ok;
+}
+
+bool
+ilo_state_sol_buffer_init_disabled(struct ilo_state_sol_buffer *sb,
+                                   const struct ilo_dev *dev)
+{
+   /* an all-zero info describes an unbound SO buffer */
+   struct ilo_state_sol_buffer_info no_buffer;
+
+   memset(&no_buffer, 0, sizeof(no_buffer));
+
+   return ilo_state_sol_buffer_init(sb, dev, &no_buffer);
+}
diff --git a/src/gallium/drivers/ilo/core/ilo_state_sol.h b/src/gallium/drivers/ilo/core/ilo_state_sol.h
new file mode 100644
index 00000000000..2513fcb4979
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_sol.h
@@ -0,0 +1,166 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Chia-I Wu <[email protected]>
+ */
+
+#ifndef ILO_STATE_SOL_H
+#define ILO_STATE_SOL_H
+
+#include "genhw/genhw.h"
+
+#include "ilo_core.h"
+#include "ilo_dev.h"
+
+/*
+ * From the Ivy Bridge PRM, volume 2 part 1, page 193:
+ *
+ * "Incoming topologies are tagged with a 2-bit StreamID."
+ */
+#define ILO_STATE_SOL_MAX_STREAM_COUNT 4
+
+/*
+ * From the Ivy Bridge PRM, volume 2 part 1, page 195:
+ *
+ * "Up to four SO buffers are supported."
+ */
+#define ILO_STATE_SOL_MAX_BUFFER_COUNT 4
+
+/*
+ * From the Ivy Bridge PRM, volume 2 part 1, page 201:
+ *
+ * "All 128 decls..."
+ */
+#define ILO_STATE_SOL_MAX_DECL_COUNT 128
+
+/**
+ * Output a vertex attribute to a SO buffer.
+ */
+struct ilo_state_sol_decl_info {
+   /* select an attribute from read ones */
+   uint8_t attr;
+   /* when set, emit a hole instead of attribute data; attr is not used */
+   bool is_hole;
+
+   /* which components to write: [component_base, component_base + count) */
+   uint8_t component_base;
+   uint8_t component_count;
+
+   /* destination buffer */
+   uint8_t buffer;
+};
+
+struct ilo_state_sol_stream_info {
+   /* which VUE attributes to read, in 128-bit attribute units */
+   uint8_t cv_vue_attr_count;
+   uint8_t vue_read_base;
+   uint8_t vue_read_count;
+
+   /* output declarations for this stream; decls holds decl_count entries */
+   uint8_t decl_count;
+   const struct ilo_state_sol_decl_info *decls;
+};
+
+struct ilo_state_sol_info {
+   /*
+    * Zeroed storage for the packed SO_DECL list, at least
+    * ilo_state_sol_data_size() bytes; ilo_state_sol keeps a pointer into it
+    */
+   void *data;
+   size_t data_size;
+
+   bool sol_enable;
+   bool stats_enable;
+   enum gen_reorder_mode tristrip_reorder;
+
+   bool render_disable;
+   /* ignored when SOL is disabled */
+   uint8_t render_stream;
+
+   /* a buffer is disabled when its stride is zero */
+   uint16_t buffer_strides[ILO_STATE_SOL_MAX_BUFFER_COUNT];
+
+   struct ilo_state_sol_stream_info streams[ILO_STATE_SOL_MAX_STREAM_COUNT];
+};
+
+struct ilo_state_sol {
+   /* 3DSTATE_STREAMOUT DW1-DW2 */
+   uint32_t streamout[2];
+   /* per-buffer surface pitches in bytes */
+   uint16_t strides[4];
+
+   /* 3DSTATE_SO_DECL_LIST DW1-DW2 */
+   uint32_t so_decl[2];
+   /* packed decl elements (two DWords each), stored in caller-provided data */
+   uint32_t (*decl)[2];
+   uint8_t decl_count;
+};
+
+struct ilo_buffer;
+
+struct ilo_state_sol_buffer_info {
+   const struct ilo_buffer *buf;
+   /* bound range in bytes; the offset must be DWord-aligned */
+   uint32_t offset;
+   uint32_t size;
+
+   /*
+    * Gen8+ only.  When enabled, require a write offset bo of at least
+    * (sizeof(uint32_t) * ILO_STATE_SOL_MAX_BUFFER_COUNT) bytes
+    */
+   bool write_offset_load;
+   bool write_offset_save;
+
+   /* load the write offset from write_offset_imm instead of the bo */
+   bool write_offset_imm_enable;
+   /* must be DWord-aligned */
+   uint32_t write_offset_imm;
+};
+
+struct ilo_state_sol_buffer {
+   /* 3DSTATE_SO_BUFFER payload (two DWords used on Gen7, four on Gen8) */
+   uint32_t so_buf[4];
+
+   /* whether the user must bind a data bo / a write offset bo */
+   bool need_bo;
+   bool need_write_offset_bo;
+
+   /* managed by users */
+   struct intel_bo *bo;
+   struct intel_bo *write_offset_bo;
+};
+
+static inline size_t
+ilo_state_sol_data_size(const struct ilo_dev *dev, uint8_t max_decl_count)
+{
+   const struct ilo_state_sol *so = NULL;
+
+   /* Gen6 packs no SO_DECL list and needs no external storage */
+   if (ilo_dev_gen(dev) < ILO_GEN(7))
+      return 0;
+
+   /* one two-DWord element per decl (sizeof on NULL is compile-time only) */
+   return sizeof(so->decl[0]) * max_decl_count;
+}
+
+bool
+ilo_state_sol_init(struct ilo_state_sol *sol,
+ const struct ilo_dev *dev,
+ const struct ilo_state_sol_info *info);
+
+bool
+ilo_state_sol_init_disabled(struct ilo_state_sol *sol,
+ const struct ilo_dev *dev,
+ bool render_disable);
+
+bool
+ilo_state_sol_buffer_init(struct ilo_state_sol_buffer *sb,
+ const struct ilo_dev *dev,
+ const struct ilo_state_sol_buffer_info *info);
+
+bool
+ilo_state_sol_buffer_init_disabled(struct ilo_state_sol_buffer *sb,
+ const struct ilo_dev *dev);
+
+#endif /* ILO_STATE_SOL_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_state_surface.c b/src/gallium/drivers/ilo/core/ilo_state_surface.c
new file mode 100644
index 00000000000..5be9f8f6270
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_surface.c
@@ -0,0 +1,1179 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2012-2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Chia-I Wu <[email protected]>
+ */
+
+#include "ilo_debug.h"
+#include "ilo_buffer.h"
+#include "ilo_image.h"
+#include "ilo_state_surface.h"
+
+static bool
+surface_set_gen6_null_SURFACE_STATE(struct ilo_state_surface *surf,
+                                    const struct ilo_dev *dev)
+{
+   ILO_DEV_ASSERT(dev, 6, 6);
+
+   /*
+    * From the Sandy Bridge PRM, volume 4 part 1, page 71:
+    *
+    *     "All of the remaining fields in surface state are ignored for null
+    *      surfaces, with the following exceptions:
+    *
+    *      - [DevSNB+]: Width, Height, Depth, and LOD fields must match the
+    *        depth buffer's corresponding state for all render target
+    *        surfaces, including null.
+    *      - Surface Format must be R8G8B8A8_UNORM."
+    *
+    * From the Sandy Bridge PRM, volume 4 part 1, page 82:
+    *
+    *     "If Surface Type is SURFTYPE_NULL, this field (Tiled Surface) must
+    *      be true"
+    *
+    * Note that we ignore the first exception for all surface types.
+    */
+   STATIC_ASSERT(ARRAY_SIZE(surf->surface) >= 6);
+   surf->surface[0] = GEN6_SURFTYPE_NULL << GEN6_SURFACE_DW0_TYPE__SHIFT |
+                      GEN6_FORMAT_R8G8B8A8_UNORM << GEN6_SURFACE_DW0_FORMAT__SHIFT;
+   surf->surface[1] = 0;
+   surf->surface[2] = 0;
+   surf->surface[3] = GEN6_TILING_X << GEN6_SURFACE_DW3_TILING__SHIFT;
+   surf->surface[4] = 0;
+   surf->surface[5] = 0;
+
+   return true;
+}
+
+static bool
+surface_set_gen7_null_SURFACE_STATE(struct ilo_state_surface *surf,
+                                    const struct ilo_dev *dev)
+{
+   const bool is_gen8 = (ilo_dev_gen(dev) >= ILO_GEN(8));
+   uint32_t dw0;
+
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   dw0 = GEN6_SURFTYPE_NULL << GEN7_SURFACE_DW0_TYPE__SHIFT |
+         GEN6_FORMAT_R8G8B8A8_UNORM << GEN7_SURFACE_DW0_FORMAT__SHIFT;
+   /* the tiling field moved within DW0 on Gen8 */
+   if (is_gen8)
+      dw0 |= GEN6_TILING_X << GEN8_SURFACE_DW0_TILING__SHIFT;
+   else
+      dw0 |= GEN6_TILING_X << GEN7_SURFACE_DW0_TILING__SHIFT;
+
+   STATIC_ASSERT(ARRAY_SIZE(surf->surface) >= 13);
+   surf->surface[0] = dw0;
+   /* clear the remaining DWords: 13 total on Gen8, 8 on Gen7 */
+   memset(&surf->surface[1], 0,
+          sizeof(uint32_t) * ((is_gen8 ? 13 : 8) - 1));
+
+   return true;
+}
+
+/**
+ * Validate a buffer surface: access mode, bound range, struct size, and
+ * the alignment rules imposed by the data port messages.
+ */
+static bool
+surface_validate_gen6_buffer(const struct ilo_dev *dev,
+                             const struct ilo_state_surface_buffer_info *info)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* SVB writes are Gen6-only */
+   if (ilo_dev_gen(dev) >= ILO_GEN(7))
+      assert(info->access != ILO_STATE_SURFACE_ACCESS_DP_SVB);
+
+   if (info->offset + info->size > info->buf->bo_size) {
+      ilo_warn("invalid buffer range\n");
+      return false;
+   }
+
+   /*
+    * From the Sandy Bridge PRM, volume 4 part 1, page 81:
+    *
+    *     "For surfaces of type SURFTYPE_BUFFER: [0,2047] -> [1B, 2048B]
+    *      For surfaces of type SURFTYPE_STRBUF: [0,2047] -> [1B, 2048B]"
+    */
+   if (!info->struct_size || info->struct_size > 2048) {
+      ilo_warn("invalid buffer struct size\n");
+      return false;
+   }
+
+   /*
+    * From the Ivy Bridge PRM, volume 4 part 1, page 68:
+    *
+    *     "The Base Address for linear render target surfaces and surfaces
+    *      accessed with the typed surface read/write data port messages must
+    *      be element-size aligned, for non-YUV surface formats, or a multiple
+    *      of 2 element-sizes for YUV surface formats.  Other linear surfaces
+    *      have no alignment requirements (byte alignment is sufficient)."
+    *
+    *     "Certain message types used to access surfaces have more stringent
+    *      alignment requirements.  Please refer to the specific message
+    *      documentation for additional restrictions."
+    *
+    * From the Ivy Bridge PRM, volume 4 part 1, page 233, 235, and 237:
+    *
+    *     "the surface base address must be OWord aligned"
+    *
+    * for OWord Block Read/Write, Unaligned OWord Block Read, and OWord Dual
+    * Block Read/Write.
+    *
+    * From the Ivy Bridge PRM, volume 4 part 1, page 246 and 249:
+    *
+    *     "The surface base address must be DWord aligned"
+    *
+    * for DWord Scattered Read/Write and Byte Scattered Read/Write.
+    *
+    * We have to rely on users to correctly set info->struct_size here.  DWord
+    * Scattered Read/Write has conflicting pitch and alignment, but we do not
+    * use them yet so we are fine.
+    *
+    * It is unclear if sampling engine surfaces require aligned offsets.
+    */
+   if (info->access != ILO_STATE_SURFACE_ACCESS_DP_SVB) {
+      /* the pitch must hold a whole number of elements */
+      assert(info->struct_size % info->format_size == 0);
+
+      if (info->offset % info->struct_size) {
+         ilo_warn("bad buffer offset\n");
+         return false;
+      }
+   }
+
+   if (info->format == GEN6_FORMAT_RAW) {
+      /*
+       * From the Sandy Bridge PRM, volume 4 part 1, page 97:
+       *
+       *     ""RAW" is supported only with buffers and structured buffers
+       *      accessed via the untyped surface read/write and untyped atomic
+       *      operation messages, which do not have a column in the table."
+       *
+       * We do not have a specific access mode for untyped messages.
+       */
+      assert(info->access == ILO_STATE_SURFACE_ACCESS_DP_UNTYPED);
+
+      /*
+       * Nothing is said about Untyped* messages, but I guess they require the
+       * base address to be DWord aligned.
+       */
+      if (info->offset % 4) {
+         ilo_warn("bad RAW buffer offset\n");
+         return false;
+      }
+
+      if (info->struct_size > 1) {
+         /* no STRBUF on Gen6 */
+         if (ilo_dev_gen(dev) == ILO_GEN(6)) {
+            ilo_warn("no STRBUF support\n");
+            return false;
+         }
+
+         /*
+          * From the Ivy Bridge PRM, volume 4 part 1, page 70:
+          *
+          *     "For linear surfaces with Surface Type of SURFTYPE_STRBUF, the
+          *      pitch must be a multiple of 4 bytes."
+          */
+         if (info->struct_size % 4) {
+            ilo_warn("bad STRBUF pitch\n");
+            return false;
+         }
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Compute the encoded (minus-one) struct count that SURFACE_STATE spreads
+ * across its Width/Height/Depth fields.  Returns false when the count is
+ * zero or exceeds the hardware maximum.
+ */
+static bool
+surface_get_gen6_buffer_struct_count(const struct ilo_dev *dev,
+                                     const struct ilo_state_surface_buffer_info *info,
+                                     uint32_t *count)
+{
+   uint32_t max_struct, c;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   c = info->size / info->struct_size;
+   /*
+    * For SVB writes, a trailing partial struct still counts when at least
+    * one complete element fits in the leftover bytes.  The comparison must
+    * be <= (not <): a remainder exactly equal to the element size holds a
+    * whole element and must not be dropped.
+    */
+   if (info->access == ILO_STATE_SURFACE_ACCESS_DP_SVB &&
+       info->format_size <= info->size - info->struct_size * c)
+      c++;
+
+   /*
+    * From the Sandy Bridge PRM, volume 4 part 1, page 77:
+    *
+    *     "For buffer surfaces, the number of entries in the buffer ranges
+    *      from 1 to 2^27."
+    *
+    * From the Ivy Bridge PRM, volume 4 part 1, page 68:
+    *
+    *     "For typed buffer and structured buffer surfaces, the number of
+    *      entries in the buffer ranges from 1 to 2^27.  For raw buffer
+    *      surfaces, the number of entries in the buffer is the number of
+    *      bytes which can range from 1 to 2^30."
+    *
+    * From the Ivy Bridge PRM, volume 4 part 1, page 69:
+    *
+    *     For SURFTYPE_BUFFER: The low two bits of this field (Width) must be
+    *     11 if the Surface Format is RAW (the size of the buffer must be a
+    *     multiple of 4 bytes)."
+    */
+   max_struct = 1 << 27;
+   if (info->format == GEN6_FORMAT_RAW && info->struct_size == 1) {
+      if (ilo_dev_gen(dev) >= ILO_GEN(7))
+         max_struct = 1 << 30;
+
+      /* a RAW buffer size must be a multiple of 4 bytes */
+      c &= ~3;
+   }
+
+   if (!c || c > max_struct) {
+      ilo_warn("too many or zero buffer structs\n");
+      return false;
+   }
+
+   /* the hardware fields are encoded minus one */
+   *count = c - 1;
+
+   return true;
+}
+
+/**
+ * Pack Gen6 SURFACE_STATE for a SURFTYPE_BUFFER surface.  The zero-based
+ * struct count is split across the Width (bits 6:0), Height (19:7), and
+ * Depth (26:20) fields.
+ */
+static bool
+surface_set_gen6_buffer_SURFACE_STATE(struct ilo_state_surface *surf,
+                                      const struct ilo_dev *dev,
+                                      const struct ilo_state_surface_buffer_info *info)
+{
+   uint32_t dw0, dw1, dw2, dw3;
+   uint32_t struct_count;
+   int width, height, depth;
+
+   ILO_DEV_ASSERT(dev, 6, 6);
+
+   if (!surface_validate_gen6_buffer(dev, info) ||
+       !surface_get_gen6_buffer_struct_count(dev, info, &struct_count))
+      return false;
+
+   /* bits [6:0] */
+   width = (struct_count & 0x0000007f);
+   /* bits [19:7] */
+   height = (struct_count & 0x000fff80) >> 7;
+   /* bits [26:20] */
+   depth = (struct_count & 0x07f00000) >> 20;
+
+   dw0 = GEN6_SURFTYPE_BUFFER << GEN6_SURFACE_DW0_TYPE__SHIFT |
+         info->format << GEN6_SURFACE_DW0_FORMAT__SHIFT;
+   /* DW1 is the surface base address (byte offset into the bo) */
+   dw1 = info->offset;
+   dw2 = height << GEN6_SURFACE_DW2_HEIGHT__SHIFT |
+         width << GEN6_SURFACE_DW2_WIDTH__SHIFT;
+   /* the pitch field is encoded minus one */
+   dw3 = depth << GEN6_SURFACE_DW3_DEPTH__SHIFT |
+         (info->struct_size - 1) << GEN6_SURFACE_DW3_PITCH__SHIFT;
+
+   STATIC_ASSERT(ARRAY_SIZE(surf->surface) >= 6);
+   surf->surface[0] = dw0;
+   surf->surface[1] = dw1;
+   surf->surface[2] = dw2;
+   surf->surface[3] = dw3;
+   surf->surface[4] = 0;
+   surf->surface[5] = 0;
+
+   surf->type = GEN6_SURFTYPE_BUFFER;
+   surf->min_lod = 0;
+   surf->mip_count = 0;
+
+   return true;
+}
+
+/**
+ * Pack Gen7/Gen8 SURFACE_STATE for a buffer surface.  RAW buffers with a
+ * struct size greater than one become SURFTYPE_STRBUF.  The struct count
+ * spans Width (bits 6:0), Height (20:7), and Depth (30:21).
+ */
+static bool
+surface_set_gen7_buffer_SURFACE_STATE(struct ilo_state_surface *surf,
+                                      const struct ilo_dev *dev,
+                                      const struct ilo_state_surface_buffer_info *info)
+{
+   uint32_t dw0, dw1, dw2, dw3, dw7;
+   enum gen_surface_type type;
+   uint32_t struct_count;
+   int width, height, depth;
+
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   if (!surface_validate_gen6_buffer(dev, info) ||
+       !surface_get_gen6_buffer_struct_count(dev, info, &struct_count))
+      return false;
+
+   type = (info->format == GEN6_FORMAT_RAW && info->struct_size > 1) ?
+      GEN7_SURFTYPE_STRBUF : GEN6_SURFTYPE_BUFFER;
+
+   /* bits [6:0] */
+   width = (struct_count & 0x0000007f);
+   /* bits [20:7] */
+   height = (struct_count & 0x001fff80) >> 7;
+   /* bits [30:21] */
+   depth = (struct_count & 0x7fe00000) >> 21;
+
+   dw0 = type << GEN7_SURFACE_DW0_TYPE__SHIFT |
+         info->format << GEN7_SURFACE_DW0_FORMAT__SHIFT;
+   /* on Gen8 the base address moves to DW8; DW1 stays zero there */
+   dw1 = (ilo_dev_gen(dev) >= ILO_GEN(8)) ? 0 : info->offset;
+   dw2 = GEN_SHIFT32(height, GEN7_SURFACE_DW2_HEIGHT) |
+         GEN_SHIFT32(width, GEN7_SURFACE_DW2_WIDTH);
+   dw3 = GEN_SHIFT32(depth, GEN7_SURFACE_DW3_DEPTH) |
+         GEN_SHIFT32(info->struct_size - 1, GEN7_SURFACE_DW3_PITCH);
+
+   dw7 = 0;
+   /* Gen7.5+ shader channel selects: identity swizzle */
+   if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) {
+      dw7 |= GEN_SHIFT32(GEN75_SCS_RED, GEN75_SURFACE_DW7_SCS_R) |
+             GEN_SHIFT32(GEN75_SCS_GREEN, GEN75_SURFACE_DW7_SCS_G) |
+             GEN_SHIFT32(GEN75_SCS_BLUE, GEN75_SURFACE_DW7_SCS_B) |
+             GEN_SHIFT32(GEN75_SCS_ALPHA, GEN75_SURFACE_DW7_SCS_A);
+   }
+
+   STATIC_ASSERT(ARRAY_SIZE(surf->surface) >= 13);
+   surf->surface[0] = dw0;
+   surf->surface[1] = dw1;
+   surf->surface[2] = dw2;
+   surf->surface[3] = dw3;
+   surf->surface[4] = 0;
+   surf->surface[5] = 0;
+   surf->surface[6] = 0;
+   surf->surface[7] = dw7;
+   if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
+      surf->surface[8] = info->offset;
+      surf->surface[9] = 0;
+      surf->surface[10] = 0;
+      surf->surface[11] = 0;
+      surf->surface[12] = 0;
+   }
+
+   surf->type = type;
+   surf->min_lod = 0;
+   surf->mip_count = 0;
+
+   return true;
+}
+
+static enum gen_surface_type
+get_gen6_surface_type(const struct ilo_dev *dev, const struct ilo_image *img)
+{
+   enum gen_surface_type type;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* map a gallium texture target to a hardware surface type */
+   switch (img->target) {
+   case PIPE_TEXTURE_1D:
+   case PIPE_TEXTURE_1D_ARRAY:
+      type = GEN6_SURFTYPE_1D;
+      break;
+   case PIPE_TEXTURE_2D:
+   case PIPE_TEXTURE_CUBE:
+   case PIPE_TEXTURE_RECT:
+   case PIPE_TEXTURE_2D_ARRAY:
+   case PIPE_TEXTURE_CUBE_ARRAY:
+      /* cube maps are 2D surfaces with six (or 6*n) slices */
+      type = GEN6_SURFTYPE_2D;
+      break;
+   case PIPE_TEXTURE_3D:
+      type = GEN6_SURFTYPE_3D;
+      break;
+   default:
+      assert(!"unknown texture target");
+      type = GEN6_SURFTYPE_NULL;
+      break;
+   }
+
+   return type;
+}
+
+/**
+ * Validate an image surface: access mode, pitch and width limits, cube
+ * squareness, and tiling support.
+ */
+static bool
+surface_validate_gen6_image(const struct ilo_dev *dev,
+                            const struct ilo_state_surface_image_info *info)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   switch (info->access) {
+   case ILO_STATE_SURFACE_ACCESS_SAMPLER:
+   case ILO_STATE_SURFACE_ACCESS_DP_RENDER:
+      break;
+   case ILO_STATE_SURFACE_ACCESS_DP_TYPED:
+      /* typed data port messages are Gen7+ */
+      assert(ilo_dev_gen(dev) >= ILO_GEN(7));
+      break;
+   default:
+      assert(!"unsupported surface access");
+      break;
+   }
+
+   /*
+    * From the Sandy Bridge PRM, volume 4 part 1, page 78:
+    *
+    *     "For surface types other than SURFTYPE_BUFFER, the Width specified
+    *      by this field must be less than or equal to the surface pitch
+    *      (specified in bytes via the Surface Pitch field)."
+    */
+   assert(info->img->bo_stride && info->img->bo_stride <= 512 * 1024 &&
+          info->img->width0 <= info->img->bo_stride);
+
+   if (info->is_cube_map) {
+      assert(get_gen6_surface_type(dev, info->img) == GEN6_SURFTYPE_2D);
+
+      /*
+       * From the Sandy Bridge PRM, volume 4 part 1, page 78:
+       *
+       *     "For cube maps, Width must be set equal to the Height."
+       */
+      assert(info->img->width0 == info->img->height0);
+   }
+
+   /*
+    * From the Sandy Bridge PRM, volume 4 part 1, page 72:
+    *
+    *     "Tile Walk TILEWALK_YMAJOR is UNDEFINED for render target formats
+    *      that have 128 bits-per-element (BPE)."
+    *
+    *     "If Number of Multisamples is set to a value other than
+    *      MULTISAMPLECOUNT_1, this field cannot be set to the following
+    *      formats:
+    *
+    *      - any format with greater than 64 bits per element
+    *      - any compressed texture format (BC*)
+    *      - any YCRCB* format"
+    *
+    * From the Ivy Bridge PRM, volume 4 part 1, page 63:
+    *
+    *     If Number of Multisamples is set to a value other than
+    *     MULTISAMPLECOUNT_1, this field cannot be set to the following
+    *     formats: any format with greater than 64 bits per element, if
+    *     Number of Multisamples is MULTISAMPLECOUNT_8, any compressed
+    *     texture format (BC*), and any YCRCB* format.
+    *
+    * TODO
+    */
+
+   if (ilo_dev_gen(dev) < ILO_GEN(8) && info->img->tiling == GEN8_TILING_W) {
+      ilo_warn("tiling W is not supported\n");
+      return false;
+   }
+
+   return true;
+}
+
+/* Compute the maximum width/height allowed for img's surface type. */
+static void
+get_gen6_max_extent(const struct ilo_dev *dev,
+ const struct ilo_image *img,
+ uint16_t *max_w, uint16_t *max_h)
+{
+ /* Gen7+ raises the 1D/2D surface size limit from 8192 to 16384 */
+ const uint16_t max_size = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 16384 : 8192;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ switch (get_gen6_surface_type(dev, img)) {
+ case GEN6_SURFTYPE_1D:
+ *max_w = max_size;
+ *max_h = 1;
+ break;
+ case GEN6_SURFTYPE_2D:
+ *max_w = max_size;
+ *max_h = max_size;
+ break;
+ case GEN6_SURFTYPE_3D:
+ /* 3D surfaces are limited to 2048 on all gens handled here */
+ *max_w = 2048;
+ *max_h = 2048;
+ break;
+ default:
+ assert(!"invalid surface type");
+ *max_w = 1;
+ *max_h = 1;
+ break;
+ }
+}
+
+/*
+ * Validate the image dimensions against the per-type limits and return the
+ * Width/Height field values (the hardware stores both minus one).
+ */
+static bool
+surface_get_gen6_image_extent(const struct ilo_dev *dev,
+ const struct ilo_state_surface_image_info *info,
+ uint16_t *width, uint16_t *height)
+{
+ uint16_t w, h, max_w, max_h;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ w = info->img->width0;
+ h = info->img->height0;
+
+ get_gen6_max_extent(dev, info->img, &max_w, &max_h);
+ assert(w && h && w <= max_w && h <= max_h);
+
+ /* the SURFACE_STATE fields hold (dimension - 1) */
+ *width = w - 1;
+ *height = h - 1;
+
+ return true;
+}
+
+/*
+ * Validate the slice range and compute the Depth, Minimum Array Element,
+ * and Render Target View Extent field values for SURFACE_STATE.
+ */
+static bool
+surface_get_gen6_image_slices(const struct ilo_dev *dev,
+ const struct ilo_state_surface_image_info *info,
+ uint16_t *depth, uint16_t *min_array_elem,
+ uint16_t *rt_view_extent)
+{
+ uint16_t max_slice, d;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ /*
+ * From the Ivy Bridge PRM, volume 4 part 1, page 63:
+ *
+ * "If this field (Surface Array) is enabled, the Surface Type must be
+ * SURFTYPE_1D, SURFTYPE_2D, or SURFTYPE_CUBE. If this field is
+ * disabled and Surface Type is SURFTYPE_1D, SURFTYPE_2D, or
+ * SURFTYPE_CUBE, the Depth field must be set to zero."
+ *
+ * From the Ivy Bridge PRM, volume 4 part 1, page 69:
+ *
+ * "This field (Depth) specifies the total number of levels for a
+ * volume texture or the number of array elements allowed to be
+ * accessed starting at the Minimum Array Element for arrayed
+ * surfaces. If the volume texture is MIP-mapped, this field
+ * specifies the depth of the base MIP level."
+ *
+ * "For SURFTYPE_CUBE:For Sampling Engine Surfaces, the range of this
+ * field is [0,340], indicating the number of cube array elements
+ * (equal to the number of underlying 2D array elements divided by 6).
+ * For other surfaces, this field must be zero."
+ *
+ * "Errata: For SURFTYPE_CUBE sampling engine surfaces, the range of
+ * this field is limited to [0,85].
+ *
+ * Errata: If Surface Array is enabled, and Depth is between 1024 and
+ * 2047, an incorrect array slice may be accessed if the requested
+ * array index in the message is greater than or equal to 4096."
+ *
+ * The errata are Gen7-specific, and they limit the number of useable
+ * layers to (86 * 6), about 512.
+ */
+
+ switch (get_gen6_surface_type(dev, info->img)) {
+ case GEN6_SURFTYPE_1D:
+ case GEN6_SURFTYPE_2D:
+ max_slice = (ilo_dev_gen(dev) >= ILO_GEN(7.5)) ? 2048 : 512;
+
+ assert(info->img->array_size <= max_slice);
+ max_slice = info->img->array_size;
+
+ d = info->slice_count;
+ if (info->is_cube_map) {
+ /* the Depth field counts cube array elements: 6 2D slices each */
+ if (info->access == ILO_STATE_SURFACE_ACCESS_SAMPLER) {
+ if (!d || d % 6) {
+ ilo_warn("invalid cube slice count\n");
+ return false;
+ }
+
+ /* Gen7 errata above: at most 86 cube array elements */
+ if (ilo_dev_gen(dev) == ILO_GEN(7) && d > 86 * 6) {
+ ilo_warn("cube slice count exceeds Gen7 limit\n");
+ return false;
+ }
+ } else {
+ /*
+ * Minimum Array Element and Depth must be 0; Render Target View
+ * Extent is ignored.
+ */
+ if (info->slice_base || d != 6) {
+ ilo_warn("no cube RT array support in data port\n");
+ return false;
+ }
+ }
+
+ d /= 6;
+ }
+
+ if (!info->is_array && d > 1) {
+ ilo_warn("non-array surface with non-zero depth\n");
+ return false;
+ }
+ break;
+ case GEN6_SURFTYPE_3D:
+ max_slice = 2048;
+
+ assert(info->img->depth0 <= max_slice);
+ /* the accessible slice range shrinks with the base level of the view */
+ max_slice = u_minify(info->img->depth0, info->level_base);
+
+ /* per the PRM quote above, Depth holds the depth of the base level */
+ d = info->img->depth0;
+
+ if (info->is_array) {
+ ilo_warn("3D surfaces cannot be arrays\n");
+ return false;
+ }
+ break;
+ default:
+ assert(!"invalid surface type");
+ return false;
+ break;
+ }
+
+ if (!info->slice_count ||
+ info->slice_base + info->slice_count > max_slice) {
+ ilo_warn("invalid slice range\n");
+ return false;
+ }
+
+ assert(d);
+ *depth = d - 1;
+
+ /*
+ * From the Sandy Bridge PRM, volume 4 part 1, page 84:
+ *
+ * "For Sampling Engine and Render Target 1D and 2D Surfaces:
+ * This field (Minimum Array Element) indicates the minimum array
+ * element that can be accessed as part of this surface. This field
+ * is added to the delivered array index before it is used to address
+ * the surface.
+ *
+ * For Render Target 3D Surfaces:
+ * This field indicates the minimum `R' coordinate on the LOD
+ * currently being rendered to. This field is added to the delivered
+ * array index before it is used to address the surface.
+ *
+ * For Sampling Engine Cube Surfaces on [DevSNB+] only:
+ * This field indicates the minimum array element in the underlying 2D
+ * surface array that can be accessed as part of this surface (the
+ * cube array index is multipled by 6 to compute this value, although
+ * this field is not restricted to only multiples of 6). This field is
+ * added to the delivered array index before it is used to address the
+ * surface.
+ *
+ * For Other Surfaces:
+ * This field must be set to zero."
+ *
+ * On Gen7+, typed surfaces are treated like sampling engine 1D and 2D
+ * surfaces.
+ */
+ *min_array_elem = info->slice_base;
+
+ /*
+ * From the Sandy Bridge PRM, volume 4 part 1, page 84:
+ *
+ * "For Render Target 3D Surfaces:
+ * This field (Render Target View Extent) indicates the extent of the
+ * accessible `R' coordinates minus 1 on the LOD currently being
+ * rendered to.
+ *
+ * For Render Target 1D and 2D Surfaces:
+ * This field must be set to the same value as the Depth field.
+ *
+ * For Other Surfaces:
+ * This field is ignored."
+ */
+ *rt_view_extent = info->slice_count - 1;
+
+ return true;
+}
+
+/*
+ * Validate the level range and compute the Surface Min LOD and
+ * MIP Count / LOD field values for SURFACE_STATE.
+ */
+static bool
+surface_get_gen6_image_levels(const struct ilo_dev *dev,
+ const struct ilo_state_surface_image_info *info,
+ uint8_t *min_lod, uint8_t *mip_count)
+{
+ uint8_t max_level = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 15 : 14;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ assert(info->img->level_count <= max_level);
+ max_level = info->img->level_count;
+
+ if (!info->level_count ||
+ info->level_base + info->level_count > max_level) {
+ ilo_warn("invalid level range\n");
+ return false;
+ }
+
+ /*
+ * From the Sandy Bridge PRM, volume 4 part 1, page 79:
+ *
+ * "For Sampling Engine Surfaces:
+ * This field (MIP Count / LOD) indicates the number of MIP levels
+ * allowed to be accessed starting at Surface Min LOD, which must be
+ * less than or equal to the number of MIP levels actually stored in
+ * memory for this surface.
+ *
+ * Force the mip map access to be between the mipmap specified by the
+ * integer bits of the Min LOD and the ceiling of the value specified
+ * here.
+ *
+ * For Render Target Surfaces:
+ * This field defines the MIP level that is currently being rendered
+ * into. This is the absolute MIP level on the surface and is not
+ * relative to the Surface Min LOD field, which is ignored for render
+ * target surfaces.
+ *
+ * For Other Surfaces:
+ * This field is reserved : MBZ"
+ *
+ * From the Sandy Bridge PRM, volume 4 part 1, page 83:
+ *
+ * "For Sampling Engine Surfaces:
+ *
+ * This field (Surface Min LOD) indicates the most detailed LOD that
+ * can be accessed as part of this surface. This field is added to
+ * the delivered LOD (sample_l, ld, or resinfo message types) before
+ * it is used to address the surface.
+ *
+ * For Other Surfaces:
+ * This field is ignored."
+ *
+ * On Gen7+, typed surfaces are treated like sampling engine surfaces.
+ */
+ if (info->access == ILO_STATE_SURFACE_ACCESS_DP_RENDER) {
+ /* RTs render to a single absolute LOD; Surface Min LOD is ignored */
+ assert(info->level_count == 1);
+
+ *min_lod = 0;
+ *mip_count = info->level_base;
+ } else {
+ *min_lod = info->level_base;
+ *mip_count = info->level_count - 1;
+ }
+
+ return true;
+}
+
+/*
+ * Map the image sample count to a MULTISAMPLECOUNT enum and check that the
+ * count is supported by this GPU generation.
+ */
+static bool
+surface_get_gen6_image_sample_count(const struct ilo_dev *dev,
+ const struct ilo_state_surface_image_info *info,
+ enum gen_sample_count *sample_count)
+{
+ int min_gen;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ switch (info->img->sample_count) {
+ case 1:
+ *sample_count = GEN6_NUMSAMPLES_1;
+ min_gen = ILO_GEN(6);
+ break;
+ case 2:
+ /* 2x MSAA is new on Gen8 */
+ *sample_count = GEN8_NUMSAMPLES_2;
+ min_gen = ILO_GEN(8);
+ break;
+ case 4:
+ *sample_count = GEN6_NUMSAMPLES_4;
+ min_gen = ILO_GEN(6);
+ break;
+ case 8:
+ *sample_count = GEN7_NUMSAMPLES_8;
+ min_gen = ILO_GEN(7);
+ break;
+ case 16:
+ *sample_count = GEN8_NUMSAMPLES_16;
+ min_gen = ILO_GEN(8);
+ break;
+ default:
+ assert(!"invalid sample count");
+ /*
+ * also initialize min_gen: in NDEBUG builds the assert above is
+ * compiled out and the check below would otherwise read an
+ * uninitialized variable (undefined behavior)
+ */
+ *sample_count = GEN6_NUMSAMPLES_1;
+ min_gen = ILO_GEN(6);
+ break;
+ }
+
+ assert(ilo_dev_gen(dev) >= min_gen);
+
+ return true;
+}
+
+/*
+ * Translate the image's horizontal/vertical alignments (align_i/align_j)
+ * into the gen-specific HALIGN/VALIGN SURFACE_STATE bits.
+ */
+static bool
+surface_get_gen6_image_alignments(const struct ilo_dev *dev,
+ const struct ilo_state_surface_image_info *info,
+ uint32_t *alignments)
+{
+ uint32_t a = 0;
+ bool err = false;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
+ /* Gen8 supports HALIGN/VALIGN of 4, 8, and 16 */
+ switch (info->img->align_i) {
+ case 4:
+ a |= GEN8_SURFACE_DW0_HALIGN_4;
+ break;
+ case 8:
+ a |= GEN8_SURFACE_DW0_HALIGN_8;
+ break;
+ case 16:
+ a |= GEN8_SURFACE_DW0_HALIGN_16;
+ break;
+ default:
+ err = true;
+ break;
+ }
+
+ switch (info->img->align_j) {
+ case 4:
+ a |= GEN7_SURFACE_DW0_VALIGN_4;
+ break;
+ case 8:
+ a |= GEN8_SURFACE_DW0_VALIGN_8;
+ break;
+ case 16:
+ a |= GEN8_SURFACE_DW0_VALIGN_16;
+ break;
+ default:
+ err = true;
+ break;
+ }
+ } else if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+ /* Gen7/7.5: HALIGN of 4 or 8, VALIGN of 2 or 4 */
+ switch (info->img->align_i) {
+ case 4:
+ a |= GEN7_SURFACE_DW0_HALIGN_4;
+ break;
+ case 8:
+ a |= GEN7_SURFACE_DW0_HALIGN_8;
+ break;
+ default:
+ err = true;
+ break;
+ }
+
+ switch (info->img->align_j) {
+ case 2:
+ a |= GEN7_SURFACE_DW0_VALIGN_2;
+ break;
+ case 4:
+ a |= GEN7_SURFACE_DW0_VALIGN_4;
+ break;
+ default:
+ err = true;
+ break;
+ }
+ } else {
+ /* Gen6 has no HALIGN field; only HALIGN_4 layouts are expressible */
+ if (info->img->align_i != 4)
+ err = true;
+
+ switch (info->img->align_j) {
+ case 2:
+ a |= GEN6_SURFACE_DW5_VALIGN_2;
+ break;
+ case 4:
+ a |= GEN6_SURFACE_DW5_VALIGN_4;
+ break;
+ default:
+ err = true;
+ break;
+ }
+ }
+
+ if (err)
+ assert(!"invalid HALIGN or VALIGN");
+
+ *alignments = a;
+
+ return true;
+}
+
+/*
+ * Fill in the Gen6 SURFACE_STATE (6 dwords) for an image surface.
+ * Returns false when the image/view combination is invalid.
+ */
+static bool
+surface_set_gen6_image_SURFACE_STATE(struct ilo_state_surface *surf,
+ const struct ilo_dev *dev,
+ const struct ilo_state_surface_image_info *info)
+{
+ uint16_t width, height, depth, array_base, view_extent;
+ uint8_t min_lod, mip_count;
+ enum gen_sample_count sample_count;
+ uint32_t alignments;
+ enum gen_surface_type type;
+ uint32_t dw0, dw2, dw3, dw4, dw5;
+
+ ILO_DEV_ASSERT(dev, 6, 6);
+
+ /* validate the view and derive every field value up front */
+ if (!surface_validate_gen6_image(dev, info) ||
+ !surface_get_gen6_image_extent(dev, info, &width, &height) ||
+ !surface_get_gen6_image_slices(dev, info, &depth, &array_base,
+ &view_extent) ||
+ !surface_get_gen6_image_levels(dev, info, &min_lod, &mip_count) ||
+ !surface_get_gen6_image_sample_count(dev, info, &sample_count) ||
+ !surface_get_gen6_image_alignments(dev, info, &alignments))
+ return false;
+
+ /* no ARYSPC_LOD0 */
+ assert(info->img->walk != ILO_IMAGE_WALK_LOD);
+ /* no UMS/CMS */
+ if (info->img->sample_count > 1)
+ assert(info->img->interleaved_samples);
+
+ /* cube maps are laid out as 2D but reported as SURFTYPE_CUBE */
+ type = (info->is_cube_map) ? GEN6_SURFTYPE_CUBE :
+ get_gen6_surface_type(dev, info->img);
+
+ dw0 = type << GEN6_SURFACE_DW0_TYPE__SHIFT |
+ info->format << GEN6_SURFACE_DW0_FORMAT__SHIFT |
+ GEN6_SURFACE_DW0_MIPLAYOUT_BELOW;
+
+ /*
+ * From the Sandy Bridge PRM, volume 4 part 1, page 74:
+ *
+ * "CUBE_AVERAGE may only be selected if all of the Cube Face Enable
+ * fields are equal to one."
+ *
+ * From the Sandy Bridge PRM, volume 4 part 1, page 75-76:
+ *
+ * "For SURFTYPE_CUBE Surfaces accessed via the Sampling Engine:
+ * Bits 5:0 of this field (Cube Face Enables) enable the individual
+ * faces of a cube map. Enabling a face indicates that the face is
+ * present in the cube map, while disabling it indicates that that
+ * face is represented by the texture map's border color. Refer to
+ * Memory Data Formats for the correlation between faces and the cube
+ * map memory layout. Note that storage for disabled faces must be
+ * provided.
+ *
+ * For other surfaces:
+ * This field is reserved : MBZ"
+ *
+ * "When TEXCOORDMODE_CLAMP is used when accessing a cube map, this
+ * field must be programmed to 111111b (all faces enabled)."
+ */
+ if (info->is_cube_map &&
+ info->access == ILO_STATE_SURFACE_ACCESS_SAMPLER) {
+ dw0 |= GEN6_SURFACE_DW0_CUBE_MAP_CORNER_MODE_AVERAGE |
+ GEN6_SURFACE_DW0_CUBE_FACE_ENABLES__MASK;
+ }
+
+ dw2 = height << GEN6_SURFACE_DW2_HEIGHT__SHIFT |
+ width << GEN6_SURFACE_DW2_WIDTH__SHIFT |
+ mip_count << GEN6_SURFACE_DW2_MIP_COUNT_LOD__SHIFT;
+
+ /* the Pitch field holds (bo_stride - 1) */
+ dw3 = depth << GEN6_SURFACE_DW3_DEPTH__SHIFT |
+ (info->img->bo_stride - 1) << GEN6_SURFACE_DW3_PITCH__SHIFT |
+ info->img->tiling << GEN6_SURFACE_DW3_TILING__SHIFT;
+
+ dw4 = min_lod << GEN6_SURFACE_DW4_MIN_LOD__SHIFT |
+ array_base << GEN6_SURFACE_DW4_MIN_ARRAY_ELEMENT__SHIFT |
+ view_extent << GEN6_SURFACE_DW4_RT_VIEW_EXTENT__SHIFT |
+ sample_count << GEN6_SURFACE_DW4_MULTISAMPLECOUNT__SHIFT;
+
+ dw5 = alignments;
+
+ STATIC_ASSERT(ARRAY_SIZE(surf->surface) >= 6);
+ surf->surface[0] = dw0;
+ surf->surface[1] = 0;
+ surf->surface[2] = dw2;
+ surf->surface[3] = dw3;
+ surf->surface[4] = dw4;
+ surf->surface[5] = dw5;
+
+ surf->type = type;
+ surf->min_lod = min_lod;
+ surf->mip_count = mip_count;
+
+ return true;
+}
+
+/*
+ * Fill in the Gen7/Gen7.5/Gen8 SURFACE_STATE for an image surface.
+ * Returns false when the image/view combination is invalid.
+ */
+static bool
+surface_set_gen7_image_SURFACE_STATE(struct ilo_state_surface *surf,
+ const struct ilo_dev *dev,
+ const struct ilo_state_surface_image_info *info)
+{
+ uint16_t width, height, depth, array_base, view_extent;
+ uint8_t min_lod, mip_count;
+ uint32_t alignments;
+ enum gen_sample_count sample_count;
+ enum gen_surface_type type;
+ uint32_t dw0, dw1, dw2, dw3, dw4, dw5, dw7;
+
+ ILO_DEV_ASSERT(dev, 7, 8);
+
+ /* validate the view and derive every field value up front */
+ if (!surface_validate_gen6_image(dev, info) ||
+ !surface_get_gen6_image_extent(dev, info, &width, &height) ||
+ !surface_get_gen6_image_slices(dev, info, &depth, &array_base,
+ &view_extent) ||
+ !surface_get_gen6_image_levels(dev, info, &min_lod, &mip_count) ||
+ !surface_get_gen6_image_sample_count(dev, info, &sample_count) ||
+ !surface_get_gen6_image_alignments(dev, info, &alignments))
+ return false;
+
+ /* cube maps are laid out as 2D but reported as SURFTYPE_CUBE */
+ type = (info->is_cube_map) ? GEN6_SURFTYPE_CUBE :
+ get_gen6_surface_type(dev, info->img);
+
+ dw0 = type << GEN7_SURFACE_DW0_TYPE__SHIFT |
+ info->format << GEN7_SURFACE_DW0_FORMAT__SHIFT |
+ alignments;
+
+ if (info->is_array)
+ dw0 |= GEN7_SURFACE_DW0_IS_ARRAY;
+
+ if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
+ dw0 |= info->img->tiling << GEN8_SURFACE_DW0_TILING__SHIFT;
+ } else {
+ dw0 |= info->img->tiling << GEN7_SURFACE_DW0_TILING__SHIFT;
+
+ /* Gen7 can express the LOD-major array layout; Gen8 cannot */
+ if (info->img->walk == ILO_IMAGE_WALK_LOD)
+ dw0 |= GEN7_SURFACE_DW0_ARYSPC_LOD0;
+ else
+ dw0 |= GEN7_SURFACE_DW0_ARYSPC_FULL;
+ }
+
+ /*
+ * From the Ivy Bridge PRM, volume 4 part 1, page 67:
+ *
+ * "For SURFTYPE_CUBE Surfaces accessed via the Sampling Engine: Bits
+ * 5:0 of this field (Cube Face Enables) enable the individual faces
+ * of a cube map. Enabling a face indicates that the face is present
+ * in the cube map, while disabling it indicates that that face is
+ * represented by the texture map's border color. Refer to Memory Data
+ * Formats for the correlation between faces and the cube map memory
+ * layout. Note that storage for disabled faces must be provided. For
+ * other surfaces this field is reserved and MBZ."
+ *
+ * "When TEXCOORDMODE_CLAMP is used when accessing a cube map, this
+ * field must be programmed to 111111b (all faces enabled). This field
+ * is ignored unless the Surface Type is SURFTYPE_CUBE."
+ */
+ if (info->is_cube_map &&
+ info->access == ILO_STATE_SURFACE_ACCESS_SAMPLER)
+ dw0 |= GEN7_SURFACE_DW0_CUBE_FACE_ENABLES__MASK;
+
+ dw1 = 0;
+ if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
+ /* the QPitch field is in units of 4 rows */
+ assert(info->img->walk_layer_height % 4 == 0);
+ dw1 |= info->img->walk_layer_height / 4 <<
+ GEN8_SURFACE_DW1_QPITCH__SHIFT;
+ }
+
+ dw2 = height << GEN7_SURFACE_DW2_HEIGHT__SHIFT |
+ width << GEN7_SURFACE_DW2_WIDTH__SHIFT;
+
+ /* the Pitch field holds (bo_stride - 1) */
+ dw3 = depth << GEN7_SURFACE_DW3_DEPTH__SHIFT |
+ (info->img->bo_stride - 1) << GEN7_SURFACE_DW3_PITCH__SHIFT;
+
+ if (ilo_dev_gen(dev) == ILO_GEN(7.5))
+ dw3 |= 0 << GEN75_SURFACE_DW3_INTEGER_SURFACE_FORMAT__SHIFT;
+
+ dw4 = array_base << GEN7_SURFACE_DW4_MIN_ARRAY_ELEMENT__SHIFT |
+ view_extent << GEN7_SURFACE_DW4_RT_VIEW_EXTENT__SHIFT |
+ sample_count << GEN7_SURFACE_DW4_MULTISAMPLECOUNT__SHIFT;
+
+ /*
+ * MSFMT_MSS means the samples are not interleaved and MSFMT_DEPTH_STENCIL
+ * means the samples are interleaved. The layouts are the same when the
+ * number of samples is 1.
+ */
+ if (info->img->interleaved_samples && info->img->sample_count > 1) {
+ assert(info->access != ILO_STATE_SURFACE_ACCESS_DP_RENDER);
+ dw4 |= GEN7_SURFACE_DW4_MSFMT_DEPTH_STENCIL;
+ } else {
+ dw4 |= GEN7_SURFACE_DW4_MSFMT_MSS;
+ }
+
+ dw5 = min_lod << GEN7_SURFACE_DW5_MIN_LOD__SHIFT |
+ mip_count << GEN7_SURFACE_DW5_MIP_COUNT_LOD__SHIFT;
+
+ dw7 = 0;
+ if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) {
+ /* identity channel selects; ilo_state_surface_set_scs() can override */
+ dw7 |= GEN_SHIFT32(GEN75_SCS_RED, GEN75_SURFACE_DW7_SCS_R) |
+ GEN_SHIFT32(GEN75_SCS_GREEN, GEN75_SURFACE_DW7_SCS_G) |
+ GEN_SHIFT32(GEN75_SCS_BLUE, GEN75_SURFACE_DW7_SCS_B) |
+ GEN_SHIFT32(GEN75_SCS_ALPHA, GEN75_SURFACE_DW7_SCS_A);
+ }
+
+ STATIC_ASSERT(ARRAY_SIZE(surf->surface) >= 13);
+ surf->surface[0] = dw0;
+ surf->surface[1] = dw1;
+ surf->surface[2] = dw2;
+ surf->surface[3] = dw3;
+ surf->surface[4] = dw4;
+ surf->surface[5] = dw5;
+ surf->surface[6] = 0;
+ surf->surface[7] = dw7;
+ if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
+ surf->surface[8] = 0;
+ surf->surface[9] = 0;
+ surf->surface[10] = 0;
+ surf->surface[11] = 0;
+ surf->surface[12] = 0;
+ }
+
+ surf->type = type;
+ surf->min_lod = min_lod;
+ surf->mip_count = mip_count;
+
+ return true;
+}
+
+/* Initialize surf as a null surface.  surf must be zero-initialized. */
+bool
+ilo_state_surface_init_for_null(struct ilo_state_surface *surf,
+ const struct ilo_dev *dev)
+{
+ bool ret = true;
+
+ assert(ilo_is_zeroed(surf, sizeof(*surf)));
+
+ if (ilo_dev_gen(dev) >= ILO_GEN(7))
+ ret &= surface_set_gen7_null_SURFACE_STATE(surf, dev);
+ else
+ ret &= surface_set_gen6_null_SURFACE_STATE(surf, dev);
+
+ surf->type = GEN6_SURFTYPE_NULL;
+ surf->readonly = true;
+
+ assert(ret);
+
+ return ret;
+}
+
+/* Initialize surf for a buffer view.  surf must be zero-initialized. */
+bool
+ilo_state_surface_init_for_buffer(struct ilo_state_surface *surf,
+ const struct ilo_dev *dev,
+ const struct ilo_state_surface_buffer_info *info)
+{
+ bool ret = true;
+
+ assert(ilo_is_zeroed(surf, sizeof(*surf)));
+
+ if (ilo_dev_gen(dev) >= ILO_GEN(7))
+ ret &= surface_set_gen7_buffer_SURFACE_STATE(surf, dev, info);
+ else
+ ret &= surface_set_gen6_buffer_SURFACE_STATE(surf, dev, info);
+
+ surf->readonly = info->readonly;
+
+ assert(ret);
+
+ return ret;
+}
+
+/* Initialize surf for an image view.  surf must be zero-initialized. */
+bool
+ilo_state_surface_init_for_image(struct ilo_state_surface *surf,
+ const struct ilo_dev *dev,
+ const struct ilo_state_surface_image_info *info)
+{
+ bool ret = true;
+
+ assert(ilo_is_zeroed(surf, sizeof(*surf)));
+
+ if (ilo_dev_gen(dev) >= ILO_GEN(7))
+ ret &= surface_set_gen7_image_SURFACE_STATE(surf, dev, info);
+ else
+ ret &= surface_set_gen6_image_SURFACE_STATE(surf, dev, info);
+
+ surf->is_integer = info->is_integer;
+ surf->readonly = info->readonly;
+ surf->scanout = info->img->scanout;
+
+ assert(ret);
+
+ return ret;
+}
+
+/*
+ * Replace the shader channel selects of an already-initialized surface.
+ * Gen7.5+ only, since the SCS fields live in DW7 of that layout.
+ */
+bool
+ilo_state_surface_set_scs(struct ilo_state_surface *surf,
+ const struct ilo_dev *dev,
+ enum gen_surface_scs rgba[4])
+{
+ const uint32_t scs = GEN_SHIFT32(rgba[0], GEN75_SURFACE_DW7_SCS_R) |
+ GEN_SHIFT32(rgba[1], GEN75_SURFACE_DW7_SCS_G) |
+ GEN_SHIFT32(rgba[2], GEN75_SURFACE_DW7_SCS_B) |
+ GEN_SHIFT32(rgba[3], GEN75_SURFACE_DW7_SCS_A);
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ assert(ilo_dev_gen(dev) >= ILO_GEN(7.5));
+
+ /* clear the previous selects before or-ing in the new ones */
+ surf->surface[7] = (surf->surface[7] & ~GEN75_SURFACE_DW7_SCS__MASK) | scs;
+
+ return true;
+}
diff --git a/src/gallium/drivers/ilo/core/ilo_state_surface.h b/src/gallium/drivers/ilo/core/ilo_state_surface.h
new file mode 100644
index 00000000000..9c025428d50
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_surface.h
@@ -0,0 +1,121 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Chia-I Wu <[email protected]>
+ */
+
+#ifndef ILO_STATE_SURFACE_H
+#define ILO_STATE_SURFACE_H
+
+#include "genhw/genhw.h"
+#include "intel_winsys.h"
+
+#include "ilo_core.h"
+#include "ilo_dev.h"
+
+struct ilo_buffer;
+struct ilo_image;
+
+/* How the surface will be accessed; affects validation and field layout. */
+enum ilo_state_surface_access {
+ ILO_STATE_SURFACE_ACCESS_SAMPLER, /* sampling engine surfaces */
+ ILO_STATE_SURFACE_ACCESS_DP_RENDER, /* render target surfaces */
+ ILO_STATE_SURFACE_ACCESS_DP_TYPED, /* typed surfaces */
+ ILO_STATE_SURFACE_ACCESS_DP_UNTYPED, /* untyped surfaces */
+ ILO_STATE_SURFACE_ACCESS_DP_DATA, /* data-port data cache? TODO confirm */
+ ILO_STATE_SURFACE_ACCESS_DP_SVB, /* streamed vertex buffer? TODO confirm */
+};
+
+/* Parameters for initializing a buffer SURFACE_STATE. */
+struct ilo_state_surface_buffer_info {
+ const struct ilo_buffer *buf;
+
+ enum ilo_state_surface_access access;
+
+ enum gen_surface_format format;
+ /* bytes per format element - presumably; confirm against callers */
+ uint8_t format_size;
+
+ bool readonly;
+ /* byte stride of one structure - presumably; confirm against callers */
+ uint16_t struct_size;
+
+ /* view range in bytes into buf */
+ uint32_t offset;
+ uint32_t size;
+};
+
+/* Parameters for initializing an image SURFACE_STATE. */
+struct ilo_state_surface_image_info {
+ const struct ilo_image *img;
+
+ enum ilo_state_surface_access access;
+
+ enum gen_surface_format format;
+ /* recorded on the resulting surface for later queries */
+ bool is_integer;
+
+ bool readonly;
+ bool is_cube_map;
+ /* sets the Surface Array bit on Gen7+ */
+ bool is_array;
+
+ /* mip level and slice ranges of the view */
+ uint8_t level_base;
+ uint8_t level_count;
+ uint16_t slice_base;
+ uint16_t slice_count;
+};
+
+/* A constructed SURFACE_STATE plus metadata queried by users. */
+struct ilo_state_surface {
+ /* raw dwords; 6 used on Gen6, 8 on Gen7/7.5, 13 on Gen8 */
+ uint32_t surface[13];
+
+ enum gen_surface_type type;
+ uint8_t min_lod;
+ uint8_t mip_count;
+ bool is_integer;
+
+ bool readonly;
+ bool scanout;
+
+ /* managed by users */
+ struct intel_bo *bo;
+};
+
+/* Check that format is legal for the given access method on dev. */
+bool
+ilo_state_surface_valid_format(const struct ilo_dev *dev,
+ enum ilo_state_surface_access access,
+ enum gen_surface_format format);
+
+/* Initialize surf as a null surface. */
+bool
+ilo_state_surface_init_for_null(struct ilo_state_surface *surf,
+ const struct ilo_dev *dev);
+
+/* Initialize surf for a buffer view described by info. */
+bool
+ilo_state_surface_init_for_buffer(struct ilo_state_surface *surf,
+ const struct ilo_dev *dev,
+ const struct ilo_state_surface_buffer_info *info);
+
+/* Initialize surf for an image view described by info. */
+bool
+ilo_state_surface_init_for_image(struct ilo_state_surface *surf,
+ const struct ilo_dev *dev,
+ const struct ilo_state_surface_image_info *info);
+
+/* Replace the shader channel selects of surf (Gen7.5+ only). */
+bool
+ilo_state_surface_set_scs(struct ilo_state_surface *surf,
+ const struct ilo_dev *dev,
+ enum gen_surface_scs rgba[4]);
+
+#endif /* ILO_STATE_SURFACE_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_state_surface_format.c b/src/gallium/drivers/ilo/core/ilo_state_surface_format.c
new file mode 100644
index 00000000000..a40c1b84d17
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_surface_format.c
@@ -0,0 +1,351 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2012-2013 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Chia-I Wu <[email protected]>
+ */
+
+#include "genhw/genhw.h"
+#include "ilo_state_surface.h"
+
+/*
+ * Return true if "format" can be sampled from on the current gen.  "access"
+ * is unused here; the caller dispatches to this function only for
+ * ILO_STATE_SURFACE_ACCESS_SAMPLER.
+ */
+static bool
+surface_valid_sampler_format(const struct ilo_dev *dev,
+ enum ilo_state_surface_access access,
+ enum gen_surface_format format)
+{
+ /*
+ * This table is based on:
+ *
+ * - the Sandy Bridge PRM, volume 4 part 1, page 88-97
+ * - the Ivy Bridge PRM, volume 4 part 1, page 84-87
+ */
+ /*
+ * Each column stores the first gen supporting the capability, in ILO_GEN()
+ * fixed-point form (e.g. 4.5); zero means never supported.  Only the
+ * "sampling" column is consulted below; the others are kept for reference.
+ */
+ static const struct sampler_cap {
+ int sampling;
+ int filtering;
+ int shadow_map;
+ int chroma_key;
+ } caps[] = {
+#define CAP(sampling, filtering, shadow_map, chroma_key) \
+ { ILO_GEN(sampling), ILO_GEN(filtering), ILO_GEN(shadow_map), ILO_GEN(chroma_key) }
+ [GEN6_FORMAT_R32G32B32A32_FLOAT] = CAP( 1, 5, 0, 0),
+ [GEN6_FORMAT_R32G32B32A32_SINT] = CAP( 1, 0, 0, 0),
+ [GEN6_FORMAT_R32G32B32A32_UINT] = CAP( 1, 0, 0, 0),
+ [GEN6_FORMAT_R32G32B32X32_FLOAT] = CAP( 1, 5, 0, 0),
+ [GEN6_FORMAT_R32G32B32_FLOAT] = CAP( 1, 5, 0, 0),
+ [GEN6_FORMAT_R32G32B32_SINT] = CAP( 1, 0, 0, 0),
+ [GEN6_FORMAT_R32G32B32_UINT] = CAP( 1, 0, 0, 0),
+ [GEN6_FORMAT_R16G16B16A16_UNORM] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_R16G16B16A16_SNORM] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_R16G16B16A16_SINT] = CAP( 1, 0, 0, 0),
+ [GEN6_FORMAT_R16G16B16A16_UINT] = CAP( 1, 0, 0, 0),
+ [GEN6_FORMAT_R16G16B16A16_FLOAT] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_R32G32_FLOAT] = CAP( 1, 5, 0, 0),
+ [GEN6_FORMAT_R32G32_SINT] = CAP( 1, 0, 0, 0),
+ [GEN6_FORMAT_R32G32_UINT] = CAP( 1, 0, 0, 0),
+ [GEN6_FORMAT_R32_FLOAT_X8X24_TYPELESS] = CAP( 1, 5, 1, 0),
+ [GEN6_FORMAT_X32_TYPELESS_G8X24_UINT] = CAP( 1, 0, 0, 0),
+ [GEN6_FORMAT_L32A32_FLOAT] = CAP( 1, 5, 0, 0),
+ [GEN6_FORMAT_R16G16B16X16_UNORM] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_R16G16B16X16_FLOAT] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_A32X32_FLOAT] = CAP( 1, 5, 0, 0),
+ [GEN6_FORMAT_L32X32_FLOAT] = CAP( 1, 5, 0, 0),
+ [GEN6_FORMAT_I32X32_FLOAT] = CAP( 1, 5, 0, 0),
+ [GEN6_FORMAT_B8G8R8A8_UNORM] = CAP( 1, 1, 0, 1),
+ [GEN6_FORMAT_B8G8R8A8_UNORM_SRGB] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_R10G10B10A2_UNORM] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_R10G10B10A2_UNORM_SRGB] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_R10G10B10A2_UINT] = CAP( 1, 0, 0, 0),
+ [GEN6_FORMAT_R10G10B10_SNORM_A2_UNORM] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_R8G8B8A8_UNORM] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_R8G8B8A8_UNORM_SRGB] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_R8G8B8A8_SNORM] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_R8G8B8A8_SINT] = CAP( 1, 0, 0, 0),
+ [GEN6_FORMAT_R8G8B8A8_UINT] = CAP( 1, 0, 0, 0),
+ [GEN6_FORMAT_R16G16_UNORM] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_R16G16_SNORM] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_R16G16_SINT] = CAP( 1, 0, 0, 0),
+ [GEN6_FORMAT_R16G16_UINT] = CAP( 1, 0, 0, 0),
+ [GEN6_FORMAT_R16G16_FLOAT] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_B10G10R10A2_UNORM] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_B10G10R10A2_UNORM_SRGB] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_R11G11B10_FLOAT] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_R32_SINT] = CAP( 1, 0, 0, 0),
+ [GEN6_FORMAT_R32_UINT] = CAP( 1, 0, 0, 0),
+ [GEN6_FORMAT_R32_FLOAT] = CAP( 1, 5, 1, 0),
+ [GEN6_FORMAT_R24_UNORM_X8_TYPELESS] = CAP( 1, 5, 1, 0),
+ [GEN6_FORMAT_X24_TYPELESS_G8_UINT] = CAP( 1, 0, 0, 0),
+ [GEN6_FORMAT_L16A16_UNORM] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_I24X8_UNORM] = CAP( 1, 5, 1, 0),
+ [GEN6_FORMAT_L24X8_UNORM] = CAP( 1, 5, 1, 0),
+ [GEN6_FORMAT_A24X8_UNORM] = CAP( 1, 5, 1, 0),
+ [GEN6_FORMAT_I32_FLOAT] = CAP( 1, 5, 1, 0),
+ [GEN6_FORMAT_L32_FLOAT] = CAP( 1, 5, 1, 0),
+ [GEN6_FORMAT_A32_FLOAT] = CAP( 1, 5, 1, 0),
+ [GEN6_FORMAT_B8G8R8X8_UNORM] = CAP( 1, 1, 0, 1),
+ [GEN6_FORMAT_B8G8R8X8_UNORM_SRGB] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_R8G8B8X8_UNORM] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_R8G8B8X8_UNORM_SRGB] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_R9G9B9E5_SHAREDEXP] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_B10G10R10X2_UNORM] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_L16A16_FLOAT] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_B5G6R5_UNORM] = CAP( 1, 1, 0, 1),
+ [GEN6_FORMAT_B5G6R5_UNORM_SRGB] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_B5G5R5A1_UNORM] = CAP( 1, 1, 0, 1),
+ [GEN6_FORMAT_B5G5R5A1_UNORM_SRGB] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_B4G4R4A4_UNORM] = CAP( 1, 1, 0, 1),
+ [GEN6_FORMAT_B4G4R4A4_UNORM_SRGB] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_R8G8_UNORM] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_R8G8_SNORM] = CAP( 1, 1, 0, 1),
+ [GEN6_FORMAT_R8G8_SINT] = CAP( 1, 0, 0, 0),
+ [GEN6_FORMAT_R8G8_UINT] = CAP( 1, 0, 0, 0),
+ [GEN6_FORMAT_R16_UNORM] = CAP( 1, 1, 1, 0),
+ [GEN6_FORMAT_R16_SNORM] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_R16_SINT] = CAP( 1, 0, 0, 0),
+ [GEN6_FORMAT_R16_UINT] = CAP( 1, 0, 0, 0),
+ [GEN6_FORMAT_R16_FLOAT] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_A8P8_UNORM_PALETTE0] = CAP( 5, 5, 0, 0),
+ [GEN6_FORMAT_A8P8_UNORM_PALETTE1] = CAP( 5, 5, 0, 0),
+ [GEN6_FORMAT_I16_UNORM] = CAP( 1, 1, 1, 0),
+ [GEN6_FORMAT_L16_UNORM] = CAP( 1, 1, 1, 0),
+ [GEN6_FORMAT_A16_UNORM] = CAP( 1, 1, 1, 0),
+ [GEN6_FORMAT_L8A8_UNORM] = CAP( 1, 1, 0, 1),
+ [GEN6_FORMAT_I16_FLOAT] = CAP( 1, 1, 1, 0),
+ [GEN6_FORMAT_L16_FLOAT] = CAP( 1, 1, 1, 0),
+ [GEN6_FORMAT_A16_FLOAT] = CAP( 1, 1, 1, 0),
+ [GEN6_FORMAT_L8A8_UNORM_SRGB] = CAP(4.5, 4.5, 0, 0),
+ [GEN6_FORMAT_R5G5_SNORM_B6_UNORM] = CAP( 1, 1, 0, 1),
+ [GEN6_FORMAT_P8A8_UNORM_PALETTE0] = CAP( 5, 5, 0, 0),
+ [GEN6_FORMAT_P8A8_UNORM_PALETTE1] = CAP( 5, 5, 0, 0),
+ [GEN6_FORMAT_R8_UNORM] = CAP( 1, 1, 0, 4.5),
+ [GEN6_FORMAT_R8_SNORM] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_R8_SINT] = CAP( 1, 0, 0, 0),
+ [GEN6_FORMAT_R8_UINT] = CAP( 1, 0, 0, 0),
+ [GEN6_FORMAT_A8_UNORM] = CAP( 1, 1, 0, 1),
+ [GEN6_FORMAT_I8_UNORM] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_L8_UNORM] = CAP( 1, 1, 0, 1),
+ [GEN6_FORMAT_P4A4_UNORM_PALETTE0] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_A4P4_UNORM_PALETTE0] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_P8_UNORM_PALETTE0] = CAP(4.5, 4.5, 0, 0),
+ [GEN6_FORMAT_L8_UNORM_SRGB] = CAP(4.5, 4.5, 0, 0),
+ [GEN6_FORMAT_P8_UNORM_PALETTE1] = CAP(4.5, 4.5, 0, 0),
+ [GEN6_FORMAT_P4A4_UNORM_PALETTE1] = CAP(4.5, 4.5, 0, 0),
+ [GEN6_FORMAT_A4P4_UNORM_PALETTE1] = CAP(4.5, 4.5, 0, 0),
+ [GEN6_FORMAT_DXT1_RGB_SRGB] = CAP(4.5, 4.5, 0, 0),
+ [GEN6_FORMAT_R1_UNORM] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_YCRCB_NORMAL] = CAP( 1, 1, 0, 1),
+ [GEN6_FORMAT_YCRCB_SWAPUVY] = CAP( 1, 1, 0, 1),
+ [GEN6_FORMAT_P2_UNORM_PALETTE0] = CAP(4.5, 4.5, 0, 0),
+ [GEN6_FORMAT_P2_UNORM_PALETTE1] = CAP(4.5, 4.5, 0, 0),
+ [GEN6_FORMAT_BC1_UNORM] = CAP( 1, 1, 0, 1),
+ [GEN6_FORMAT_BC2_UNORM] = CAP( 1, 1, 0, 1),
+ [GEN6_FORMAT_BC3_UNORM] = CAP( 1, 1, 0, 1),
+ [GEN6_FORMAT_BC4_UNORM] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_BC5_UNORM] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_BC1_UNORM_SRGB] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_BC2_UNORM_SRGB] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_BC3_UNORM_SRGB] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_MONO8] = CAP( 1, 0, 0, 0),
+ [GEN6_FORMAT_YCRCB_SWAPUV] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_YCRCB_SWAPY] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_DXT1_RGB] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_FXT1] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_BC4_SNORM] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_BC5_SNORM] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_R16G16B16_FLOAT] = CAP( 5, 5, 0, 0),
+ [GEN6_FORMAT_BC6H_SF16] = CAP( 7, 7, 0, 0),
+ [GEN6_FORMAT_BC7_UNORM] = CAP( 7, 7, 0, 0),
+ [GEN6_FORMAT_BC7_UNORM_SRGB] = CAP( 7, 7, 0, 0),
+ [GEN6_FORMAT_BC6H_UF16] = CAP( 7, 7, 0, 0),
+#undef CAP
+ };
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ /* supported when the format has a non-zero entry and the gen is new enough */
+ return (format < ARRAY_SIZE(caps) && caps[format].sampling &&
+ ilo_dev_gen(dev) >= caps[format].sampling);
+}
+
+/*
+ * Return true if "format" is usable by the data port for the given access
+ * type (render target write, typed write, untyped, or data cache).
+ */
+static bool
+surface_valid_dp_format(const struct ilo_dev *dev,
+ enum ilo_state_surface_access access,
+ enum gen_surface_format format)
+{
+ /*
+ * This table is based on:
+ *
+ * - the Sandy Bridge PRM, volume 4 part 1, page 88-97
+ * - the Ivy Bridge PRM, volume 4 part 1, page 172, 252-253, and 277-278
+ * - the Haswell PRM, volume 7, page 262-264
+ */
+ /*
+ * Each column stores the first gen supporting the capability, in ILO_GEN()
+ * fixed-point form; zero means never supported.  The switch below consults
+ * "rt_write" and "typed_write"; the other columns are kept for reference.
+ */
+ static const struct dp_cap {
+ int rt_write;
+ int rt_write_blending;
+ int typed_write;
+ int media_color_processing;
+ } caps[] = {
+#define CAP(rt_write, rt_write_blending, typed_write, media_color_processing) \
+ { ILO_GEN(rt_write), ILO_GEN(rt_write_blending), ILO_GEN(typed_write), ILO_GEN(media_color_processing) }
+ [GEN6_FORMAT_R32G32B32A32_FLOAT] = CAP( 1, 1, 7, 0),
+ [GEN6_FORMAT_R32G32B32A32_SINT] = CAP( 1, 0, 7, 0),
+ [GEN6_FORMAT_R32G32B32A32_UINT] = CAP( 1, 0, 7, 0),
+ [GEN6_FORMAT_R16G16B16A16_UNORM] = CAP( 1, 4.5, 7, 6),
+ [GEN6_FORMAT_R16G16B16A16_SNORM] = CAP( 1, 6, 7, 0),
+ [GEN6_FORMAT_R16G16B16A16_SINT] = CAP( 1, 0, 7, 0),
+ [GEN6_FORMAT_R16G16B16A16_UINT] = CAP( 1, 0, 7, 0),
+ [GEN6_FORMAT_R16G16B16A16_FLOAT] = CAP( 1, 1, 7, 0),
+ [GEN6_FORMAT_R32G32_FLOAT] = CAP( 1, 1, 7, 0),
+ [GEN6_FORMAT_R32G32_SINT] = CAP( 1, 0, 7, 0),
+ [GEN6_FORMAT_R32G32_UINT] = CAP( 1, 0, 7, 0),
+ [GEN6_FORMAT_B8G8R8A8_UNORM] = CAP( 1, 1, 7, 6),
+ [GEN6_FORMAT_B8G8R8A8_UNORM_SRGB] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_R10G10B10A2_UNORM] = CAP( 1, 1, 7, 6),
+ [GEN6_FORMAT_R10G10B10A2_UNORM_SRGB] = CAP( 0, 0, 0, 6),
+ [GEN6_FORMAT_R10G10B10A2_UINT] = CAP( 1, 0, 7, 0),
+ [GEN6_FORMAT_R8G8B8A8_UNORM] = CAP( 1, 1, 7, 6),
+ [GEN6_FORMAT_R8G8B8A8_UNORM_SRGB] = CAP( 1, 1, 0, 6),
+ [GEN6_FORMAT_R8G8B8A8_SNORM] = CAP( 1, 6, 7, 0),
+ [GEN6_FORMAT_R8G8B8A8_SINT] = CAP( 1, 0, 7, 0),
+ [GEN6_FORMAT_R8G8B8A8_UINT] = CAP( 1, 0, 7, 0),
+ [GEN6_FORMAT_R16G16_UNORM] = CAP( 1, 4.5, 7, 0),
+ [GEN6_FORMAT_R16G16_SNORM] = CAP( 1, 6, 7, 0),
+ [GEN6_FORMAT_R16G16_SINT] = CAP( 1, 0, 7, 0),
+ [GEN6_FORMAT_R16G16_UINT] = CAP( 1, 0, 7, 0),
+ [GEN6_FORMAT_R16G16_FLOAT] = CAP( 1, 1, 7, 0),
+ [GEN6_FORMAT_B10G10R10A2_UNORM] = CAP( 1, 1, 7, 6),
+ [GEN6_FORMAT_B10G10R10A2_UNORM_SRGB] = CAP( 1, 1, 0, 6),
+ [GEN6_FORMAT_R11G11B10_FLOAT] = CAP( 1, 1, 7, 0),
+ [GEN6_FORMAT_R32_SINT] = CAP( 1, 0, 7, 0),
+ [GEN6_FORMAT_R32_UINT] = CAP( 1, 0, 7, 0),
+ [GEN6_FORMAT_R32_FLOAT] = CAP( 1, 1, 7, 0),
+ [GEN6_FORMAT_B8G8R8X8_UNORM] = CAP( 0, 0, 0, 6),
+ [GEN6_FORMAT_B5G6R5_UNORM] = CAP( 1, 1, 7, 0),
+ [GEN6_FORMAT_B5G6R5_UNORM_SRGB] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_B5G5R5A1_UNORM] = CAP( 1, 1, 7, 0),
+ [GEN6_FORMAT_B5G5R5A1_UNORM_SRGB] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_B4G4R4A4_UNORM] = CAP( 1, 1, 7, 0),
+ [GEN6_FORMAT_B4G4R4A4_UNORM_SRGB] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_R8G8_UNORM] = CAP( 1, 1, 7, 0),
+ [GEN6_FORMAT_R8G8_SNORM] = CAP( 1, 6, 7, 0),
+ [GEN6_FORMAT_R8G8_SINT] = CAP( 1, 0, 7, 0),
+ [GEN6_FORMAT_R8G8_UINT] = CAP( 1, 0, 7, 0),
+ [GEN6_FORMAT_R16_UNORM] = CAP( 1, 4.5, 7, 7),
+ [GEN6_FORMAT_R16_SNORM] = CAP( 1, 6, 7, 0),
+ [GEN6_FORMAT_R16_SINT] = CAP( 1, 0, 7, 0),
+ [GEN6_FORMAT_R16_UINT] = CAP( 1, 0, 7, 0),
+ [GEN6_FORMAT_R16_FLOAT] = CAP( 1, 1, 7, 0),
+ [GEN6_FORMAT_B5G5R5X1_UNORM] = CAP( 1, 1, 7, 0),
+ [GEN6_FORMAT_B5G5R5X1_UNORM_SRGB] = CAP( 1, 1, 0, 0),
+ [GEN6_FORMAT_R8_UNORM] = CAP( 1, 1, 7, 0),
+ [GEN6_FORMAT_R8_SNORM] = CAP( 1, 6, 7, 0),
+ [GEN6_FORMAT_R8_SINT] = CAP( 1, 0, 7, 0),
+ [GEN6_FORMAT_R8_UINT] = CAP( 1, 0, 7, 0),
+ [GEN6_FORMAT_A8_UNORM] = CAP( 1, 1, 7, 0),
+ [GEN6_FORMAT_YCRCB_NORMAL] = CAP( 1, 0, 0, 6),
+ [GEN6_FORMAT_YCRCB_SWAPUVY] = CAP( 1, 0, 0, 6),
+ [GEN6_FORMAT_YCRCB_SWAPUV] = CAP( 1, 0, 0, 6),
+ [GEN6_FORMAT_YCRCB_SWAPY] = CAP( 1, 0, 0, 6),
+#undef CAP
+ };
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ if (format >= ARRAY_SIZE(caps))
+ return false;
+
+ switch (access) {
+ case ILO_STATE_SURFACE_ACCESS_DP_RENDER:
+ return (caps[format].rt_write &&
+ ilo_dev_gen(dev) >= caps[format].rt_write);
+ case ILO_STATE_SURFACE_ACCESS_DP_TYPED:
+ return (caps[format].typed_write &&
+ ilo_dev_gen(dev) >= caps[format].typed_write);
+ case ILO_STATE_SURFACE_ACCESS_DP_UNTYPED:
+ /* untyped access requires the raw format */
+ return (format == GEN6_FORMAT_RAW);
+ case ILO_STATE_SURFACE_ACCESS_DP_DATA:
+ /* ignored, but can it be raw? */
+ assert(format != GEN6_FORMAT_RAW);
+ return true;
+ default:
+ return false;
+ }
+}
+
+/*
+ * Return true if "format" can be written by stream output (SVB).  Only the
+ * one- to four-component 32-bit formats are streamable.
+ */
+static bool
+surface_valid_svb_format(const struct ilo_dev *dev,
+                         enum gen_surface_format format)
+{
+   /*
+    * This list is based on:
+    *
+    *  - the Sandy Bridge PRM, volume 4 part 1, page 88-97
+    *  - the Ivy Bridge PRM, volume 2 part 1, page 195
+    *  - the Haswell PRM, volume 7, page 535
+    */
+   static const enum gen_surface_format streamable[] = {
+      GEN6_FORMAT_R32G32B32A32_FLOAT,
+      GEN6_FORMAT_R32G32B32A32_SINT,
+      GEN6_FORMAT_R32G32B32A32_UINT,
+      GEN6_FORMAT_R32G32B32_FLOAT,
+      GEN6_FORMAT_R32G32B32_SINT,
+      GEN6_FORMAT_R32G32B32_UINT,
+      GEN6_FORMAT_R32G32_FLOAT,
+      GEN6_FORMAT_R32G32_SINT,
+      GEN6_FORMAT_R32G32_UINT,
+      GEN6_FORMAT_R32_SINT,
+      GEN6_FORMAT_R32_UINT,
+      GEN6_FORMAT_R32_FLOAT,
+   };
+   int i;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   for (i = 0; i < (int) ARRAY_SIZE(streamable); i++) {
+      if (streamable[i] == format)
+         return true;
+   }
+
+   return false;
+}
+
+/*
+ * Return true if "format" is usable for the given access type on the
+ * current gen.
+ */
+bool
+ilo_state_surface_valid_format(const struct ilo_dev *dev,
+                               enum ilo_state_surface_access access,
+                               enum gen_surface_format format)
+{
+   /* dispatch on how the surface will be accessed */
+   switch (access) {
+   case ILO_STATE_SURFACE_ACCESS_SAMPLER:
+      return surface_valid_sampler_format(dev, access, format);
+   case ILO_STATE_SURFACE_ACCESS_DP_RENDER:
+   case ILO_STATE_SURFACE_ACCESS_DP_TYPED:
+   case ILO_STATE_SURFACE_ACCESS_DP_UNTYPED:
+   case ILO_STATE_SURFACE_ACCESS_DP_DATA:
+      return surface_valid_dp_format(dev, access, format);
+   case ILO_STATE_SURFACE_ACCESS_DP_SVB:
+      return surface_valid_svb_format(dev, format);
+   default:
+      return false;
+   }
+}
diff --git a/src/gallium/drivers/ilo/core/ilo_state_urb.c b/src/gallium/drivers/ilo/core/ilo_state_urb.c
new file mode 100644
index 00000000000..cbd150c71c9
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_urb.c
@@ -0,0 +1,769 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2012-2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Chia-I Wu <[email protected]>
+ */
+
+#include "ilo_debug.h"
+#include "ilo_state_urb.h"
+
+/* how the PCB and the URB are divided among the shader stages */
+struct urb_configuration {
+ /* per-stage push constant buffer allocations, in KB */
+ uint8_t vs_pcb_alloc_kb;
+ uint8_t hs_pcb_alloc_kb;
+ uint8_t ds_pcb_alloc_kb;
+ uint8_t gs_pcb_alloc_kb;
+ uint8_t ps_pcb_alloc_kb;
+
+ /* where the URB region starts (past the PCB region), in 8KB units */
+ uint8_t urb_offset_8kb;
+
+ /* per-stage URB allocations, in 8KB units */
+ uint8_t vs_urb_alloc_8kb;
+ uint8_t hs_urb_alloc_8kb;
+ uint8_t ds_urb_alloc_8kb;
+ uint8_t gs_urb_alloc_8kb;
+
+ /* URB entry sizes, in rows (1024-bit rows on Gen6, 512-bit on Gen7+) */
+ uint8_t vs_entry_rows;
+ uint8_t hs_entry_rows;
+ uint8_t ds_entry_rows;
+ uint8_t gs_entry_rows;
+
+ /* URB entry counts per stage */
+ int vs_entry_count;
+ int hs_entry_count;
+ int ds_entry_count;
+ int gs_entry_count;
+};
+
+/*
+ * Divide the push constant buffer space among the stages and compute where
+ * the URB region starts.
+ */
+static void
+urb_alloc_gen7_pcb(const struct ilo_dev *dev,
+                   const struct ilo_state_urb_info *info,
+                   struct urb_configuration *conf)
+{
+   /*
+    * From the Haswell PRM, volume 2b, page 940:
+    *
+    *   "[0,16] (0KB - 16KB) Increments of 1KB DevHSW:GT1, DevHSW:GT2
+    *    [0,32] (0KB - 32KB) Increments of 2KB DevHSW:GT3"
+    */
+   uint8_t increment_kb = 1;
+
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(8) ||
+       (ilo_dev_gen(dev) == ILO_GEN(7.5) && dev->gt == 3))
+      increment_kb = 2;
+
+   /*
+    * Keep the strategy simple as we do not know the workloads and how
+    * expensive it is to change the configuration frequently.
+    */
+   if (info->hs_const_data || info->ds_const_data) {
+      /* tessellation on: every stage gets a share */
+      conf->vs_pcb_alloc_kb = increment_kb * 4;
+      conf->hs_pcb_alloc_kb = increment_kb * 3;
+      conf->ds_pcb_alloc_kb = increment_kb * 3;
+      conf->gs_pcb_alloc_kb = increment_kb * 3;
+      conf->ps_pcb_alloc_kb = increment_kb * 3;
+   } else if (info->gs_const_data) {
+      conf->vs_pcb_alloc_kb = increment_kb * 6;
+      conf->gs_pcb_alloc_kb = increment_kb * 5;
+      conf->ps_pcb_alloc_kb = increment_kb * 5;
+   } else {
+      conf->vs_pcb_alloc_kb = increment_kb * 8;
+      conf->ps_pcb_alloc_kb = increment_kb * 8;
+   }
+
+   /* the URB starts right after the 16KB (or 32KB) PCB region */
+   conf->urb_offset_8kb = increment_kb * 16 / 8;
+}
+
+/* divide the remaining URB space among the enabled stages */
+static void
+urb_alloc_gen6_urb(const struct ilo_dev *dev,
+                   const struct ilo_state_urb_info *info,
+                   struct urb_configuration *conf)
+{
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 1, page 34:
+    *
+    *   "(VS URB Starting Address) Offset from the start of the URB memory
+    *    where VS starts its allocation, specified in multiples of 8 KB."
+    *
+    * Same for other stages.
+    */
+   const int space_avail_8kb = dev->urb_size / 8192 - conf->urb_offset_8kb;
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 173:
+    *
+    *   "Programming Note: If the GS stage is enabled, software must always
+    *    allocate at least one GS URB Entry. This is true even if the GS
+    *    thread never needs to output vertices to the urb, e.g., when only
+    *    performing stream output. This is an artifact of the need to pass
+    *    the GS thread an initial destination URB handle."
+    */
+   const bool force_gs_alloc =
+      (ilo_dev_gen(dev) == ILO_GEN(6) && info->gs_enable);
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   if (info->hs_entry_size || info->ds_entry_size) {
+      /* tessellation on: split evenly among VS/HS/DS/GS */
+      const int quarter_8kb = space_avail_8kb / 4;
+
+      conf->vs_urb_alloc_8kb = quarter_8kb;
+      conf->hs_urb_alloc_8kb = quarter_8kb;
+      conf->ds_urb_alloc_8kb = quarter_8kb;
+      conf->gs_urb_alloc_8kb = quarter_8kb;
+
+      if (space_avail_8kb % 4) {
+         /* give the leftover blocks to VS and GS */
+         assert(space_avail_8kb % 2 == 0);
+         conf->vs_urb_alloc_8kb++;
+         conf->gs_urb_alloc_8kb++;
+      }
+   } else if (info->gs_entry_size || force_gs_alloc) {
+      assert(space_avail_8kb % 2 == 0);
+      conf->vs_urb_alloc_8kb = space_avail_8kb / 2;
+      conf->gs_urb_alloc_8kb = space_avail_8kb / 2;
+   } else {
+      /* VS-only: it gets everything */
+      conf->vs_urb_alloc_8kb = space_avail_8kb;
+   }
+}
+
+/*
+ * Compute the Gen6 VS URB entry size and count.  Returns false when the
+ * entry is too large to be represented.
+ */
+static bool
+urb_init_gen6_vs_entry(const struct ilo_dev *dev,
+                       const struct ilo_state_urb_info *info,
+                       struct urb_configuration *conf)
+{
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 28:
+    *
+    *   "(VS URB Entry Allocation Size)
+    *     Range [0,4] = [1,5] 1024-bit URB rows"
+    *
+    *   "(VS Number of URB Entries)
+    *     Range [24,256] in multiples of 4
+    *           [24, 128] in multiples of 4[DevSNBGT1]"
+    */
+   /* per the quote above, GT1 is limited to 128 entries, not 252 */
+   const int max_entry_count = (dev->gt == 2) ? 256 : 128;
+   const int row_size = 1024 / 8;
+   int row_count, entry_count;
+   int entry_size;
+
+   ILO_DEV_ASSERT(dev, 6, 6);
+
+   /* VE and VS share the same VUE for each vertex */
+   entry_size = info->vs_entry_size;
+   if (entry_size < info->ve_entry_size)
+      entry_size = info->ve_entry_size;
+
+   /* entry size in 1024-bit rows; at least one row even when empty */
+   row_count = (entry_size + row_size - 1) / row_size;
+   if (row_count > 5)
+      return false;
+   else if (!row_count)
+      row_count++;
+
+   entry_count = conf->vs_urb_alloc_8kb * 8192 / (row_size * row_count);
+   if (entry_count > max_entry_count)
+      entry_count = max_entry_count;
+   /* round down to a multiple of 4, per the quoted range */
+   entry_count &= ~3;
+   assert(entry_count >= 24);
+
+   conf->vs_entry_rows = row_count;
+   conf->vs_entry_count = entry_count;
+
+   return true;
+}
+
+/*
+ * Compute the Gen6 GS URB entry size and count.  Returns false when the
+ * entry is too large to be represented.
+ */
+static bool
+urb_init_gen6_gs_entry(const struct ilo_dev *dev,
+                       const struct ilo_state_urb_info *info,
+                       struct urb_configuration *conf)
+{
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 29:
+    *
+    *   "(GS Number of URB Entries)
+    *     Range [0,256] in multiples of 4
+    *           [0, 254] in multiples of 4[DevSNBGT1]"
+    *
+    *   "(GS URB Entry Allocation Size)
+    *     Range [0,4] = [1,5] 1024-bit URB rows"
+    */
+   const int max_entry_count = (dev->gt == 2) ? 256 : 252;
+   const int row_size = 1024 / 8;
+   int rows, entries;
+
+   ILO_DEV_ASSERT(dev, 6, 6);
+
+   /* entry size in 1024-bit rows; at least one row even when empty */
+   rows = (info->gs_entry_size + row_size - 1) / row_size;
+   if (rows > 5)
+      return false;
+   if (!rows)
+      rows = 1;
+
+   /* as many entries as fit, clamped and rounded down to a multiple of 4 */
+   entries = conf->gs_urb_alloc_8kb * 8192 / (row_size * rows);
+   if (entries > max_entry_count)
+      entries = max_entry_count;
+   entries &= ~3;
+
+   conf->gs_entry_rows = rows;
+   conf->gs_entry_count = entries;
+
+   return true;
+}
+
+/*
+ * Compute the Gen7+ VS URB entry size and count.  Returns false when the
+ * allocation cannot satisfy the hardware minimum.
+ */
+static bool
+urb_init_gen7_vs_entry(const struct ilo_dev *dev,
+ const struct ilo_state_urb_info *info,
+ struct urb_configuration *conf)
+{
+ /*
+ * From the Ivy Bridge PRM, volume 2 part 1, page 34-35:
+ *
+ * "VS URB Entry Allocation Size equal to 4(5 512-bit URB rows) may
+ * cause performance to decrease due to banking in the URB. Element
+ * sizes of 16 to 20 should be programmed with six 512-bit URB rows."
+ *
+ * "(VS URB Entry Allocation Size)
+ * Format: U9-1 count of 512-bit units"
+ *
+ * "(VS Number of URB Entries)
+ * [32,704]
+ * [32,512]
+ *
+ * Programming Restriction: VS Number of URB Entries must be divisible
+ * by 8 if the VS URB Entry Allocation Size is less than 9 512-bit URB
+ * entries."2:0" = reserved "000b""
+ *
+ * From the Haswell PRM, volume 2b, page 847:
+ *
+ * "(VS Number of URB Entries)
+ * [64,1664] DevHSW:GT3
+ * [64,1664] DevHSW:GT2
+ * [32,640] DevHSW:GT1"
+ */
+ const int row_size = 512 / 8;
+ int row_count, entry_count;
+ int entry_size;
+ int max_entry_count, min_entry_count;
+
+ ILO_DEV_ASSERT(dev, 7, 8);
+
+ /*
+ * From the Ivy Bridge PRM, volume 2 part 1, page 35:
+ *
+ * "Programming Restriction: As the VS URB entry serves as both the
+ * per-vertex input and output of the VS shader, the VS URB Allocation
+ * Size must be sized to the maximum of the vertex input and output
+ * structures."
+ *
+ * From the Ivy Bridge PRM, volume 2 part 1, page 42:
+ *
+ * "If the VS function is enabled, the VF-written VUEs are not required
+ * to have Vertex Headers, as the VS-incoming vertices are guaranteed
+ * to be consumed by the VS (i.e., the VS thread is responsible for
+ * overwriting the input vertex data)."
+ *
+ * VE and VS share the same VUE for each vertex.
+ */
+ entry_size = info->vs_entry_size;
+ if (entry_size < info->ve_entry_size)
+ entry_size = info->ve_entry_size;
+
+ /* avoid exactly 5 rows (URB banking, see the first quote above); at
+ * least one row even when empty
+ */
+ row_count = (entry_size + row_size - 1) / row_size;
+ if (row_count == 5 || !row_count)
+ row_count++;
+
+ /* entry count must be divisible by 8 for entries smaller than 9 rows */
+ entry_count = conf->vs_urb_alloc_8kb * 8192 / (row_size * row_count);
+ if (row_count < 9)
+ entry_count &= ~7;
+
+ /* gen/GT-specific entry count limits, quoted above */
+ switch (ilo_dev_gen(dev)) {
+ case ILO_GEN(8):
+ case ILO_GEN(7.5):
+ max_entry_count = (dev->gt >= 2) ? 1664 : 640;
+ min_entry_count = (dev->gt >= 2) ? 64 : 32;
+ break;
+ case ILO_GEN(7):
+ max_entry_count = (dev->gt == 2) ? 704 : 512;
+ min_entry_count = 32;
+ break;
+ default:
+ assert(!"unexpected gen");
+ return false;
+ break;
+ }
+
+ if (entry_count > max_entry_count)
+ entry_count = max_entry_count;
+ else if (entry_count < min_entry_count)
+ return false;
+
+ conf->vs_entry_rows = row_count;
+ conf->vs_entry_count = entry_count;
+
+ return true;
+}
+
+/*
+ * Compute the Gen7+ HS URB entry size and count.  Returns false when HS is
+ * enabled but no entry fits the allocation.
+ */
+static bool
+urb_init_gen7_hs_entry(const struct ilo_dev *dev,
+                       const struct ilo_state_urb_info *info,
+                       struct urb_configuration *conf)
+{
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 1, page 37:
+    *
+    *   "HS Number of URB Entries must be divisible by 8 if the HS URB Entry
+    *    Allocation Size is less than 9 512-bit URB
+    *    entries."2:0" = reserved "000"
+    *
+    *    [0,64]
+    *    [0,32]"
+    *
+    * From the Haswell PRM, volume 2b, page 849:
+    *
+    *   "(HS Number of URB Entries)
+    *    [0,128] DevHSW:GT2
+    *    [0,64] DevHSW:GT1"
+    */
+   const int row_size = 512 / 8;
+   int rows, entries;
+   int limit;
+
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   /* entry size in 512-bit rows; at least one row even when empty */
+   rows = (info->hs_entry_size + row_size - 1) / row_size;
+   if (!rows)
+      rows = 1;
+
+   /* entry count must be divisible by 8 for entries smaller than 9 rows */
+   entries = conf->hs_urb_alloc_8kb * 8192 / (row_size * rows);
+   if (rows < 9)
+      entries &= ~7;
+
+   /* gen/GT-specific entry count limits, quoted above */
+   switch (ilo_dev_gen(dev)) {
+   case ILO_GEN(8):
+   case ILO_GEN(7.5):
+      limit = (dev->gt >= 2) ? 128 : 64;
+      break;
+   case ILO_GEN(7):
+      limit = (dev->gt == 2) ? 64 : 32;
+      break;
+   default:
+      assert(!"unexpected gen");
+      return false;
+   }
+
+   if (entries > limit)
+      entries = limit;
+   else if (info->hs_entry_size && !entries)
+      return false;
+
+   conf->hs_entry_rows = rows;
+   conf->hs_entry_count = entries;
+
+   return true;
+}
+
+/*
+ * Compute the Gen7+ DS URB entry size and count.  Returns false when the
+ * entry is too large or the allocation cannot satisfy the hardware minimum.
+ */
+static bool
+urb_init_gen7_ds_entry(const struct ilo_dev *dev,
+ const struct ilo_state_urb_info *info,
+ struct urb_configuration *conf)
+{
+ /*
+ * From the Ivy Bridge PRM, volume 2 part 1, page 38:
+ *
+ * "(DS URB Entry Allocation Size)
+ * [0,9]"
+ *
+ * "(DS Number of URB Entries) If Domain Shader Thread Dispatch is
+ * Enabled then the minimum number handles that must be allocated is
+ * 138 URB entries.
+ * "2:0" = reserved "000"
+ *
+ * [0,448]
+ * [0,288]
+ *
+ * DS Number of URB Entries must be divisible by 8 if the DS URB Entry
+ * Allocation Size is less than 9 512-bit URB entries.If Domain Shader
+ * Thread Dispatch is Enabled then the minimum number of handles that
+ * must be allocated is 10 URB entries."
+ *
+ * From the Haswell PRM, volume 2b, page 851:
+ *
+ * "(DS Number of URB Entries)
+ * [0,960] DevHSW:GT2
+ * [0,384] DevHSW:GT1"
+ */
+ const int row_size = 512 / 8;
+ int row_count, entry_count;
+ int max_entry_count;
+
+ ILO_DEV_ASSERT(dev, 7, 8);
+
+ /* allocation size is [0,9], i.e. at most 10 rows; at least one row */
+ row_count = (info->ds_entry_size + row_size - 1) / row_size;
+ if (row_count > 10)
+ return false;
+ else if (!row_count)
+ row_count++;
+
+ /* entry count must be divisible by 8 for entries smaller than 9 rows */
+ entry_count = conf->ds_urb_alloc_8kb * 8192 / (row_size * row_count);
+ if (row_count < 9)
+ entry_count &= ~7;
+
+ /* gen/GT-specific entry count limits, quoted above */
+ switch (ilo_dev_gen(dev)) {
+ case ILO_GEN(8):
+ case ILO_GEN(7.5):
+ max_entry_count = (dev->gt >= 2) ? 960 : 384;
+ break;
+ case ILO_GEN(7):
+ max_entry_count = (dev->gt == 2) ? 448 : 288;
+ break;
+ default:
+ assert(!"unexpected gen");
+ return false;
+ break;
+ }
+
+ /* when DS is enabled, at least 10 handles must be allocated (see above) */
+ if (entry_count > max_entry_count)
+ entry_count = max_entry_count;
+ else if (info->ds_entry_size && entry_count < 10)
+ return false;
+
+ conf->ds_entry_rows = row_count;
+ conf->ds_entry_count = entry_count;
+
+ return true;
+}
+
+/*
+ * Compute the Gen7+ GS URB entry size and count.  Returns false when GS is
+ * enabled but fewer than two handles fit the allocation.
+ */
+static bool
+urb_init_gen7_gs_entry(const struct ilo_dev *dev,
+ const struct ilo_state_urb_info *info,
+ struct urb_configuration *conf)
+{
+ /*
+ * From the Ivy Bridge PRM, volume 2 part 1, page 40:
+ *
+ * "(GS Number of URB Entries) GS Number of URB Entries must be
+ * divisible by 8 if the GS URB Entry Allocation Size is less than 9
+ * 512-bit URB entries.
+ * "2:0" = reserved "000"
+ *
+ * [0,320]
+ * [0,192]"
+ *
+ * From the Ivy Bridge PRM, volume 2 part 1, page 171:
+ *
+ * "(DUAL_INSTANCE and DUAL_OBJECT) The GS must be allocated at least
+ * two URB handles or behavior is UNDEFINED."
+ *
+ * From the Haswell PRM, volume 2b, page 853:
+ *
+ * "(GS Number of URB Entries)
+ * [0,640] DevHSW:GT2
+ * [0,256] DevHSW:GT1
+ *
+ * Only if GS is disabled can this field be programmed to 0. If GS is
+ * enabled this field shall be programmed to a value greater than 0.
+ * For GS Dispatch Mode "Single", this field shall be programmed to a
+ * value greater than or equal to 1. For other GS Dispatch Modes,
+ * refer to the definition of Dispatch Mode (3DSTATE_GS) for minimum
+ * values of this field."
+ */
+ const int row_size = 512 / 8;
+ int row_count, entry_count;
+ int max_entry_count;
+
+ ILO_DEV_ASSERT(dev, 7, 8);
+
+ /* entry size in 512-bit rows; at least one row even when empty */
+ row_count = (info->gs_entry_size + row_size - 1) / row_size;
+ if (!row_count)
+ row_count++;
+
+ /* entry count must be divisible by 8 for entries smaller than 9 rows */
+ entry_count = conf->gs_urb_alloc_8kb * 8192 / (row_size * row_count);
+ if (row_count < 9)
+ entry_count &= ~7;
+
+ /* gen/GT-specific entry count limits, quoted above */
+ switch (ilo_dev_gen(dev)) {
+ case ILO_GEN(8):
+ case ILO_GEN(7.5):
+ max_entry_count = (dev->gt >= 2) ? 640 : 256;
+ break;
+ case ILO_GEN(7):
+ max_entry_count = (dev->gt == 2) ? 320 : 192;
+ break;
+ default:
+ assert(!"unexpected gen");
+ return false;
+ break;
+ }
+
+ /* enabled GS needs at least two handles (DUAL_* modes, see above) */
+ if (entry_count > max_entry_count)
+ entry_count = max_entry_count;
+ else if (info->gs_entry_size && entry_count < 2)
+ return false;
+
+ conf->gs_entry_rows = row_count;
+ conf->gs_entry_count = entry_count;
+
+ return true;
+}
+
+/*
+ * Derive a full URB/PCB configuration from "info".  Returns false when any
+ * stage's entries cannot be made to fit.
+ */
+static bool
+urb_get_gen6_configuration(const struct ilo_dev *dev,
+                           const struct ilo_state_urb_info *info,
+                           struct urb_configuration *conf)
+{
+   const bool is_gen7_plus = (ilo_dev_gen(dev) >= ILO_GEN(7));
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   memset(conf, 0, sizeof(*conf));
+
+   /* carve out the PCB first; the URB gets what remains */
+   if (is_gen7_plus)
+      urb_alloc_gen7_pcb(dev, info, conf);
+
+   urb_alloc_gen6_urb(dev, info, conf);
+
+   if (is_gen7_plus) {
+      return urb_init_gen7_vs_entry(dev, info, conf) &&
+             urb_init_gen7_hs_entry(dev, info, conf) &&
+             urb_init_gen7_ds_entry(dev, info, conf) &&
+             urb_init_gen7_gs_entry(dev, info, conf);
+   }
+
+   return urb_init_gen6_vs_entry(dev, info, conf) &&
+          urb_init_gen6_gs_entry(dev, info, conf);
+}
+
+/* encode the five 3DSTATE_PUSH_CONSTANT_ALLOC_* DW1 values */
+static bool
+urb_set_gen7_3dstate_push_constant_alloc(struct ilo_state_urb *urb,
+                                         const struct ilo_dev *dev,
+                                         const struct ilo_state_urb_info *info,
+                                         const struct urb_configuration *conf)
+{
+   const uint8_t sizes_kb[5] = {
+      conf->vs_pcb_alloc_kb,
+      conf->hs_pcb_alloc_kb,
+      conf->ds_pcb_alloc_kb,
+      conf->gs_pcb_alloc_kb,
+      conf->ps_pcb_alloc_kb,
+   };
+   uint8_t offset_kb = 0;
+   int i;
+
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   STATIC_ASSERT(ARRAY_SIZE(urb->pcb) >= 5);
+
+   for (i = 0; i < 5; i++) {
+      /* careful for the valid range of offsets: only advance the offset
+       * past stages that actually have an allocation
+       */
+      if (sizes_kb[i]) {
+         urb->pcb[i] = offset_kb << GEN7_PCB_ALLOC_DW1_OFFSET__SHIFT |
+                       sizes_kb[i] << GEN7_PCB_ALLOC_DW1_SIZE__SHIFT;
+         offset_kb += sizes_kb[i];
+      } else {
+         urb->pcb[i] = 0;
+      }
+   }
+
+   return true;
+}
+
+/* encode the Gen6 3DSTATE_URB DW1 (VS) and DW2 (GS) */
+static bool
+urb_set_gen6_3DSTATE_URB(struct ilo_state_urb *urb,
+                         const struct ilo_dev *dev,
+                         const struct ilo_state_urb_info *info,
+                         const struct urb_configuration *conf)
+{
+   ILO_DEV_ASSERT(dev, 6, 6);
+
+   /* both stages were given at least one row by the gen6 init functions */
+   assert(conf->vs_entry_rows && conf->gs_entry_rows);
+
+   STATIC_ASSERT(ARRAY_SIZE(urb->urb) >= 2);
+
+   /* entry sizes are programmed in rows minus one */
+   urb->urb[0] =
+      (conf->vs_entry_rows - 1) << GEN6_URB_DW1_VS_ENTRY_SIZE__SHIFT |
+      conf->vs_entry_count << GEN6_URB_DW1_VS_ENTRY_COUNT__SHIFT;
+   urb->urb[1] =
+      conf->gs_entry_count << GEN6_URB_DW2_GS_ENTRY_COUNT__SHIFT |
+      (conf->gs_entry_rows - 1) << GEN6_URB_DW2_GS_ENTRY_SIZE__SHIFT;
+
+   return true;
+}
+
+/* encode the four 3DSTATE_URB_{VS,HS,DS,GS} DW1 values */
+static bool
+urb_set_gen7_3dstate_urb(struct ilo_state_urb *urb,
+                         const struct ilo_dev *dev,
+                         const struct ilo_state_urb_info *info,
+                         const struct urb_configuration *conf)
+{
+   const uint8_t alloc_8kb[4] = {
+      conf->vs_urb_alloc_8kb,
+      conf->hs_urb_alloc_8kb,
+      conf->ds_urb_alloc_8kb,
+      conf->gs_urb_alloc_8kb,
+   };
+   const uint8_t entry_rows[4] = {
+      conf->vs_entry_rows,
+      conf->hs_entry_rows,
+      conf->ds_entry_rows,
+      conf->gs_entry_rows,
+   };
+   const int entry_counts[4] = {
+      conf->vs_entry_count,
+      conf->hs_entry_count,
+      conf->ds_entry_count,
+      conf->gs_entry_count,
+   };
+   uint8_t offset_8kb = conf->urb_offset_8kb;
+   int i;
+
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   STATIC_ASSERT(ARRAY_SIZE(urb->urb) >= 4);
+
+   for (i = 0; i < 4; i++) {
+      /* careful for the valid range of offsets: only advance the offset
+       * past stages that actually have URB space allocated
+       */
+      if (alloc_8kb[i]) {
+         assert(entry_rows[i]);
+         urb->urb[i] =
+            offset_8kb << GEN7_URB_DW1_OFFSET__SHIFT |
+            (entry_rows[i] - 1) << GEN7_URB_DW1_ENTRY_SIZE__SHIFT |
+            entry_counts[i] << GEN7_URB_DW1_ENTRY_COUNT__SHIFT;
+         offset_8kb += alloc_8kb[i];
+      } else {
+         urb->urb[i] = 0;
+      }
+   }
+
+   return true;
+}
+
+/* initialize "urb" from "info"; "urb" must be zeroed beforehand */
+bool
+ilo_state_urb_init(struct ilo_state_urb *urb,
+                   const struct ilo_dev *dev,
+                   const struct ilo_state_urb_info *info)
+{
+   /* catch reuse of a non-zeroed struct */
+   assert(ilo_is_zeroed(urb, sizeof(*urb)));
+
+   return ilo_state_urb_set_info(urb, dev, info);
+}
+
+/* initialize "urb" for internal RECTLIST draws (blitter-style operations) */
+bool
+ilo_state_urb_init_for_rectlist(struct ilo_state_urb *urb,
+                                const struct ilo_dev *dev,
+                                uint8_t vf_attr_count)
+{
+   /* only the VE entry size matters: 4 dwords per VF attribute */
+   struct ilo_state_urb_info info = {
+      .ve_entry_size = sizeof(uint32_t) * 4 * vf_attr_count,
+   };
+
+   return ilo_state_urb_init(urb, dev, &info);
+}
+
+/* (re-)derive the hardware state in "urb" from "info" */
+bool
+ilo_state_urb_set_info(struct ilo_state_urb *urb,
+                       const struct ilo_dev *dev,
+                       const struct ilo_state_urb_info *info)
+{
+   struct urb_configuration conf;
+   bool ok;
+
+   /* derive the configuration first, then emit the hw state from it */
+   ok = urb_get_gen6_configuration(dev, info, &conf);
+   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+      ok &= urb_set_gen7_3dstate_push_constant_alloc(urb, dev, info, &conf);
+      ok &= urb_set_gen7_3dstate_urb(urb, dev, info, &conf);
+   } else {
+      ok &= urb_set_gen6_3DSTATE_URB(urb, dev, info, &conf);
+   }
+
+   assert(ok);
+
+   return ok;
+}
+
+/* mark every URB-related state as dirty, forcing a full re-emission */
+void
+ilo_state_urb_full_delta(const struct ilo_state_urb *urb,
+                         const struct ilo_dev *dev,
+                         struct ilo_state_urb_delta *delta)
+{
+   uint32_t dirty = ILO_STATE_URB_3DSTATE_URB_VS |
+                    ILO_STATE_URB_3DSTATE_URB_GS;
+
+   /* Gen7+ adds HS/DS and the per-stage PCB allocations */
+   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+      dirty |= ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_VS |
+               ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_HS |
+               ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_DS |
+               ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_GS |
+               ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_PS |
+               ILO_STATE_URB_3DSTATE_URB_HS |
+               ILO_STATE_URB_3DSTATE_URB_DS;
+   }
+
+   delta->dirty = dirty;
+}
+
+/*
+ * Compare "urb" against "old" and set delta->dirty to the states that need
+ * to be re-emitted.
+ */
+void
+ilo_state_urb_get_delta(const struct ilo_state_urb *urb,
+ const struct ilo_dev *dev,
+ const struct ilo_state_urb *old,
+ struct ilo_state_urb_delta *delta)
+{
+ delta->dirty = 0;
+
+ if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+ /* the five PCB allocations are dirtied as a group */
+ if (memcmp(urb->pcb, old->pcb, sizeof(urb->pcb))) {
+ delta->dirty |= ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_VS |
+ ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_HS |
+ ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_DS |
+ ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_GS |
+ ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_PS;
+ }
+
+ /*
+ * From the Ivy Bridge PRM, volume 2 part 1, page 34:
+ *
+ * "3DSTATE_URB_HS, 3DSTATE_URB_DS, and 3DSTATE_URB_GS must also be
+ * programmed in order for the programming of this state
+ * (3DSTATE_URB_VS) to be valid."
+ *
+ * The same is true for the other three states.
+ */
+ if (memcmp(urb->urb, old->urb, sizeof(urb->urb))) {
+ delta->dirty |= ILO_STATE_URB_3DSTATE_URB_VS |
+ ILO_STATE_URB_3DSTATE_URB_HS |
+ ILO_STATE_URB_3DSTATE_URB_DS |
+ ILO_STATE_URB_3DSTATE_URB_GS;
+ }
+ } else {
+ /* Gen6 3DSTATE_URB uses only the first two dwords of urb->urb */
+ if (memcmp(urb->urb, old->urb, sizeof(uint32_t) * 2)) {
+ delta->dirty |= ILO_STATE_URB_3DSTATE_URB_VS |
+ ILO_STATE_URB_3DSTATE_URB_GS;
+ }
+ }
+}
diff --git a/src/gallium/drivers/ilo/core/ilo_state_urb.h b/src/gallium/drivers/ilo/core/ilo_state_urb.h
new file mode 100644
index 00000000000..9522b3bd681
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_urb.h
@@ -0,0 +1,103 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Chia-I Wu <[email protected]>
+ */
+
#ifndef ILO_STATE_URB_H
#define ILO_STATE_URB_H

#include "genhw/genhw.h"

#include "ilo_core.h"
#include "ilo_dev.h"

/* one dirty bit per hardware command that ilo_state_urb backs */
enum ilo_state_urb_dirty_bits {
   ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_VS = (1 << 0),
   ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_HS = (1 << 1),
   ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_DS = (1 << 2),
   ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_GS = (1 << 3),
   ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_PS = (1 << 4),
   ILO_STATE_URB_3DSTATE_URB_VS                 = (1 << 5),
   ILO_STATE_URB_3DSTATE_URB_HS                 = (1 << 6),
   ILO_STATE_URB_3DSTATE_URB_DS                 = (1 << 7),
   ILO_STATE_URB_3DSTATE_URB_GS                 = (1 << 8),
};

/**
 * URB entry allocation sizes and sizes of constant data extracted from PCBs
 * to threads.
 */
struct ilo_state_urb_info {
   bool gs_enable;

   /* whether each stage reads push-constant data */
   bool vs_const_data;
   bool hs_const_data;
   bool ds_const_data;
   bool gs_const_data;
   bool ps_const_data;

   /* URB entry sizes; units presumably bytes — confirm in ilo_state_urb.c */
   uint16_t ve_entry_size;
   uint16_t vs_entry_size;
   uint16_t hs_entry_size;
   uint16_t ds_entry_size;
   uint16_t gs_entry_size;
};

struct ilo_state_urb {
   /* DWords for the five 3DSTATE_PUSH_CONSTANT_ALLOC_x commands (Gen7+) */
   uint32_t pcb[5];
   /* DWords for 3DSTATE_URB_{VS,HS,DS,GS}; only [0..1] used pre-Gen7 */
   uint32_t urb[4];
};

/* dirty bits from enum ilo_state_urb_dirty_bits */
struct ilo_state_urb_delta {
   uint32_t dirty;
};

bool
ilo_state_urb_init(struct ilo_state_urb *urb,
                   const struct ilo_dev *dev,
                   const struct ilo_state_urb_info *info);

bool
ilo_state_urb_init_for_rectlist(struct ilo_state_urb *urb,
                                const struct ilo_dev *dev,
                                uint8_t vf_attr_count);

bool
ilo_state_urb_set_info(struct ilo_state_urb *urb,
                       const struct ilo_dev *dev,
                       const struct ilo_state_urb_info *info);

/* mark every backing command dirty, e.g. after a context switch */
void
ilo_state_urb_full_delta(const struct ilo_state_urb *urb,
                         const struct ilo_dev *dev,
                         struct ilo_state_urb_delta *delta);

/* mark only the commands whose DWords differ from \p old */
void
ilo_state_urb_get_delta(const struct ilo_state_urb *urb,
                        const struct ilo_dev *dev,
                        const struct ilo_state_urb *old,
                        struct ilo_state_urb_delta *delta);

#endif /* ILO_STATE_URB_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_state_vf.c b/src/gallium/drivers/ilo/core/ilo_state_vf.c
new file mode 100644
index 00000000000..ddc75428ed7
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_vf.c
@@ -0,0 +1,984 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2012-2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Chia-I Wu <[email protected]>
+ */
+
+#include "ilo_debug.h"
+#include "ilo_buffer.h"
+#include "ilo_state_vf.h"
+
/**
 * Assert-validate the user vertex elements: buffer index, vertex offset,
 * and surface format.  Always returns true; violations only trip asserts
 * in debug builds.
 */
static bool
vf_validate_gen6_elements(const struct ilo_dev *dev,
                          const struct ilo_state_vf_info *info)
{
   /*
    * From the Sandy Bridge PRM, volume 2 part 1, page 95:
    *
    *     "(Source Element Offset (in bytes))
    *      Format: U11
    *      Range [0,2047]"
    *
    * From the Haswell PRM, volume 2d, page 415:
    *
    *     "(Source Element Offset)
    *      Format: U12 byte offset
    *      ...
    *      [0,4095]"
    *
    * From the Broadwell PRM, volume 2d, page 469:
    *
    *     "(Source Element Offset)
    *      Format: U12 byte offset
    *      ...
    *      [0,2047]"
    */
   /* only Haswell (Gen7.5) allows the wider U12 range */
   const uint16_t max_vertex_offset =
      (ilo_dev_gen(dev) == ILO_GEN(7.5)) ? 4096 : 2048;
   uint8_t i;

   ILO_DEV_ASSERT(dev, 6, 8);

   assert(info->element_count <= ILO_STATE_VF_MAX_ELEMENT_COUNT);

   for (i = 0; i < info->element_count; i++) {
      const struct ilo_state_vf_element_info *elem = &info->elements[i];

      assert(elem->buffer < ILO_STATE_VF_MAX_BUFFER_COUNT);
      assert(elem->vertex_offset < max_vertex_offset);
      assert(ilo_state_vf_valid_element_format(dev, elem->format));
   }

   return true;
}
+
+static uint32_t
+get_gen6_component_controls(const struct ilo_dev *dev,
+ enum gen_vf_component comp_x,
+ enum gen_vf_component comp_y,
+ enum gen_vf_component comp_z,
+ enum gen_vf_component comp_w)
+{
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ return comp_x << GEN6_VE_DW1_COMP0__SHIFT |
+ comp_y << GEN6_VE_DW1_COMP1__SHIFT |
+ comp_z << GEN6_VE_DW1_COMP2__SHIFT |
+ comp_w << GEN6_VE_DW1_COMP3__SHIFT;
+}
+
+static bool
+get_gen6_edge_flag_format(const struct ilo_dev *dev,
+ const struct ilo_state_vf_element_info *elem,
+ enum gen_surface_format *format)
+{
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ /*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 94:
+ *
+ * "The Source Element Format must be set to the UINT format."
+ *
+ * From the Haswell PRM, volume 2d, page 413:
+ *
+ * "The SourceElementFormat needs to be a single-component format with
+ * an element which has edge flag enabled."
+ */
+ if (elem->component_count != 1)
+ return false;
+
+ /* pick the format we like */
+ switch (elem->format_size) {
+ case 1:
+ *format = GEN6_FORMAT_R8_UINT;
+ break;
+ case 2:
+ *format = GEN6_FORMAT_R16_UINT;
+ break;
+ case 4:
+ *format = GEN6_FORMAT_R32_UINT;
+ break;
+ default:
+ return false;
+ break;
+ }
+
+ return true;
+}
+
/**
 * Translate the user vertex elements into VERTEX_ELEMENT_STATE DWord pairs
 * (vf->user_ve), and precompute both the edge-flag and non-edge-flag
 * variants of the last element (vf->last_user_ve), so that
 * ilo_state_vf_set_params() can later patch the last VE without revisiting
 * the element info.
 */
static bool
vf_set_gen6_3DSTATE_VERTEX_ELEMENTS(struct ilo_state_vf *vf,
                                    const struct ilo_dev *dev,
                                    const struct ilo_state_vf_info *info)
{
   enum gen_surface_format edge_flag_format;
   uint32_t dw0, dw1;
   uint8_t i;

   ILO_DEV_ASSERT(dev, 6, 8);

   if (!vf_validate_gen6_elements(dev, info))
      return false;

   for (i = 0; i < info->element_count; i++) {
      const struct ilo_state_vf_element_info *elem = &info->elements[i];
      /* unstored components read 0, 0, 0, 1 (int or fp depending on type) */
      enum gen_vf_component components[4] = {
         GEN6_VFCOMP_STORE_0,
         GEN6_VFCOMP_STORE_0,
         GEN6_VFCOMP_STORE_0,
         (elem->is_integer) ? GEN6_VFCOMP_STORE_1_INT :
                              GEN6_VFCOMP_STORE_1_FP,
      };

      /* store the first component_count components from the source */
      switch (elem->component_count) {
      case 4: components[3] = GEN6_VFCOMP_STORE_SRC; /* fall through */
      case 3: components[2] = GEN6_VFCOMP_STORE_SRC; /* fall through */
      case 2: components[1] = GEN6_VFCOMP_STORE_SRC; /* fall through */
      case 1: components[0] = GEN6_VFCOMP_STORE_SRC; break;
      default:
         assert(!"unexpected component count");
         break;
      }

      dw0 = elem->buffer << GEN6_VE_DW0_VB_INDEX__SHIFT |
            GEN6_VE_DW0_VALID |
            elem->format << GEN6_VE_DW0_FORMAT__SHIFT |
            elem->vertex_offset << GEN6_VE_DW0_VB_OFFSET__SHIFT;
      dw1 = get_gen6_component_controls(dev,
            components[0], components[1],
            components[2], components[3]);

      STATIC_ASSERT(ARRAY_SIZE(vf->user_ve[i]) >= 2);
      vf->user_ve[i][0] = dw0;
      vf->user_ve[i][1] = dw1;
   }

   vf->user_ve_count = i;

   vf->edge_flag_supported = (i && get_gen6_edge_flag_format(dev,
            &info->elements[i - 1], &edge_flag_format));
   if (vf->edge_flag_supported) {
      const struct ilo_state_vf_element_info *elem = &info->elements[i - 1];

      /*
       * without edge flag enable
       *
       * NOTE: dw0/dw1 intentionally still hold the values computed for the
       * last element in the loop above.
       */
      vf->last_user_ve[0][0] = dw0;
      vf->last_user_ve[0][1] = dw1;

      /*
       * From the Sandy Bridge PRM, volume 2 part 1, page 94:
       *
       *     "This bit (Edge Flag Enable) must only be ENABLED on the last
       *      valid VERTEX_ELEMENT structure.
       *
       *      When set, Component 0 Control must be set to
       *      VFCOMP_STORE_SRC, and Component 1-3 Control must be set to
       *      VFCOMP_NOSTORE."
       */
      dw0 = elem->buffer << GEN6_VE_DW0_VB_INDEX__SHIFT |
            GEN6_VE_DW0_VALID |
            edge_flag_format << GEN6_VE_DW0_FORMAT__SHIFT |
            GEN6_VE_DW0_EDGE_FLAG_ENABLE |
            elem->vertex_offset << GEN6_VE_DW0_VB_OFFSET__SHIFT;
      dw1 = get_gen6_component_controls(dev, GEN6_VFCOMP_STORE_SRC,
            GEN6_VFCOMP_NOSTORE, GEN6_VFCOMP_NOSTORE, GEN6_VFCOMP_NOSTORE);

      /* with edge flag enable */
      vf->last_user_ve[1][0] = dw0;
      vf->last_user_ve[1][1] = dw1;
   }

   return true;
}
+
/**
 * Record per-element instancing data in the Gen6 VERTEX_BUFFER_STATE layout
 * and build the VB-to-first-VE mapping.  Before Gen8 the instancing controls
 * live in the vertex buffer state, so all elements sourced from the same
 * buffer must agree on them (asserted below).
 */
static bool
vf_set_gen6_vertex_buffer_state(struct ilo_state_vf *vf,
                                const struct ilo_dev *dev,
                                const struct ilo_state_vf_info *info)
{
   uint8_t i;

   ILO_DEV_ASSERT(dev, 6, 7.5);

   /* -1 marks "no element uses this VB yet" */
   memset(vf->vb_to_first_elem, -1, sizeof(vf->vb_to_first_elem));

   for (i = 0; i < info->element_count; i++) {
      const struct ilo_state_vf_element_info *elem = &info->elements[i];

      STATIC_ASSERT(ARRAY_SIZE(vf->user_instancing[i]) >= 2);
      /* instancing enable only */
      vf->user_instancing[i][0] = (elem->instancing_enable) ?
         GEN6_VB_DW0_ACCESS_INSTANCEDATA :
         GEN6_VB_DW0_ACCESS_VERTEXDATA;
      vf->user_instancing[i][1] = elem->instancing_step_rate;

      /*
       * Instancing is per VB, not per VE, before Gen8.  Set up a VB-to-VE
       * mapping as well.
       */
      if (vf->vb_to_first_elem[elem->buffer] < 0) {
         vf->vb_to_first_elem[elem->buffer] = i;
      } else {
         /* later elements on the same VB must match the first one */
         const struct ilo_state_vf_element_info *first =
            &info->elements[vf->vb_to_first_elem[elem->buffer]];

         assert(elem->instancing_enable == first->instancing_enable &&
                elem->instancing_step_rate == first->instancing_step_rate);
      }
   }

   return true;
}
+
+static bool
+vf_set_gen8_3DSTATE_VF_INSTANCING(struct ilo_state_vf *vf,
+ const struct ilo_dev *dev,
+ const struct ilo_state_vf_info *info)
+{
+ uint8_t i;
+
+ ILO_DEV_ASSERT(dev, 8, 8);
+
+ for (i = 0; i < info->element_count; i++) {
+ const struct ilo_state_vf_element_info *elem = &info->elements[i];
+
+ STATIC_ASSERT(ARRAY_SIZE(vf->user_instancing[i]) >= 2);
+ vf->user_instancing[i][0] = (elem->instancing_enable) ?
+ GEN8_INSTANCING_DW1_ENABLE : 0;
+ vf->user_instancing[i][1] = elem->instancing_step_rate;
+ }
+
+ return true;
+}
+
+static uint32_t
+get_gen6_component_zeros(const struct ilo_dev *dev)
+{
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ return get_gen6_component_controls(dev,
+ GEN6_VFCOMP_STORE_0,
+ GEN6_VFCOMP_STORE_0,
+ GEN6_VFCOMP_STORE_0,
+ GEN6_VFCOMP_STORE_0);
+}
+
+static uint32_t
+get_gen6_component_ids(const struct ilo_dev *dev,
+ bool vertexid, bool instanceid)
+{
+ ILO_DEV_ASSERT(dev, 6, 7.5);
+
+ return get_gen6_component_controls(dev,
+ (vertexid) ? GEN6_VFCOMP_STORE_VID : GEN6_VFCOMP_STORE_0,
+ (instanceid) ? GEN6_VFCOMP_STORE_IID : GEN6_VFCOMP_STORE_0,
+ GEN6_VFCOMP_STORE_0,
+ GEN6_VFCOMP_STORE_0);
+}
+
/**
 * Set up the internal vertex elements prepended to the user ones: an
 * all-zero VE (for a zeroed VUE header, or as a mandatory placeholder when
 * there is no element at all) and/or a VE holding VertexID/InstanceID.
 * At most two internal VEs exist, matching the size of dw1[] below.
 */
static bool
vf_params_set_gen6_internal_ve(struct ilo_state_vf *vf,
                               const struct ilo_dev *dev,
                               const struct ilo_state_vf_params_info *params,
                               uint8_t user_ve_count)
{
   const bool prepend_ids =
      (params->prepend_vertexid || params->prepend_instanceid);
   uint8_t internal_ve_count = 0, i;
   uint32_t dw1[2];


   ILO_DEV_ASSERT(dev, 6, 8);

   /*
    * From the Sandy Bridge PRM, volume 2 part 1, page 92:
    *
    *    "- At least one VERTEX_ELEMENT_STATE structure must be included.
    *
    *     - Inclusion of partial VERTEX_ELEMENT_STATE structures is
    *       UNDEFINED.
    *
    *     - SW must ensure that at least one vertex element is defined prior
    *       to issuing a 3DPRIMTIVE command, or operation is UNDEFINED.
    *
    *     - There are no "holes" allowed in the destination vertex: NOSTORE
    *       components must be overwritten by subsequent components unless
    *       they are the trailing DWords of the vertex.  Software must
    *       explicitly chose some value (probably 0) to be written into
    *       DWords that would otherwise be "holes"."
    *
    *     - ...
    *
    *     - [DevILK+] Element[0] must be valid."
    */
   /* also emit a zero VE when there would otherwise be no VE at all */
   if (params->prepend_zeros || (!user_ve_count && !prepend_ids))
      dw1[internal_ve_count++] = get_gen6_component_zeros(dev);

   if (prepend_ids) {
      if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
         /* placeholder for 3DSTATE_VF_SGVS */
         dw1[internal_ve_count++] = get_gen6_component_zeros(dev);
      } else {
         dw1[internal_ve_count++] = get_gen6_component_ids(dev,
               params->prepend_vertexid, params->prepend_instanceid);
      }
   }

   for (i = 0; i < internal_ve_count; i++) {
      STATIC_ASSERT(ARRAY_SIZE(vf->internal_ve[i]) >= 2);
      /* internal VEs read no VB data: only the valid bit and DW1 controls */
      vf->internal_ve[i][0] = GEN6_VE_DW0_VALID;
      vf->internal_ve[i][1] = dw1[i];
   }

   vf->internal_ve_count = internal_ve_count;

   return true;
}
+
+static bool
+vf_params_set_gen8_3DSTATE_VF_SGVS(struct ilo_state_vf *vf,
+ const struct ilo_dev *dev,
+ const struct ilo_state_vf_params_info *params)
+{
+ const uint8_t attr = (params->prepend_zeros) ? 1 : 0;
+ uint32_t dw1;
+
+ ILO_DEV_ASSERT(dev, 8, 8);
+
+ dw1 = 0;
+
+ if (params->prepend_instanceid) {
+ dw1 |= GEN8_SGVS_DW1_IID_ENABLE |
+ 1 << GEN8_SGVS_DW1_IID_VE_COMP__SHIFT |
+ attr << GEN8_SGVS_DW1_IID_VE_INDEX__SHIFT;
+ }
+
+ if (params->prepend_vertexid) {
+ dw1 |= GEN8_SGVS_DW1_VID_ENABLE |
+ 0 << GEN8_SGVS_DW1_VID_VE_COMP__SHIFT |
+ attr << GEN8_SGVS_DW1_VID_VE_INDEX__SHIFT;
+ }
+
+ STATIC_ASSERT(ARRAY_SIZE(vf->sgvs) >= 1);
+ vf->sgvs[0] = dw1;
+
+ return true;
+}
+
+static uint32_t
+get_gen6_fixed_cut_index(const struct ilo_dev *dev,
+ enum gen_index_format format)
+{
+ const uint32_t fixed = ~0u;
+
+ ILO_DEV_ASSERT(dev, 6, 7);
+
+ switch (format) {
+ case GEN6_INDEX_BYTE: return (uint8_t) fixed;
+ case GEN6_INDEX_WORD: return (uint16_t) fixed;
+ case GEN6_INDEX_DWORD: return (uint32_t) fixed;
+ default:
+ assert(!"unknown index format");
+ return fixed;
+ }
+}
+
+static bool
+get_gen6_cut_index_supported(const struct ilo_dev *dev,
+ enum gen_3dprim_type topology)
+{
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ /*
+ * See the Sandy Bridge PRM, volume 2 part 1, page 80 and the Haswell PRM,
+ * volume 7, page 456.
+ */
+ switch (topology) {
+ case GEN6_3DPRIM_TRIFAN:
+ case GEN6_3DPRIM_QUADLIST:
+ case GEN6_3DPRIM_QUADSTRIP:
+ case GEN6_3DPRIM_POLYGON:
+ case GEN6_3DPRIM_LINELOOP:
+ return (ilo_dev_gen(dev) >= ILO_GEN(7.5));
+ case GEN6_3DPRIM_RECTLIST:
+ case GEN6_3DPRIM_TRIFAN_NOSTIPPLE:
+ return false;
+ default:
+ return true;
+ }
+}
+
+static bool
+vf_params_set_gen6_3dstate_index_buffer(struct ilo_state_vf *vf,
+ const struct ilo_dev *dev,
+ const struct ilo_state_vf_params_info *params)
+{
+ uint32_t dw0 = 0;
+
+ ILO_DEV_ASSERT(dev, 6, 7);
+
+ /* cut index only, as in 3DSTATE_VF */
+ if (params->cut_index_enable) {
+ assert(get_gen6_cut_index_supported(dev, params->cv_topology));
+ assert(get_gen6_fixed_cut_index(dev, params->cv_index_format) ==
+ params->cut_index);
+
+ dw0 |= GEN6_IB_DW0_CUT_INDEX_ENABLE;
+ }
+
+ STATIC_ASSERT(ARRAY_SIZE(vf->cut) >= 1);
+ vf->cut[0] = dw0;
+
+ return true;
+}
+
+static bool
+vf_params_set_gen75_3DSTATE_VF(struct ilo_state_vf *vf,
+ const struct ilo_dev *dev,
+ const struct ilo_state_vf_params_info *params)
+{
+ uint32_t dw0 = 0;
+
+ ILO_DEV_ASSERT(dev, 7.5, 8);
+
+ if (params->cut_index_enable) {
+ assert(get_gen6_cut_index_supported(dev, params->cv_topology));
+ dw0 |= GEN75_VF_DW0_CUT_INDEX_ENABLE;
+ }
+
+ STATIC_ASSERT(ARRAY_SIZE(vf->cut) >= 2);
+ vf->cut[0] = dw0;
+ vf->cut[1] = params->cut_index;
+
+ return true;
+}
+
/**
 * Assert-validate a vertex buffer binding: offset within the BO, pitch
 * range, and 64-bit alignment when the buffer feeds double-precision
 * elements.  Always returns true; violations only trip asserts.
 */
static bool
vertex_buffer_validate_gen6(const struct ilo_dev *dev,
                            const struct ilo_state_vertex_buffer_info *info)
{
   ILO_DEV_ASSERT(dev, 6, 8);

   if (info->buf)
      assert(info->offset < info->buf->bo_size && info->size);

   /*
    * From the Sandy Bridge PRM, volume 2 part 1, page 86:
    *
    *     "(Buffer Pitch)
    *      Range  [DevCTG+]: [0,2048] Bytes"
    */
   assert(info->stride <= 2048);

   /*
    * From the Sandy Bridge PRM, volume 2 part 1, page 86:
    *
    *     "64-bit floating point values must be 64-bit aligned in memory, or
    *      UNPREDICTABLE data will be fetched.  When accessing an element
    *      containing 64-bit floating point values, the Buffer Starting
    *      Address and Source Element Offset values must add to a 64-bit
    *      aligned address, and BufferPitch must be a multiple of 64-bits."
    */
   if (info->cv_has_double) {
      assert(info->stride % 8 == 0);
      /* element offsets are not known here; the caller supplies them mod 8 */
      assert((info->offset + info->cv_double_vertex_offset_mod_8) % 8 == 0);
   }

   return true;
}
+
+static uint32_t
+vertex_buffer_get_gen6_size(const struct ilo_dev *dev,
+ const struct ilo_state_vertex_buffer_info *info)
+{
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ if (!info->buf)
+ return 0;
+
+ return (info->offset + info->size <= info->buf->bo_size) ? info->size :
+ info->buf->bo_size - info->offset;
+}
+
/**
 * Fill the VERTEX_BUFFER_STATE DWords (vb->vb): pitch and flags in DW0,
 * start offset in DW1, and either the buffer size (Gen8+) or the address
 * of the last valid byte (pre-Gen8) in DW2.  The BO address itself is
 * relocated at emit time; here only need_bo records whether one is bound.
 */
static bool
vertex_buffer_set_gen8_vertex_buffer_state(struct ilo_state_vertex_buffer *vb,
                                           const struct ilo_dev *dev,
                                           const struct ilo_state_vertex_buffer_info *info)
{
   const uint32_t size = vertex_buffer_get_gen6_size(dev, info);
   uint32_t dw0;

   ILO_DEV_ASSERT(dev, 6, 8);

   if (!vertex_buffer_validate_gen6(dev, info))
      return false;

   dw0 = info->stride << GEN6_VB_DW0_PITCH__SHIFT;

   if (ilo_dev_gen(dev) >= ILO_GEN(7))
      dw0 |= GEN7_VB_DW0_ADDR_MODIFIED;
   if (!info->buf)
      dw0 |= GEN6_VB_DW0_IS_NULL;

   STATIC_ASSERT(ARRAY_SIZE(vb->vb) >= 3);
   vb->vb[0] = dw0;
   vb->vb[1] = info->offset;

   if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
      vb->vb[2] = size;
   } else {
      /* address of the last valid byte */
      vb->vb[2] = (size) ? info->offset + size - 1 : 0;
   }

   vb->need_bo = (info->buf != NULL);

   return true;
}
+
+static uint32_t
+get_index_format_size(enum gen_index_format format)
+{
+ switch (format) {
+ case GEN6_INDEX_BYTE: return 1;
+ case GEN6_INDEX_WORD: return 2;
+ case GEN6_INDEX_DWORD: return 4;
+ default:
+ assert(!"unknown index format");
+ return 1;
+ }
+}
+
/**
 * Assert-validate an index buffer binding: the start offset must be
 * index-size aligned and fall within the BO.  Always returns true;
 * violations only trip asserts in debug builds.
 */
static bool
index_buffer_validate_gen6(const struct ilo_dev *dev,
                           const struct ilo_state_index_buffer_info *info)
{
   const uint32_t format_size = get_index_format_size(info->format);

   ILO_DEV_ASSERT(dev, 6, 8);

   /*
    * From the Sandy Bridge PRM, volume 2 part 1, page 79:
    *
    *     "This field (Buffer Starting Address) contains the size-aligned (as
    *      specified by Index Format) Graphics Address of the first element of
    *      interest within the index buffer."
    */
   assert(info->offset % format_size == 0);

   if (info->buf)
      assert(info->offset < info->buf->bo_size && info->size);

   return true;
}
+
+static uint32_t
+index_buffer_get_gen6_size(const struct ilo_dev *dev,
+ const struct ilo_state_index_buffer_info *info)
+{
+ uint32_t size;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ if (!info->buf)
+ return 0;
+
+ size = (info->offset + info->size <= info->buf->bo_size) ? info->size :
+ info->buf->bo_size - info->offset;
+
+ if (ilo_dev_gen(dev) < ILO_GEN(8)) {
+ const uint32_t format_size = get_index_format_size(info->format);
+ size -= (size % format_size);
+ }
+
+ return size;
+}
+
/**
 * Fill the 3DSTATE_INDEX_BUFFER DWords (ib->ib): index format, start
 * offset, and either the buffer size (Gen8+) or the address of the last
 * valid byte (pre-Gen8).  The BO address is relocated at emit time;
 * need_bo records whether a buffer is bound.
 */
static bool
index_buffer_set_gen8_3DSTATE_INDEX_BUFFER(struct ilo_state_index_buffer *ib,
                                           const struct ilo_dev *dev,
                                           const struct ilo_state_index_buffer_info *info)
{
   const uint32_t size = index_buffer_get_gen6_size(dev, info);

   ILO_DEV_ASSERT(dev, 6, 8);

   if (!index_buffer_validate_gen6(dev, info))
      return false;

   STATIC_ASSERT(ARRAY_SIZE(ib->ib) >= 3);
   if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
      ib->ib[0] = info->format << GEN8_IB_DW1_FORMAT__SHIFT;
      ib->ib[1] = info->offset;
      ib->ib[2] = size;
   } else {
      ib->ib[0] = info->format << GEN6_IB_DW0_FORMAT__SHIFT;
      ib->ib[1] = info->offset;
      /* address of the last valid byte, or 0 */
      ib->ib[2] = (size) ? info->offset + size - 1 : 0;
   }

   ib->need_bo = (info->buf != NULL);

   return true;
}
+
/**
 * Whether \p format may be used as a vertex element source format on this
 * device.  The table maps each supported format to the first gen that
 * supports it (ILO_GEN(1) meaning "all gens"); absent entries are zero and
 * thus never valid.
 */
bool
ilo_state_vf_valid_element_format(const struct ilo_dev *dev,
                                  enum gen_surface_format format)
{
   /*
    * This table is based on:
    *
    *  - the Sandy Bridge PRM, volume 4 part 1, page 88-97
    *  - the Ivy Bridge PRM, volume 2 part 1, page 97-99
    *  - the Haswell PRM, volume 7, page 467-470
    */
   static const int vf_element_formats[] = {
      [GEN6_FORMAT_R32G32B32A32_FLOAT]       = ILO_GEN(  1),
      [GEN6_FORMAT_R32G32B32A32_SINT]        = ILO_GEN(  1),
      [GEN6_FORMAT_R32G32B32A32_UINT]        = ILO_GEN(  1),
      [GEN6_FORMAT_R32G32B32A32_UNORM]       = ILO_GEN(  1),
      [GEN6_FORMAT_R32G32B32A32_SNORM]       = ILO_GEN(  1),
      [GEN6_FORMAT_R64G64_FLOAT]             = ILO_GEN(  1),
      [GEN6_FORMAT_R32G32B32A32_SSCALED]     = ILO_GEN(  1),
      [GEN6_FORMAT_R32G32B32A32_USCALED]     = ILO_GEN(  1),
      [GEN6_FORMAT_R32G32B32A32_SFIXED]      = ILO_GEN(7.5),
      [GEN6_FORMAT_R32G32B32_FLOAT]          = ILO_GEN(  1),
      [GEN6_FORMAT_R32G32B32_SINT]           = ILO_GEN(  1),
      [GEN6_FORMAT_R32G32B32_UINT]           = ILO_GEN(  1),
      [GEN6_FORMAT_R32G32B32_UNORM]          = ILO_GEN(  1),
      [GEN6_FORMAT_R32G32B32_SNORM]          = ILO_GEN(  1),
      [GEN6_FORMAT_R32G32B32_SSCALED]        = ILO_GEN(  1),
      [GEN6_FORMAT_R32G32B32_USCALED]        = ILO_GEN(  1),
      [GEN6_FORMAT_R32G32B32_SFIXED]         = ILO_GEN(7.5),
      [GEN6_FORMAT_R16G16B16A16_UNORM]       = ILO_GEN(  1),
      [GEN6_FORMAT_R16G16B16A16_SNORM]       = ILO_GEN(  1),
      [GEN6_FORMAT_R16G16B16A16_SINT]        = ILO_GEN(  1),
      [GEN6_FORMAT_R16G16B16A16_UINT]        = ILO_GEN(  1),
      [GEN6_FORMAT_R16G16B16A16_FLOAT]       = ILO_GEN(  1),
      [GEN6_FORMAT_R32G32_FLOAT]             = ILO_GEN(  1),
      [GEN6_FORMAT_R32G32_SINT]              = ILO_GEN(  1),
      [GEN6_FORMAT_R32G32_UINT]              = ILO_GEN(  1),
      [GEN6_FORMAT_R32G32_UNORM]             = ILO_GEN(  1),
      [GEN6_FORMAT_R32G32_SNORM]             = ILO_GEN(  1),
      [GEN6_FORMAT_R64_FLOAT]                = ILO_GEN(  1),
      [GEN6_FORMAT_R16G16B16A16_SSCALED]     = ILO_GEN(  1),
      [GEN6_FORMAT_R16G16B16A16_USCALED]     = ILO_GEN(  1),
      [GEN6_FORMAT_R32G32_SSCALED]           = ILO_GEN(  1),
      [GEN6_FORMAT_R32G32_USCALED]           = ILO_GEN(  1),
      [GEN6_FORMAT_R32G32_SFIXED]            = ILO_GEN(7.5),
      [GEN6_FORMAT_B8G8R8A8_UNORM]           = ILO_GEN(  1),
      [GEN6_FORMAT_R10G10B10A2_UNORM]        = ILO_GEN(  1),
      [GEN6_FORMAT_R10G10B10A2_UINT]         = ILO_GEN(  1),
      [GEN6_FORMAT_R10G10B10_SNORM_A2_UNORM] = ILO_GEN(  1),
      [GEN6_FORMAT_R8G8B8A8_UNORM]           = ILO_GEN(  1),
      [GEN6_FORMAT_R8G8B8A8_SNORM]           = ILO_GEN(  1),
      [GEN6_FORMAT_R8G8B8A8_SINT]            = ILO_GEN(  1),
      [GEN6_FORMAT_R8G8B8A8_UINT]            = ILO_GEN(  1),
      [GEN6_FORMAT_R16G16_UNORM]             = ILO_GEN(  1),
      [GEN6_FORMAT_R16G16_SNORM]             = ILO_GEN(  1),
      [GEN6_FORMAT_R16G16_SINT]              = ILO_GEN(  1),
      [GEN6_FORMAT_R16G16_UINT]              = ILO_GEN(  1),
      [GEN6_FORMAT_R16G16_FLOAT]             = ILO_GEN(  1),
      [GEN6_FORMAT_B10G10R10A2_UNORM]        = ILO_GEN(7.5),
      [GEN6_FORMAT_R11G11B10_FLOAT]          = ILO_GEN(  1),
      [GEN6_FORMAT_R32_SINT]                 = ILO_GEN(  1),
      [GEN6_FORMAT_R32_UINT]                 = ILO_GEN(  1),
      [GEN6_FORMAT_R32_FLOAT]                = ILO_GEN(  1),
      [GEN6_FORMAT_R32_UNORM]                = ILO_GEN(  1),
      [GEN6_FORMAT_R32_SNORM]                = ILO_GEN(  1),
      [GEN6_FORMAT_R10G10B10X2_USCALED]      = ILO_GEN(  1),
      [GEN6_FORMAT_R8G8B8A8_SSCALED]         = ILO_GEN(  1),
      [GEN6_FORMAT_R8G8B8A8_USCALED]         = ILO_GEN(  1),
      [GEN6_FORMAT_R16G16_SSCALED]           = ILO_GEN(  1),
      [GEN6_FORMAT_R16G16_USCALED]           = ILO_GEN(  1),
      [GEN6_FORMAT_R32_SSCALED]              = ILO_GEN(  1),
      [GEN6_FORMAT_R32_USCALED]              = ILO_GEN(  1),
      [GEN6_FORMAT_R8G8_UNORM]               = ILO_GEN(  1),
      [GEN6_FORMAT_R8G8_SNORM]               = ILO_GEN(  1),
      [GEN6_FORMAT_R8G8_SINT]                = ILO_GEN(  1),
      [GEN6_FORMAT_R8G8_UINT]                = ILO_GEN(  1),
      [GEN6_FORMAT_R16_UNORM]                = ILO_GEN(  1),
      [GEN6_FORMAT_R16_SNORM]                = ILO_GEN(  1),
      [GEN6_FORMAT_R16_SINT]                 = ILO_GEN(  1),
      [GEN6_FORMAT_R16_UINT]                 = ILO_GEN(  1),
      [GEN6_FORMAT_R16_FLOAT]                = ILO_GEN(  1),
      [GEN6_FORMAT_R8G8_SSCALED]             = ILO_GEN(  1),
      [GEN6_FORMAT_R8G8_USCALED]             = ILO_GEN(  1),
      [GEN6_FORMAT_R16_SSCALED]              = ILO_GEN(  1),
      [GEN6_FORMAT_R16_USCALED]              = ILO_GEN(  1),
      [GEN6_FORMAT_R8_UNORM]                 = ILO_GEN(  1),
      [GEN6_FORMAT_R8_SNORM]                 = ILO_GEN(  1),
      [GEN6_FORMAT_R8_SINT]                  = ILO_GEN(  1),
      [GEN6_FORMAT_R8_UINT]                  = ILO_GEN(  1),
      [GEN6_FORMAT_R8_SSCALED]               = ILO_GEN(  1),
      [GEN6_FORMAT_R8_USCALED]               = ILO_GEN(  1),
      [GEN6_FORMAT_R8G8B8_UNORM]             = ILO_GEN(  1),
      [GEN6_FORMAT_R8G8B8_SNORM]             = ILO_GEN(  1),
      [GEN6_FORMAT_R8G8B8_SSCALED]           = ILO_GEN(  1),
      [GEN6_FORMAT_R8G8B8_USCALED]           = ILO_GEN(  1),
      [GEN6_FORMAT_R64G64B64A64_FLOAT]       = ILO_GEN(  1),
      [GEN6_FORMAT_R64G64B64_FLOAT]          = ILO_GEN(  1),
      [GEN6_FORMAT_R16G16B16_FLOAT]          = ILO_GEN(  6),
      [GEN6_FORMAT_R16G16B16_UNORM]          = ILO_GEN(  1),
      [GEN6_FORMAT_R16G16B16_SNORM]          = ILO_GEN(  1),
      [GEN6_FORMAT_R16G16B16_SSCALED]        = ILO_GEN(  1),
      [GEN6_FORMAT_R16G16B16_USCALED]        = ILO_GEN(  1),
      [GEN6_FORMAT_R16G16B16_UINT]           = ILO_GEN(7.5),
      [GEN6_FORMAT_R16G16B16_SINT]           = ILO_GEN(7.5),
      [GEN6_FORMAT_R32_SFIXED]               = ILO_GEN(7.5),
      [GEN6_FORMAT_R10G10B10A2_SNORM]        = ILO_GEN(7.5),
      [GEN6_FORMAT_R10G10B10A2_USCALED]      = ILO_GEN(7.5),
      [GEN6_FORMAT_R10G10B10A2_SSCALED]      = ILO_GEN(7.5),
      [GEN6_FORMAT_R10G10B10A2_SINT]         = ILO_GEN(7.5),
      [GEN6_FORMAT_B10G10R10A2_SNORM]        = ILO_GEN(7.5),
      [GEN6_FORMAT_B10G10R10A2_USCALED]      = ILO_GEN(7.5),
      [GEN6_FORMAT_B10G10R10A2_SSCALED]      = ILO_GEN(7.5),
      [GEN6_FORMAT_B10G10R10A2_UINT]         = ILO_GEN(7.5),
      [GEN6_FORMAT_B10G10R10A2_SINT]         = ILO_GEN(7.5),
      [GEN6_FORMAT_R8G8B8_UINT]              = ILO_GEN(7.5),
      [GEN6_FORMAT_R8G8B8_SINT]              = ILO_GEN(7.5),
   };

   ILO_DEV_ASSERT(dev, 6, 8);

   /* out-of-range or zero (unlisted) entries are invalid on every gen */
   return (format < ARRAY_SIZE(vf_element_formats) &&
           vf_element_formats[format] &&
           ilo_dev_gen(dev) >= vf_element_formats[format]);
}
+
/**
 * Initialize \p vf from \p info.  The caller-provided, zeroed info->data
 * array backs both the user VE DWords and the per-element instancing
 * DWords; user_ve and user_instancing are carved out of it back-to-back.
 */
bool
ilo_state_vf_init(struct ilo_state_vf *vf,
                  const struct ilo_dev *dev,
                  const struct ilo_state_vf_info *info)
{
   bool ret = true;

   assert(ilo_is_zeroed(vf, sizeof(*vf)));
   assert(ilo_is_zeroed(info->data, info->data_size));

   /* the caller must size info->data via ilo_state_vf_data_size() */
   assert(ilo_state_vf_data_size(dev, info->element_count) <=
         info->data_size);
   vf->user_ve = (uint32_t (*)[2]) info->data;
   vf->user_instancing =
      (uint32_t (*)[2]) (vf->user_ve + info->element_count);

   ret &= vf_set_gen6_3DSTATE_VERTEX_ELEMENTS(vf, dev, info);

   /* pre-Gen8, instancing controls live in the VB state instead */
   if (ilo_dev_gen(dev) >= ILO_GEN(8))
      ret &= vf_set_gen8_3DSTATE_VF_INSTANCING(vf, dev, info);
   else
      ret &= vf_set_gen6_vertex_buffer_state(vf, dev, info);

   ret &= ilo_state_vf_set_params(vf, dev, &info->params);

   assert(ret);

   return ret;
}
+
+bool
+ilo_state_vf_init_for_rectlist(struct ilo_state_vf *vf,
+ const struct ilo_dev *dev,
+ void *data, size_t data_size,
+ const struct ilo_state_vf_element_info *elements,
+ uint8_t element_count)
+{
+ struct ilo_state_vf_info info;
+
+ memset(&info, 0, sizeof(info));
+
+ info.data = data;
+ info.data_size = data_size;
+
+ info.elements = elements;
+ info.element_count = element_count;
+
+ /*
+ * For VUE header,
+ *
+ * DW0: Reserved: MBZ
+ * DW1: Render Target Array Index
+ * DW2: Viewport Index
+ * DW3: Point Width
+ */
+ info.params.prepend_zeros = true;
+
+ return ilo_state_vf_init(vf, dev, &info);
+}
+
/**
 * Apply draw-time parameters to an initialized \p vf: internal VEs
 * (VUE-header zeros, VertexID/InstanceID), Gen8 SGVS, edge-flag patching of
 * the last user VE, and the cut-index state.
 */
bool
ilo_state_vf_set_params(struct ilo_state_vf *vf,
                        const struct ilo_dev *dev,
                        const struct ilo_state_vf_params_info *params)
{
   bool ret = true;

   ILO_DEV_ASSERT(dev, 6, 8);

   ret &= vf_params_set_gen6_internal_ve(vf, dev, params, vf->user_ve_count);
   if (ilo_dev_gen(dev) >= ILO_GEN(8))
      ret &= vf_params_set_gen8_3DSTATE_VF_SGVS(vf, dev, params);

   /*
    * From the Sandy Bridge PRM, volume 2 part 1, page 94:
    *
    *     "Edge flags are supported for the following primitive topology types
    *      only, otherwise EdgeFlagEnable must not be ENABLED.
    *
    *      - 3DPRIM_TRILIST*
    *      - 3DPRIM_TRISTRIP*
    *      - 3DPRIM_TRIFAN*
    *      - 3DPRIM_POLYGON"
    *
    *     "[DevSNB]: Edge Flags are not supported for QUADLIST primitives.
    *      Software may elect to convert QUADLIST primitives to some set of
    *      corresponding edge-flag-supported primitive types (e.g., POLYGONs)
    *      prior to submission to the 3D vf."
    *
    * From the Ivy Bridge PRM, volume 2 part 1, page 86:
    *
    *     "Edge flags are supported for all primitive topology types."
    *
    * Both PRMs are confusing...
    */
   if (params->last_element_edge_flag) {
      assert(vf->edge_flag_supported);
      if (ilo_dev_gen(dev) == ILO_GEN(6))
         assert(params->cv_topology != GEN6_3DPRIM_QUADLIST);
   }

   /* swap in the edge-flag or plain variant precomputed at init time */
   if (vf->edge_flag_supported) {
      assert(vf->user_ve_count);
      memcpy(vf->user_ve[vf->user_ve_count - 1],
            vf->last_user_ve[params->last_element_edge_flag],
            sizeof(vf->user_ve[vf->user_ve_count - 1]));
   }

   if (ilo_dev_gen(dev) >= ILO_GEN(7.5))
      ret &= vf_params_set_gen75_3DSTATE_VF(vf, dev, params);
   else
      ret &= vf_params_set_gen6_3dstate_index_buffer(vf, dev, params);

   assert(ret);

   return ret;
}
+
+void
+ilo_state_vf_full_delta(const struct ilo_state_vf *vf,
+ const struct ilo_dev *dev,
+ struct ilo_state_vf_delta *delta)
+{
+ delta->dirty = ILO_STATE_VF_3DSTATE_VERTEX_ELEMENTS;
+
+ if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
+ delta->dirty |= ILO_STATE_VF_3DSTATE_VF_SGVS |
+ ILO_STATE_VF_3DSTATE_VF_INSTANCING;
+ } else {
+ delta->dirty |= ILO_STATE_VF_3DSTATE_VERTEX_BUFFERS;
+ }
+
+ if (ilo_dev_gen(dev) >= ILO_GEN(7.5))
+ delta->dirty |= ILO_STATE_VF_3DSTATE_VF;
+ else
+ delta->dirty |= ILO_STATE_VF_3DSTATE_INDEX_BUFFER;
+}
+
/**
 * Compute the dirty bits needed to go from VF state \p old to \p vf.
 * user_ve/user_instancing live in caller-provided storage, so the two
 * states must not share it (asserted below).
 */
void
ilo_state_vf_get_delta(const struct ilo_state_vf *vf,
                       const struct ilo_dev *dev,
                       const struct ilo_state_vf *old,
                       struct ilo_state_vf_delta *delta)
{
   /* no shallow copying */
   assert(vf->user_ve != old->user_ve &&
          vf->user_instancing != old->user_instancing);

   delta->dirty = 0;

   /* internal and user VEs are emitted by one 3DSTATE_VERTEX_ELEMENTS */
   if (vf->internal_ve_count != old->internal_ve_count ||
       vf->user_ve_count != old->user_ve_count ||
       memcmp(vf->internal_ve, old->internal_ve,
          sizeof(vf->internal_ve[0]) * vf->internal_ve_count) ||
       memcmp(vf->user_ve, old->user_ve,
          sizeof(vf->user_ve[0]) * vf->user_ve_count))
      delta->dirty |= ILO_STATE_VF_3DSTATE_VERTEX_ELEMENTS;

   /* instancing data is carried by different commands per gen */
   if (vf->user_ve_count != old->user_ve_count ||
       memcmp(vf->user_instancing, old->user_instancing,
          sizeof(vf->user_instancing[0]) * vf->user_ve_count)) {
      if (ilo_dev_gen(dev) >= ILO_GEN(8))
         delta->dirty |= ILO_STATE_VF_3DSTATE_VF_INSTANCING;
      else
         delta->dirty |= ILO_STATE_VF_3DSTATE_VERTEX_BUFFERS;
   }

   if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
      if (vf->sgvs[0] != old->sgvs[0])
         delta->dirty |= ILO_STATE_VF_3DSTATE_VF_SGVS;
   }

   /* cut[1] (the cut index value) exists only on Gen7.5+ */
   if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) {
      if (memcmp(vf->cut, old->cut, sizeof(vf->cut)))
         delta->dirty |= ILO_STATE_VF_3DSTATE_VF;
   } else {
      if (vf->cut[0] != old->cut[0])
         delta->dirty |= ILO_STATE_VF_3DSTATE_INDEX_BUFFER;
   }
}
+
/**
 * No need to initialize first.
 */
bool
ilo_state_vertex_buffer_set_info(struct ilo_state_vertex_buffer *vb,
                                 const struct ilo_dev *dev,
                                 const struct ilo_state_vertex_buffer_info *info)
{
   const bool ok = vertex_buffer_set_gen8_vertex_buffer_state(vb, dev, info);

   assert(ok);

   return ok;
}
+
/**
 * No need to initialize first.
 */
bool
ilo_state_index_buffer_set_info(struct ilo_state_index_buffer *ib,
                                const struct ilo_dev *dev,
                                const struct ilo_state_index_buffer_info *info)
{
   const bool ok = index_buffer_set_gen8_3DSTATE_INDEX_BUFFER(ib, dev, info);

   assert(ok);

   return ok;
}
diff --git a/src/gallium/drivers/ilo/core/ilo_state_vf.h b/src/gallium/drivers/ilo/core/ilo_state_vf.h
new file mode 100644
index 00000000000..f15c63a248a
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_vf.h
@@ -0,0 +1,228 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Chia-I Wu <[email protected]>
+ */
+
+#ifndef ILO_STATE_VF_H
+#define ILO_STATE_VF_H
+
+#include "genhw/genhw.h"
+
+#include "ilo_core.h"
+#include "ilo_dev.h"
+
+/*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 93:
+ *
+ * "Up to 34 (DevSNB+) vertex elements are supported."
+ *
+ * "Up to 33 VBs are supported"
+ *
+ * Reserve two VEs and one VB for internal use.
+ */
+#define ILO_STATE_VF_MAX_ELEMENT_COUNT (34 - 2)
+#define ILO_STATE_VF_MAX_BUFFER_COUNT (33 - 1)
+
+enum ilo_state_vf_dirty_bits {
+ ILO_STATE_VF_3DSTATE_VERTEX_ELEMENTS = (1 << 0),
+ ILO_STATE_VF_3DSTATE_VF_SGVS = (1 << 1),
+ ILO_STATE_VF_3DSTATE_VF_INSTANCING = (1 << 2),
+ ILO_STATE_VF_3DSTATE_VERTEX_BUFFERS = (1 << 3),
+ ILO_STATE_VF_3DSTATE_VF = (1 << 4),
+ ILO_STATE_VF_3DSTATE_INDEX_BUFFER = (1 << 5),
+};
+
+/**
+ * Fetch a 128-bit vertex attribute.
+ */
+struct ilo_state_vf_element_info {
+   /* source vertex buffer index and byte offset of the attribute */
+   uint8_t buffer;
+   uint16_t vertex_offset;
+   enum gen_surface_format format;
+
+   /* NOTE(review): presumably derived from `format` by the caller — confirm */
+   uint8_t format_size;
+   uint8_t component_count;
+   bool is_integer;
+
+   /* must be the same for elements that share the same buffer before Gen8 */
+   bool instancing_enable;
+   uint32_t instancing_step_rate;
+};
+
+/**
+ * VF parameters.
+ *
+ * Fields prefixed with "cv_" are current values needed only for
+ * cross-validation against other states.
+ */
+struct ilo_state_vf_params_info {
+   enum gen_3dprim_type cv_topology;
+
+   /* prepend an attribute of zeros */
+   bool prepend_zeros;
+
+   /* prepend an attribute of VertexID and/or InstanceID */
+   bool prepend_vertexid;
+   bool prepend_instanceid;
+
+   bool last_element_edge_flag;
+
+   /* primitive-restart (cut index) control */
+   enum gen_index_format cv_index_format;
+   bool cut_index_enable;
+   uint32_t cut_index;
+};
+
+/**
+ * Vertex fetch.
+ */
+struct ilo_state_vf_info {
+   /* caller-provided storage for the VE dwords; sized by
+    * ilo_state_vf_data_size() */
+   void *data;
+   size_t data_size;
+
+   const struct ilo_state_vf_element_info *elements;
+   uint8_t element_count;
+
+   struct ilo_state_vf_params_info params;
+};
+
+struct ilo_state_vf {
+   /* per-element VERTEX_ELEMENT_STATE and instancing dword pairs;
+    * NOTE(review): presumably point into the caller-provided data
+    * buffer — confirm against ilo_state_vf_data_size() */
+   uint32_t (*user_ve)[2];
+   uint32_t (*user_instancing)[2];
+   int8_t vb_to_first_elem[ILO_STATE_VF_MAX_BUFFER_COUNT];
+   uint8_t user_ve_count;
+
+   bool edge_flag_supported;
+   /* two variants of the last user VE */
+   uint32_t last_user_ve[2][2];
+
+   /* two VEs are reserved for internal use */
+   uint32_t internal_ve[2][2];
+   uint8_t internal_ve_count;
+
+   /* Gen8+ 3DSTATE_VF_SGVS dword */
+   uint32_t sgvs[1];
+
+   /* cut index dwords; see ilo_state_vf_get_delta() for per-gen use */
+   uint32_t cut[2];
+};
+
+struct ilo_state_vf_delta {
+ uint32_t dirty;
+};
+
+struct ilo_buffer;
+
+struct ilo_state_vertex_buffer_info {
+   const struct ilo_buffer *buf;
+   /* byte offset into, and accessible size of, the buffer */
+   uint32_t offset;
+   uint32_t size;
+
+   /* distance in bytes between two consecutive vertices */
+   uint16_t stride;
+
+   /* doubles must be at 64-bit aligned addresses */
+   bool cv_has_double;
+   uint8_t cv_double_vertex_offset_mod_8;
+};
+
+struct ilo_state_vertex_buffer {
+   /* 3DSTATE_VERTEX_BUFFERS body dwords */
+   uint32_t vb[3];
+
+   bool need_bo;
+
+   /* managed by users */
+   struct intel_bo *bo;
+};
+
+struct ilo_state_index_buffer_info {
+   const struct ilo_buffer *buf;
+   /* byte offset into, and accessible size of, the buffer */
+   uint32_t offset;
+   uint32_t size;
+
+   enum gen_index_format format;
+};
+
+struct ilo_state_index_buffer {
+   /* 3DSTATE_INDEX_BUFFER body dwords */
+   uint32_t ib[3];
+
+   bool need_bo;
+
+   /* managed by users */
+   struct intel_bo *bo;
+};
+
+/**
+ * Return the size in bytes of the caller-provided storage needed for
+ * \p element_count user vertex elements: one VE dword pair plus one
+ * instancing dword pair per element.
+ */
+static inline size_t
+ilo_state_vf_data_size(const struct ilo_dev *dev, uint8_t element_count)
+{
+   /* never dereferenced; only used to get the member sizes */
+   const struct ilo_state_vf *vf = NULL;
+   return (sizeof(vf->user_ve[0]) +
+           sizeof(vf->user_instancing[0])) * element_count;
+}
+
+bool
+ilo_state_vf_valid_element_format(const struct ilo_dev *dev,
+ enum gen_surface_format format);
+
+bool
+ilo_state_vf_init(struct ilo_state_vf *vf,
+ const struct ilo_dev *dev,
+ const struct ilo_state_vf_info *info);
+
+bool
+ilo_state_vf_init_for_rectlist(struct ilo_state_vf *vf,
+ const struct ilo_dev *dev,
+ void *data, size_t data_size,
+ const struct ilo_state_vf_element_info *elements,
+ uint8_t element_count);
+
+bool
+ilo_state_vf_set_params(struct ilo_state_vf *vf,
+ const struct ilo_dev *dev,
+ const struct ilo_state_vf_params_info *params);
+
+/**
+ * Return the number of attributes in the VUE: the internally-generated
+ * VEs plus the user-specified VEs.
+ */
+static inline uint8_t
+ilo_state_vf_get_attr_count(const struct ilo_state_vf *vf)
+{
+   return vf->internal_ve_count + vf->user_ve_count;
+}
+
+void
+ilo_state_vf_full_delta(const struct ilo_state_vf *vf,
+ const struct ilo_dev *dev,
+ struct ilo_state_vf_delta *delta);
+
+void
+ilo_state_vf_get_delta(const struct ilo_state_vf *vf,
+ const struct ilo_dev *dev,
+ const struct ilo_state_vf *old,
+ struct ilo_state_vf_delta *delta);
+
+bool
+ilo_state_vertex_buffer_set_info(struct ilo_state_vertex_buffer *vb,
+ const struct ilo_dev *dev,
+ const struct ilo_state_vertex_buffer_info *info);
+
+bool
+ilo_state_index_buffer_set_info(struct ilo_state_index_buffer *ib,
+ const struct ilo_dev *dev,
+ const struct ilo_state_index_buffer_info *info);
+
+#endif /* ILO_STATE_VF_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_state_viewport.c b/src/gallium/drivers/ilo/core/ilo_state_viewport.c
new file mode 100644
index 00000000000..aae57334541
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_viewport.c
@@ -0,0 +1,378 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2012-2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Chia-I Wu <[email protected]>
+ */
+
+#include "ilo_debug.h"
+#include "ilo_state_viewport.h"
+
+/**
+ * Compute the guardband, in NDC space, for a viewport transform.
+ */
+static void
+viewport_matrix_get_gen6_guardband(const struct ilo_dev *dev,
+                                   const struct ilo_state_viewport_matrix_info *mat,
+                                   float *min_gbx, float *max_gbx,
+                                   float *min_gby, float *max_gby)
+{
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 234:
+    *
+    *   "Per-Device Guardband Extents
+    *
+    *      - Supported X,Y ScreenSpace "Guardband" Extent: [-16K,16K-1]
+    *      - Maximum Post-Clamp Delta (X or Y): 16K"
+    *
+    *   "In addition, in order to be correctly rendered, objects must have a
+    *    screenspace bounding box not exceeding 8K in the X or Y direction.
+    *    This additional restriction must also be comprehended by software,
+    *    i.e., enforced by use of clipping."
+    *
+    * From the Ivy Bridge PRM, volume 2 part 1, page 248:
+    *
+    *   "Per-Device Guardband Extents
+    *
+    *      - Supported X,Y ScreenSpace "Guardband" Extent: [-32K,32K-1]
+    *      - Maximum Post-Clamp Delta (X or Y): N/A"
+    *
+    *   "In addition, in order to be correctly rendered, objects must have a
+    *    screenspace bounding box not exceeding 8K in the X or Y direction.
+    *    This additional restriction must also be comprehended by software,
+    *    i.e., enforced by use of clipping."
+    *
+    * Combined, the bounding box of any object can not exceed 8K in both
+    * width and height.
+    *
+    * Below we set the guardband as a square of length 8K, centered at where
+    * the viewport is.  This makes sure all objects passing the GB test are
+    * valid to the renderer, and those failing the XY clipping have a
+    * better chance of passing the GB test.
+    */
+   const int max_extent = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 32768 : 16384;
+   const int half_len = 8192 / 2;
+   /* center of the viewport in screen space */
+   int center_x = (int) mat->translate[0];
+   int center_y = (int) mat->translate[1];
+   float scale_x, scale_y;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* make sure the guardband is within the valid range */
+   if (center_x - half_len < -max_extent)
+      center_x = -max_extent + half_len;
+   else if (center_x + half_len > max_extent - 1)
+      center_x = max_extent - half_len;
+
+   if (center_y - half_len < -max_extent)
+      center_y = -max_extent + half_len;
+   else if (center_y + half_len > max_extent - 1)
+      center_y = max_extent - half_len;
+
+   scale_x = fabsf(mat->scale[0]);
+   scale_y = fabsf(mat->scale[1]);
+   /*
+    * From the Haswell PRM, volume 2d, page 292-293:
+    *
+    *   "Note: Minimum allowed value for this field (X/Y Min Clip Guardband)
+    *    is -16384."
+    *
+    *   "Note: Maximum allowed value for this field (X/Y Max Clip Guardband)
+    *    is 16383."
+    *
+    * Avoid small scales.
+    */
+   if (scale_x < 1.0f)
+      scale_x = 1.0f;
+   if (scale_y < 1.0f)
+      scale_y = 1.0f;
+
+   /* in NDC space: invert the viewport transform on the clamped extents */
+   *min_gbx = ((float) (center_x - half_len) - mat->translate[0]) / scale_x;
+   *max_gbx = ((float) (center_x + half_len) - mat->translate[0]) / scale_x;
+   *min_gby = ((float) (center_y - half_len) - mat->translate[1]) / scale_y;
+   *max_gby = ((float) (center_y + half_len) - mat->translate[1]) / scale_y;
+}
+
+/**
+ * Return the screen-space extent of one axis of a viewport transform:
+ * the image of NDC [-1, 1] under scale-and-translate.
+ */
+static void
+viewport_matrix_get_extent(const struct ilo_state_viewport_matrix_info *mat,
+                           int axis, float *min, float *max)
+{
+   const float scale_abs = fabsf(mat->scale[axis]);
+
+   *min = -1.0f * scale_abs + mat->translate[axis];
+   *max =  1.0f * scale_abs + mat->translate[axis];
+}
+
+/**
+ * Fill out the SF_CLIP_VIEWPORT state array from the viewport matrices.
+ */
+static bool
+viewport_matrix_set_gen7_SF_CLIP_VIEWPORT(struct ilo_state_viewport *vp,
+                                          const struct ilo_dev *dev,
+                                          const struct ilo_state_viewport_matrix_info *matrices,
+                                          uint8_t count)
+{
+   uint8_t i;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   for (i = 0; i < count; i++) {
+      const struct ilo_state_viewport_matrix_info *mat = &matrices[i];
+      float min_gbx, max_gbx, min_gby, max_gby;
+      uint32_t dw[16];
+
+      viewport_matrix_get_gen6_guardband(dev, mat,
+            &min_gbx, &max_gbx, &min_gby, &max_gby);
+
+      /* dw0-5: viewport transform scale and translate */
+      dw[0] = fui(mat->scale[0]);
+      dw[1] = fui(mat->scale[1]);
+      dw[2] = fui(mat->scale[2]);
+      dw[3] = fui(mat->translate[0]);
+      dw[4] = fui(mat->translate[1]);
+      dw[5] = fui(mat->translate[2]);
+      dw[6] = 0;
+      dw[7] = 0;
+
+      /* dw8-11: guardband extents in NDC */
+      dw[8] = fui(min_gbx);
+      dw[9] = fui(max_gbx);
+      dw[10] = fui(min_gby);
+      dw[11] = fui(max_gby);
+
+      if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
+         float min_x, max_x, min_y, max_y;
+
+         viewport_matrix_get_extent(mat, 0, &min_x, &max_x);
+         viewport_matrix_get_extent(mat, 1, &min_y, &max_y);
+
+         /* dw12-15: Gen8+ XY min/max viewport extents (max is inclusive) */
+         dw[12] = fui(min_x);
+         dw[13] = fui(max_x - 1.0f);
+         dw[14] = fui(min_y);
+         dw[15] = fui(max_y - 1.0f);
+      } else {
+         dw[12] = 0;
+         dw[13] = 0;
+         dw[14] = 0;
+         dw[15] = 0;
+      }
+
+      STATIC_ASSERT(ARRAY_SIZE(vp->sf_clip[i]) >= 16);
+      memcpy(vp->sf_clip[i], dw, sizeof(dw));
+   }
+
+   return true;
+}
+
+/**
+ * Fill out the CC_VIEWPORT state array (depth range) from the viewport
+ * matrices.
+ */
+static bool
+viewport_matrix_set_gen6_CC_VIEWPORT(struct ilo_state_viewport *vp,
+                                     const struct ilo_dev *dev,
+                                     const struct ilo_state_viewport_matrix_info *matrices,
+                                     uint8_t count)
+{
+   uint8_t i;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   for (i = 0; i < count; i++) {
+      const struct ilo_state_viewport_matrix_info *mat = &matrices[i];
+      float min_z, max_z;
+
+      /* axis 2 is the Z axis */
+      viewport_matrix_get_extent(mat, 2, &min_z, &max_z);
+
+      STATIC_ASSERT(ARRAY_SIZE(vp->cc[i]) >= 2);
+      vp->cc[i][0] = fui(min_z);
+      vp->cc[i][1] = fui(max_z);
+   }
+
+   return true;
+}
+
+/**
+ * Fill out the SCISSOR_RECT state array, clamping each rectangle to the
+ * maximum render target size of the generation.
+ */
+static bool
+viewport_scissor_set_gen6_SCISSOR_RECT(struct ilo_state_viewport *vp,
+                                       const struct ilo_dev *dev,
+                                       const struct ilo_state_viewport_scissor_info *scissors,
+                                       uint8_t count)
+{
+   const uint16_t max_size = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 16384 : 8192;
+   uint8_t i;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   for (i = 0; i < count; i++) {
+      const struct ilo_state_viewport_scissor_info *scissor = &scissors[i];
+      uint16_t min_x, min_y, max_x, max_y;
+      uint32_t dw0, dw1;
+
+      /* coordinates are all inclusive; clamp to the last valid pixel */
+      min_x = (scissor->min_x < max_size) ? scissor->min_x : max_size - 1;
+      min_y = (scissor->min_y < max_size) ? scissor->min_y : max_size - 1;
+      max_x = (scissor->max_x < max_size) ? scissor->max_x : max_size - 1;
+      max_y = (scissor->max_y < max_size) ? scissor->max_y : max_size - 1;
+
+      dw0 = min_y << GEN6_SCISSOR_DW0_MIN_Y__SHIFT |
+            min_x << GEN6_SCISSOR_DW0_MIN_X__SHIFT;
+      dw1 = max_y << GEN6_SCISSOR_DW1_MAX_Y__SHIFT |
+            max_x << GEN6_SCISSOR_DW1_MAX_X__SHIFT;
+
+      STATIC_ASSERT(ARRAY_SIZE(vp->scissor[i]) >= 2);
+      vp->scissor[i][0] = dw0;
+      vp->scissor[i][1] = dw1;
+   }
+
+   return true;
+}
+
+/**
+ * Initialize the viewport state.  \p info->data provides the storage for
+ * the HW state arrays and must be zeroed and sized by
+ * ilo_state_viewport_data_size().
+ */
+bool
+ilo_state_viewport_init(struct ilo_state_viewport *vp,
+                        const struct ilo_dev *dev,
+                        const struct ilo_state_viewport_info *info)
+{
+   const size_t elem_size = ilo_state_viewport_data_size(dev, 1);
+
+   assert(ilo_is_zeroed(vp, sizeof(*vp)));
+   assert(ilo_is_zeroed(info->data, info->data_size));
+
+   vp->data = info->data;
+
+   /* the data buffer bounds how many viewports can be stored */
+   if (info->data_size / elem_size < ILO_STATE_VIEWPORT_MAX_COUNT)
+      vp->array_size = info->data_size / elem_size;
+   else
+      vp->array_size = ILO_STATE_VIEWPORT_MAX_COUNT;
+
+   return ilo_state_viewport_set_params(vp, dev, &info->params, false);
+}
+
+/**
+ * Initialize the viewport state with storage only; no matrices or
+ * scissors are set (params are zeroed, count is 0).
+ */
+bool
+ilo_state_viewport_init_data_only(struct ilo_state_viewport *vp,
+                                  const struct ilo_dev *dev,
+                                  void *data, size_t data_size)
+{
+   struct ilo_state_viewport_info info;
+
+   memset(&info, 0, sizeof(info));
+   info.data = data;
+   info.data_size = data_size;
+
+   return ilo_state_viewport_init(vp, dev, &info);
+}
+
+/**
+ * Initialize the viewport state for internal RECTLIST draws: a single
+ * identity viewport transform and a zero-sized scissor.
+ */
+bool
+ilo_state_viewport_init_for_rectlist(struct ilo_state_viewport *vp,
+                                     const struct ilo_dev *dev,
+                                     void *data, size_t data_size)
+{
+   struct ilo_state_viewport_info info;
+   struct ilo_state_viewport_matrix_info mat;
+   struct ilo_state_viewport_scissor_info sci;
+
+   memset(&info, 0, sizeof(info));
+   memset(&mat, 0, sizeof(mat));
+   memset(&sci, 0, sizeof(sci));
+
+   info.data = data;
+   info.data_size = data_size;
+   info.params.matrices = &mat;
+   info.params.scissors = &sci;
+   info.params.count = 1;
+
+   /* identity transform (translate is zeroed above) */
+   mat.scale[0] = 1.0f;
+   mat.scale[1] = 1.0f;
+   mat.scale[2] = 1.0f;
+
+   return ilo_state_viewport_init(vp, dev, &info);
+}
+
+/**
+ * Set the viewport count and carve the caller-provided data buffer into
+ * the three state arrays, laid out back to back: sf_clip, then cc, then
+ * scissor.
+ */
+static void
+viewport_set_count(struct ilo_state_viewport *vp,
+                   const struct ilo_dev *dev,
+                   uint8_t count)
+{
+   assert(count <= vp->array_size);
+
+   vp->count = count;
+   vp->sf_clip = (uint32_t (*)[16]) vp->data;
+   vp->cc      = (uint32_t (*)[ 2]) (vp->sf_clip + count);
+   vp->scissor = (uint32_t (*)[ 2]) (vp->cc + count);
+}
+
+/**
+ * Set the viewport matrices and scissors.  When \p scissors_only is set,
+ * the viewport count must not change and only the scissors are updated.
+ */
+bool
+ilo_state_viewport_set_params(struct ilo_state_viewport *vp,
+                              const struct ilo_dev *dev,
+                              const struct ilo_state_viewport_params_info *params,
+                              bool scissors_only)
+{
+   bool ret = true;
+
+   if (scissors_only) {
+      assert(vp->count == params->count);
+
+      ret &= viewport_scissor_set_gen6_SCISSOR_RECT(vp, dev,
+            params->scissors, params->count);
+   } else {
+      /* re-carve the data buffer, then rebuild all three state arrays */
+      viewport_set_count(vp, dev, params->count);
+
+      ret &= viewport_matrix_set_gen7_SF_CLIP_VIEWPORT(vp, dev,
+            params->matrices, params->count);
+      ret &= viewport_matrix_set_gen6_CC_VIEWPORT(vp, dev,
+            params->matrices, params->count);
+      ret &= viewport_scissor_set_gen6_SCISSOR_RECT(vp, dev,
+            params->scissors, params->count);
+   }
+
+   /* failures indicate invalid params and are programming errors */
+   assert(ret);
+
+   return ret;
+}
+
+/**
+ * Mark all viewport states dirty, forcing them all to be re-emitted.
+ */
+void
+ilo_state_viewport_full_delta(const struct ilo_state_viewport *vp,
+                              const struct ilo_dev *dev,
+                              struct ilo_state_viewport_delta *delta)
+{
+   delta->dirty = ILO_STATE_VIEWPORT_SF_CLIP_VIEWPORT |
+                  ILO_STATE_VIEWPORT_CC_VIEWPORT |
+                  ILO_STATE_VIEWPORT_SCISSOR_RECT;
+}
+
+/**
+ * Compare \p vp against \p old and set the dirty bits for the state
+ * arrays that changed.
+ */
+void
+ilo_state_viewport_get_delta(const struct ilo_state_viewport *vp,
+                             const struct ilo_dev *dev,
+                             const struct ilo_state_viewport *old,
+                             struct ilo_state_viewport_delta *delta)
+{
+   const size_t sf_clip_size = sizeof(vp->sf_clip[0]) * vp->count;
+   const size_t cc_size = sizeof(vp->cc[0]) * vp->count;
+   const size_t scissor_size = sizeof(vp->scissor[0]) * vp->count;
+
+   /* no shallow copying */
+   assert(vp->data != old->data);
+
+   /* different counts mean different array layouts; re-emit everything */
+   if (vp->count != old->count) {
+      ilo_state_viewport_full_delta(vp, dev, delta);
+      return;
+   }
+
+   delta->dirty = 0;
+
+   if (memcmp(vp->sf_clip, old->sf_clip, sf_clip_size))
+      delta->dirty |= ILO_STATE_VIEWPORT_SF_CLIP_VIEWPORT;
+
+   if (memcmp(vp->cc, old->cc, cc_size))
+      delta->dirty |= ILO_STATE_VIEWPORT_CC_VIEWPORT;
+
+   if (memcmp(vp->scissor, old->scissor, scissor_size))
+      delta->dirty |= ILO_STATE_VIEWPORT_SCISSOR_RECT;
+}
diff --git a/src/gallium/drivers/ilo/core/ilo_state_viewport.h b/src/gallium/drivers/ilo/core/ilo_state_viewport.h
new file mode 100644
index 00000000000..b42ad6571da
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_viewport.h
@@ -0,0 +1,132 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Chia-I Wu <[email protected]>
+ */
+
+#ifndef ILO_STATE_VIEWPORT_H
+#define ILO_STATE_VIEWPORT_H
+
+#include "genhw/genhw.h"
+
+#include "ilo_core.h"
+#include "ilo_dev.h"
+
+/*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 38:
+ *
+ * "... 16 sets of viewport (VP) state parameters in the Clip unit's
+ * VertexClipTest function and in the SF unit's ViewportMapping and
+ * Scissor functions."
+ */
+#define ILO_STATE_VIEWPORT_MAX_COUNT 16
+
+enum ilo_state_viewport_dirty_bits {
+ ILO_STATE_VIEWPORT_SF_CLIP_VIEWPORT = (1 << 0),
+ ILO_STATE_VIEWPORT_CC_VIEWPORT = (1 << 1),
+ ILO_STATE_VIEWPORT_SCISSOR_RECT = (1 << 2),
+};
+
+struct ilo_state_viewport_matrix_info {
+   /* NDC-to-screen scale and translate; indices 0/1/2 are X/Y/Z */
+   float scale[3];
+   float translate[3];
+};
+
+struct ilo_state_viewport_scissor_info {
+   /* all inclusive */
+   uint16_t min_x;
+   uint16_t min_y;
+   uint16_t max_x;
+   uint16_t max_y;
+};
+
+struct ilo_state_viewport_params_info {
+   /* both arrays have `count` entries */
+   const struct ilo_state_viewport_matrix_info *matrices;
+   const struct ilo_state_viewport_scissor_info *scissors;
+   uint8_t count;
+};
+
+struct ilo_state_viewport_info {
+   /* caller-provided storage for the HW state arrays; sized by
+    * ilo_state_viewport_data_size() */
+   void *data;
+   size_t data_size;
+
+   struct ilo_state_viewport_params_info params;
+};
+
+struct ilo_state_viewport {
+   /* caller-provided storage and how many viewports it can hold */
+   void *data;
+   uint8_t array_size;
+
+   /* the arrays below point into `data`; see viewport_set_count() */
+   uint8_t count;
+   uint32_t (*sf_clip)[16];
+   uint32_t (*cc)[2];
+   uint32_t (*scissor)[2];
+};
+
+struct ilo_state_viewport_delta {
+   /* mask of ilo_state_viewport_dirty_bits */
+   uint32_t dirty;
+};
+
+/**
+ * Return the size in bytes of the caller-provided storage needed for
+ * \p array_size viewports (one SF_CLIP, CC, and SCISSOR entry each).
+ */
+static inline size_t
+ilo_state_viewport_data_size(const struct ilo_dev *dev, uint8_t array_size)
+{
+   /* never dereferenced; only used to get the member sizes */
+   const struct ilo_state_viewport *vp = NULL;
+   return (sizeof(vp->sf_clip[0]) +
+           sizeof(vp->cc[0]) +
+           sizeof(vp->scissor[0])) * array_size;
+}
+
+bool
+ilo_state_viewport_init(struct ilo_state_viewport *vp,
+ const struct ilo_dev *dev,
+ const struct ilo_state_viewport_info *info);
+
+bool
+ilo_state_viewport_init_data_only(struct ilo_state_viewport *vp,
+ const struct ilo_dev *dev,
+ void *data, size_t data_size);
+
+bool
+ilo_state_viewport_init_for_rectlist(struct ilo_state_viewport *vp,
+ const struct ilo_dev *dev,
+ void *data, size_t data_size);
+
+bool
+ilo_state_viewport_set_params(struct ilo_state_viewport *vp,
+ const struct ilo_dev *dev,
+ const struct ilo_state_viewport_params_info *params,
+ bool scissors_only);
+
+void
+ilo_state_viewport_full_delta(const struct ilo_state_viewport *vp,
+ const struct ilo_dev *dev,
+ struct ilo_state_viewport_delta *delta);
+
+void
+ilo_state_viewport_get_delta(const struct ilo_state_viewport *vp,
+ const struct ilo_dev *dev,
+ const struct ilo_state_viewport *old,
+ struct ilo_state_viewport_delta *delta);
+
+#endif /* ILO_STATE_VIEWPORT_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_state_zs.c b/src/gallium/drivers/ilo/core/ilo_state_zs.c
new file mode 100644
index 00000000000..901fedb5599
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_zs.c
@@ -0,0 +1,727 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2012-2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Chia-I Wu <[email protected]>
+ */
+
+#include "intel_winsys.h"
+
+#include "ilo_debug.h"
+#include "ilo_image.h"
+#include "ilo_state_zs.h"
+
+/**
+ * Fill out a null 3DSTATE_DEPTH_BUFFER: SURFTYPE_NULL with no BO.
+ */
+static bool
+zs_set_gen6_null_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs,
+                                      const struct ilo_dev *dev)
+{
+   const enum gen_depth_format format = GEN6_ZFORMAT_D32_FLOAT;
+   uint32_t dw1;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* the dword layout differs between Gen6 and Gen7+ */
+   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+      dw1 = GEN6_SURFTYPE_NULL << GEN7_DEPTH_DW1_TYPE__SHIFT |
+            format << GEN7_DEPTH_DW1_FORMAT__SHIFT;
+   } else {
+      dw1 = GEN6_SURFTYPE_NULL << GEN6_DEPTH_DW1_TYPE__SHIFT |
+            GEN6_TILING_Y << GEN6_DEPTH_DW1_TILING__SHIFT |
+            format << GEN6_DEPTH_DW1_FORMAT__SHIFT;
+   }
+
+   STATIC_ASSERT(ARRAY_SIZE(zs->depth) >= 5);
+   zs->depth[0] = dw1;
+   zs->depth[1] = 0;
+   zs->depth[2] = 0;
+   zs->depth[3] = 0;
+   zs->depth[4] = 0;
+
+   zs->depth_format = format;
+
+   return true;
+}
+
+/**
+ * Map a pipe texture target to the HW surface type.  Cube maps are
+ * treated as 2D here; callers special-case them separately.
+ */
+static enum gen_surface_type
+get_gen6_surface_type(const struct ilo_dev *dev, const struct ilo_image *img)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   switch (img->target) {
+   case PIPE_TEXTURE_1D:
+   case PIPE_TEXTURE_1D_ARRAY:
+      return GEN6_SURFTYPE_1D;
+   case PIPE_TEXTURE_2D:
+   case PIPE_TEXTURE_CUBE:
+   case PIPE_TEXTURE_RECT:
+   case PIPE_TEXTURE_2D_ARRAY:
+   case PIPE_TEXTURE_CUBE_ARRAY:
+      return GEN6_SURFTYPE_2D;
+   case PIPE_TEXTURE_3D:
+      return GEN6_SURFTYPE_3D;
+   default:
+      assert(!"unknown texture target");
+      return GEN6_SURFTYPE_NULL;
+   }
+}
+
+/**
+ * Map a pipe depth format to the HW depth format.  On Gen7+, stencil is
+ * always in a separate buffer, so the combined depth/stencil formats are
+ * only accepted on Gen6.
+ */
+static enum gen_depth_format
+get_gen6_depth_format(const struct ilo_dev *dev, const struct ilo_image *img)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+      switch (img->format) {
+      case PIPE_FORMAT_Z32_FLOAT:
+         return GEN6_ZFORMAT_D32_FLOAT;
+      case PIPE_FORMAT_Z24X8_UNORM:
+         return GEN6_ZFORMAT_D24_UNORM_X8_UINT;
+      case PIPE_FORMAT_Z16_UNORM:
+         return GEN6_ZFORMAT_D16_UNORM;
+      default:
+         assert(!"unknown depth format");
+         return GEN6_ZFORMAT_D32_FLOAT;
+      }
+   } else {
+      switch (img->format) {
+      case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+         return GEN6_ZFORMAT_D32_FLOAT_S8X24_UINT;
+      case PIPE_FORMAT_Z32_FLOAT:
+         return GEN6_ZFORMAT_D32_FLOAT;
+      case PIPE_FORMAT_Z24_UNORM_S8_UINT:
+         return GEN6_ZFORMAT_D24_UNORM_S8_UINT;
+      case PIPE_FORMAT_Z24X8_UNORM:
+         return GEN6_ZFORMAT_D24_UNORM_X8_UINT;
+      case PIPE_FORMAT_Z16_UNORM:
+         return GEN6_ZFORMAT_D16_UNORM;
+      default:
+         assert(!"unknown depth format");
+         return GEN6_ZFORMAT_D32_FLOAT;
+      }
+   }
+}
+
+/**
+ * Sanity-check the depth/stencil info.  All checks are asserts; the
+ * function always returns true in release builds.
+ */
+static bool
+zs_validate_gen6(const struct ilo_dev *dev,
+                 const struct ilo_state_zs_info *info)
+{
+   /* at least one of z_img and s_img must be set by the caller */
+   const struct ilo_image *img = (info->z_img) ? info->z_img : info->s_img;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 1, page 315:
+    *
+    *   The stencil buffer has a format of S8_UINT, and shares Surface
+    *   Type, Height, Width, and Depth, Minimum Array Element, Render
+    *   Target View Extent, Depth Coordinate Offset X/Y, LOD, and Depth
+    *   Buffer Object Control State fields of the depth buffer.
+    */
+   if (info->z_img == info->s_img) {
+      assert(info->z_img->target == info->s_img->target &&
+             info->z_img->width0 == info->s_img->width0 &&
+             info->z_img->height0 == info->s_img->height0 &&
+             info->z_img->depth0 == info->s_img->depth0);
+   }
+
+   assert(info->level < img->level_count);
+   assert(img->bo_stride);
+
+   /* HiZ requires a depth image with the aux buffer enabled for the level */
+   if (info->hiz_enable) {
+      assert(info->z_img &&
+             ilo_image_can_enable_aux(info->z_img, info->level));
+   }
+
+   if (info->is_cube_map) {
+      assert(get_gen6_surface_type(dev, img) == GEN6_SURFTYPE_2D);
+
+      /*
+       * From the Sandy Bridge PRM, volume 2 part 1, page 323:
+       *
+       *   "For cube maps, Width must be set equal to Height."
+       */
+      assert(img->width0 == img->height0);
+   }
+
+   /* depth is Y-tiled, stencil is W-tiled */
+   if (info->z_img)
+      assert(info->z_img->tiling == GEN6_TILING_Y);
+   if (info->s_img)
+      assert(info->s_img->tiling == GEN8_TILING_W);
+
+   return true;
+}
+
+/**
+ * Return the maximum width and height of a depth/stencil buffer for the
+ * image's surface type on this generation.
+ */
+static void
+get_gen6_max_extent(const struct ilo_dev *dev,
+                    const struct ilo_image *img,
+                    uint16_t *max_w, uint16_t *max_h)
+{
+   const uint16_t max_size = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 16384 : 8192;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   switch (get_gen6_surface_type(dev, img)) {
+   case GEN6_SURFTYPE_1D:
+      *max_w = max_size;
+      *max_h = 1;
+      break;
+   case GEN6_SURFTYPE_2D:
+      *max_w = max_size;
+      *max_h = max_size;
+      break;
+   case GEN6_SURFTYPE_3D:
+      *max_w = 2048;
+      *max_h = 2048;
+      break;
+   default:
+      assert(!"invalid surface type");
+      *max_w = 1;
+      *max_h = 1;
+      break;
+   }
+}
+
+/**
+ * Return the pixel alignment the depth buffer extent must honor when HiZ
+ * is enabled, as a function of the sample count.
+ */
+static void
+get_gen6_hiz_alignments(const struct ilo_dev *dev,
+                        const struct ilo_image *img,
+                        uint16_t *align_w, uint16_t *align_h)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 313:
+    *
+    *   "A rectangle primitive representing the clear area is delivered. The
+    *    primitive must adhere to the following restrictions on size:
+    *
+    *      - If Number of Multisamples is NUMSAMPLES_1, the rectangle must be
+    *        aligned to an 8x4 pixel block relative to the upper left corner
+    *        of the depth buffer, and contain an integer number of these pixel
+    *        blocks, and all 8x4 pixels must be lit.
+    *      - If Number of Multisamples is NUMSAMPLES_4, the rectangle must be
+    *        aligned to a 4x2 pixel block (8x4 sample block) relative to the
+    *        upper left corner of the depth buffer, and contain an integer
+    *        number of these pixel blocks, and all samples of the 4x2 pixels
+    *        must be lit
+    *      - If Number of Multisamples is NUMSAMPLES_8, the rectangle must be
+    *        aligned to a 2x2 pixel block (8x4 sample block) relative to the
+    *        upper left corner of the depth buffer, and contain an integer
+    *        number of these pixel blocks, and all samples of the 2x2 pixels
+    *        must be lit."
+    *
+    * Experiments on Gen7.5 show that HiZ resolve also requires the rectangle
+    * to be aligned to 8x4 sample blocks.  But to be on the safe side, we
+    * always require a level to be aligned when HiZ is enabled.
+    *
+    * NOTE(review): the 2x and 16x entries below are not covered by the
+    * quote above; they extrapolate the 8x4-sample-block rule — confirm.
+    */
+   switch (img->sample_count) {
+   case 1:
+      *align_w = 8;
+      *align_h = 4;
+      break;
+   case 2:
+      *align_w = 4;
+      *align_h = 4;
+      break;
+   case 4:
+      *align_w = 4;
+      *align_h = 2;
+      break;
+   case 8:
+      *align_w = 2;
+      *align_h = 2;
+      break;
+   case 16:
+      *align_w = 2;
+      *align_h = 1;
+      break;
+   default:
+      assert(!"unknown sample count");
+      *align_w = 1;
+      *align_h = 1;
+      break;
+   }
+}
+
+/**
+ * Compute the Width and Height fields of 3DSTATE_DEPTH_BUFFER (both are
+ * the actual extent minus one), padding to HiZ alignment when needed.
+ */
+static bool
+zs_get_gen6_depth_extent(const struct ilo_dev *dev,
+                         const struct ilo_state_zs_info *info,
+                         uint16_t *width, uint16_t *height)
+{
+   const struct ilo_image *img = (info->z_img) ? info->z_img : info->s_img;
+   uint16_t w, h, max_w, max_h;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   w = img->width0;
+   h = img->height0;
+
+   if (info->hiz_enable) {
+      uint16_t align_w, align_h;
+
+      get_gen6_hiz_alignments(dev, info->z_img, &align_w, &align_h);
+
+      /*
+       * We want to force 8x4 alignment, but we can do so only for level 0 and
+       * only when it is padded.  ilo_image should know all these.
+       */
+      if (info->level)
+         assert(w % align_w == 0 && h % align_h == 0);
+
+      w = align(w, align_w);
+      h = align(h, align_h);
+   }
+
+   get_gen6_max_extent(dev, img, &max_w, &max_h);
+   assert(w && h && w <= max_w && h <= max_h);
+
+   /* the HW fields hold extent minus one */
+   *width = w - 1;
+   *height = h - 1;
+
+   return true;
+}
+
+/**
+ * Compute the Depth, Minimum Array Element, and Render Target View
+ * Extent fields of 3DSTATE_DEPTH_BUFFER from the requested slice range.
+ * Returns false and warns when the slice range is invalid.
+ */
+static bool
+zs_get_gen6_depth_slices(const struct ilo_dev *dev,
+                         const struct ilo_state_zs_info *info,
+                         uint16_t *depth, uint16_t *min_array_elem,
+                         uint16_t *rt_view_extent)
+{
+   const struct ilo_image *img = (info->z_img) ? info->z_img : info->s_img;
+   uint16_t max_slice, d;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 325:
+    *
+    *   "This field (Depth) specifies the total number of levels for a
+    *    volume texture or the number of array elements allowed to be
+    *    accessed starting at the Minimum Array Element for arrayed
+    *    surfaces. If the volume texture is MIP-mapped, this field specifies
+    *    the depth of the base MIP level."
+    */
+   switch (get_gen6_surface_type(dev, img)) {
+   case GEN6_SURFTYPE_1D:
+   case GEN6_SURFTYPE_2D:
+      max_slice = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 2048 : 512;
+
+      assert(img->array_size <= max_slice);
+      max_slice = img->array_size;
+
+      d = info->slice_count;
+      if (info->is_cube_map) {
+         /*
+          * Minimum Array Element and Depth must be 0; Render Target View
+          * Extent is ignored.
+          */
+         if (info->slice_base || d != 6) {
+            ilo_warn("no cube array depth buffer\n");
+            return false;
+         }
+
+         d /= 6;
+      }
+      break;
+   case GEN6_SURFTYPE_3D:
+      max_slice = 2048;
+
+      assert(img->depth0 <= max_slice);
+      /* valid slices are bounded by the depth of the LOD being rendered */
+      max_slice = u_minify(img->depth0, info->level);
+
+      /* Depth holds the depth of the base level for 3D surfaces */
+      d = img->depth0;
+      break;
+   default:
+      assert(!"invalid surface type");
+      return false;
+   }
+
+   if (!info->slice_count ||
+       info->slice_base + info->slice_count > max_slice) {
+      ilo_warn("invalid slice range\n");
+      return false;
+   }
+
+   assert(d);
+   *depth = d - 1;
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 325:
+    *
+    *   "For 1D and 2D Surfaces:
+    *    This field (Minimum Array Element) indicates the minimum array
+    *    element that can be accessed as part of this surface.  The delivered
+    *    array index is added to this field before being used to address the
+    *    surface.
+    *
+    *    For 3D Surfaces:
+    *    This field indicates the minimum `R' coordinate on the LOD
+    *    currently being rendered to.  This field is added to the delivered
+    *    array index before it is used to address the surface.
+    *
+    *    For Other Surfaces:
+    *    This field is ignored."
+    */
+   *min_array_elem = info->slice_base;
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 326:
+    *
+    *   "For 3D Surfaces:
+    *    This field (Render Target View Extent) indicates the extent of the
+    *    accessible `R' coordinates minus 1 on the LOD currently being
+    *    rendered to.
+    *
+    *    For 1D and 2D Surfaces:
+    *    This field must be set to the same value as the Depth field.
+    *
+    *    For Other Surfaces:
+    *    This field is ignored."
+    */
+   *rt_view_extent = info->slice_count - 1;
+
+   return true;
+}
+
+static bool
+zs_set_gen6_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs,
+ const struct ilo_dev *dev,
+ const struct ilo_state_zs_info *info)
+{
+ uint16_t width, height, depth, array_base, view_extent;
+ enum gen_surface_type type;
+ enum gen_depth_format format;
+ uint32_t dw1, dw2, dw3, dw4;
+
+ ILO_DEV_ASSERT(dev, 6, 6);
+
+ if (!zs_validate_gen6(dev, info) ||
+ !zs_get_gen6_depth_extent(dev, info, &width, &height) ||
+ !zs_get_gen6_depth_slices(dev, info, &depth, &array_base,
+ &view_extent))
+ return false;
+
+ type = (info->is_cube_map) ? GEN6_SURFTYPE_CUBE :
+ (info->z_img) ? get_gen6_surface_type(dev, info->z_img) :
+ get_gen6_surface_type(dev, info->s_img);
+
+ format = (info->z_img) ? get_gen6_depth_format(dev, info->z_img) :
+ GEN6_ZFORMAT_D32_FLOAT;
+
+ /*
+ * From the Ironlake PRM, volume 2 part 1, page 330:
+ *
+ * "If this field (Separate Stencil Buffer Enable) is disabled, the
+ * Surface Format of the depth buffer cannot be D24_UNORM_X8_UINT."
+ *
+ * From the Sandy Bridge PRM, volume 2 part 1, page 321:
+ *
+ * "[DevSNB]: This field (Separate Stencil Buffer Enable) must be set
+ * to the same value (enabled or disabled) as Hierarchical Depth
+ * Buffer Enable."
+ */
+ if (!info->hiz_enable && format == GEN6_ZFORMAT_D24_UNORM_X8_UINT)
+ format = GEN6_ZFORMAT_D24_UNORM_S8_UINT;
+
+ /* info->z_readonly and info->s_readonly are ignored on Gen6 */
+ dw1 = type << GEN6_DEPTH_DW1_TYPE__SHIFT |
+ GEN6_TILING_Y << GEN6_DEPTH_DW1_TILING__SHIFT |
+ format << GEN6_DEPTH_DW1_FORMAT__SHIFT;
+
+ if (info->z_img)
+ dw1 |= (info->z_img->bo_stride - 1) << GEN6_DEPTH_DW1_PITCH__SHIFT;
+
+ if (info->hiz_enable || !info->z_img) {
+ dw1 |= GEN6_DEPTH_DW1_HIZ_ENABLE |
+ GEN6_DEPTH_DW1_SEPARATE_STENCIL;
+ }
+
+ dw2 = 0;
+ dw3 = height << GEN6_DEPTH_DW3_HEIGHT__SHIFT |
+ width << GEN6_DEPTH_DW3_WIDTH__SHIFT |
+ info->level << GEN6_DEPTH_DW3_LOD__SHIFT |
+ GEN6_DEPTH_DW3_MIPLAYOUT_BELOW;
+ dw4 = depth << GEN6_DEPTH_DW4_DEPTH__SHIFT |
+ array_base << GEN6_DEPTH_DW4_MIN_ARRAY_ELEMENT__SHIFT |
+ view_extent << GEN6_DEPTH_DW4_RT_VIEW_EXTENT__SHIFT;
+
+ STATIC_ASSERT(ARRAY_SIZE(zs->depth) >= 5);
+ zs->depth[0] = dw1;
+ zs->depth[1] = dw2;
+ zs->depth[2] = dw3;
+ zs->depth[3] = dw4;
+ zs->depth[4] = 0;
+
+ zs->depth_format = format;
+
+ return true;
+}
+
+static bool
+zs_set_gen7_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs,
+ const struct ilo_dev *dev,
+ const struct ilo_state_zs_info *info)
+{
+ enum gen_surface_type type;
+ enum gen_depth_format format;
+ uint16_t width, height, depth;
+ uint16_t array_base, view_extent;
+ uint32_t dw1, dw2, dw3, dw4, dw6;
+
+ ILO_DEV_ASSERT(dev, 7, 8);
+
+ if (!zs_validate_gen6(dev, info) ||
+ !zs_get_gen6_depth_extent(dev, info, &width, &height) ||
+ !zs_get_gen6_depth_slices(dev, info, &depth, &array_base,
+ &view_extent))
+ return false;
+
+ type = (info->is_cube_map) ? GEN6_SURFTYPE_CUBE :
+ (info->z_img) ? get_gen6_surface_type(dev, info->z_img) :
+ get_gen6_surface_type(dev, info->s_img);
+
+ format = (info->z_img) ? get_gen6_depth_format(dev, info->z_img) :
+ GEN6_ZFORMAT_D32_FLOAT;
+
+ dw1 = type << GEN7_DEPTH_DW1_TYPE__SHIFT |
+ format << GEN7_DEPTH_DW1_FORMAT__SHIFT;
+
+ if (info->z_img) {
+ if (!info->z_readonly)
+ dw1 |= GEN7_DEPTH_DW1_DEPTH_WRITE_ENABLE;
+ if (info->hiz_enable)
+ dw1 |= GEN7_DEPTH_DW1_HIZ_ENABLE;
+
+ dw1 |= (info->z_img->bo_stride - 1) << GEN7_DEPTH_DW1_PITCH__SHIFT;
+ }
+
+ if (info->s_img && !info->s_readonly)
+ dw1 |= GEN7_DEPTH_DW1_STENCIL_WRITE_ENABLE;
+
+ dw2 = 0;
+ dw3 = height << GEN7_DEPTH_DW3_HEIGHT__SHIFT |
+ width << GEN7_DEPTH_DW3_WIDTH__SHIFT |
+ info->level << GEN7_DEPTH_DW3_LOD__SHIFT;
+ dw4 = depth << GEN7_DEPTH_DW4_DEPTH__SHIFT |
+ array_base << GEN7_DEPTH_DW4_MIN_ARRAY_ELEMENT__SHIFT;
+ dw6 = view_extent << GEN7_DEPTH_DW6_RT_VIEW_EXTENT__SHIFT;
+
+ if (ilo_dev_gen(dev) >= ILO_GEN(8) && info->z_img) {
+ assert(info->z_img->walk_layer_height % 4 == 0);
+ /* note that DW is off-by-one for Gen8+ */
+ dw6 |= (info->z_img->walk_layer_height / 4) <<
+ GEN8_DEPTH_DW7_QPITCH__SHIFT;
+ }
+
+ STATIC_ASSERT(ARRAY_SIZE(zs->depth) >= 5);
+ zs->depth[0] = dw1;
+ zs->depth[1] = dw2;
+ zs->depth[2] = dw3;
+ zs->depth[3] = dw4;
+ zs->depth[4] = dw6;
+
+ zs->depth_format = format;
+
+ return true;
+}
+
+static bool
+zs_set_gen6_null_3DSTATE_STENCIL_BUFFER(struct ilo_state_zs *zs,
+ const struct ilo_dev *dev)
+{
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ STATIC_ASSERT(ARRAY_SIZE(zs->stencil) >= 3);
+ zs->stencil[0] = 0;
+ zs->stencil[1] = 0;
+ if (ilo_dev_gen(dev) >= ILO_GEN(8))
+ zs->stencil[2] = 0;
+
+ return true;
+}
+
+static bool
+zs_set_gen6_3DSTATE_STENCIL_BUFFER(struct ilo_state_zs *zs,
+ const struct ilo_dev *dev,
+ const struct ilo_state_zs_info *info)
+{
+ const struct ilo_image *img = info->s_img;
+ uint32_t dw1, dw2;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ assert(img->bo_stride);
+
+ /*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 329:
+ *
+ * "The pitch must be set to 2x the value computed based on width, as
+ * the stencil buffer is stored with two rows interleaved."
+ *
+    * For Gen7+, we still double the stride because we did not double the
+ * slice widths when initializing ilo_image.
+ */
+ dw1 = (img->bo_stride * 2 - 1) << GEN6_STENCIL_DW1_PITCH__SHIFT;
+
+ if (ilo_dev_gen(dev) >= ILO_GEN(7.5))
+ dw1 |= GEN75_STENCIL_DW1_STENCIL_BUFFER_ENABLE;
+
+ dw2 = 0;
+ /* offset to the level as Gen6 does not support mipmapped stencil */
+ if (ilo_dev_gen(dev) == ILO_GEN(6)) {
+ unsigned x, y;
+
+ ilo_image_get_slice_pos(img, info->level, 0, &x, &y);
+ ilo_image_pos_to_mem(img, x, y, &x, &y);
+ dw2 |= ilo_image_mem_to_raw(img, x, y);
+ }
+
+ STATIC_ASSERT(ARRAY_SIZE(zs->stencil) >= 3);
+ zs->stencil[0] = dw1;
+ zs->stencil[1] = dw2;
+
+ if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
+ uint32_t dw4;
+
+ assert(img->walk_layer_height % 4 == 0);
+ dw4 = (img->walk_layer_height / 4) << GEN8_STENCIL_DW4_QPITCH__SHIFT;
+
+ zs->stencil[2] = dw4;
+ }
+
+ return true;
+}
+
+static bool
+zs_set_gen6_null_3DSTATE_HIER_DEPTH_BUFFER(struct ilo_state_zs *zs,
+ const struct ilo_dev *dev)
+{
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ STATIC_ASSERT(ARRAY_SIZE(zs->hiz) >= 3);
+ zs->hiz[0] = 0;
+ zs->hiz[1] = 0;
+ if (ilo_dev_gen(dev) >= ILO_GEN(8))
+ zs->hiz[2] = 0;
+
+ return true;
+}
+
+static bool
+zs_set_gen6_3DSTATE_HIER_DEPTH_BUFFER(struct ilo_state_zs *zs,
+ const struct ilo_dev *dev,
+ const struct ilo_state_zs_info *info)
+{
+ const struct ilo_image *img = info->z_img;
+ uint32_t dw1, dw2;
+
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ assert(img->aux.bo_stride);
+
+ dw1 = (img->aux.bo_stride - 1) << GEN6_HIZ_DW1_PITCH__SHIFT;
+
+ dw2 = 0;
+ /* offset to the level as Gen6 does not support mipmapped HiZ */
+ if (ilo_dev_gen(dev) == ILO_GEN(6))
+ dw2 |= img->aux.walk_lod_offsets[info->level];
+
+ STATIC_ASSERT(ARRAY_SIZE(zs->hiz) >= 3);
+ zs->hiz[0] = dw1;
+ zs->hiz[1] = dw2;
+
+ if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
+ uint32_t dw4;
+
+ assert(img->aux.walk_layer_height % 4 == 0);
+ dw4 = (img->aux.walk_layer_height / 4) << GEN8_HIZ_DW4_QPITCH__SHIFT;
+
+ zs->hiz[2] = dw4;
+ }
+
+ return true;
+}
+
+bool
+ilo_state_zs_init(struct ilo_state_zs *zs, const struct ilo_dev *dev,
+ const struct ilo_state_zs_info *info)
+{
+ bool ret = true;
+
+ assert(ilo_is_zeroed(zs, sizeof(*zs)));
+
+ if (info->z_img || info->s_img) {
+ if (ilo_dev_gen(dev) >= ILO_GEN(7))
+ ret &= zs_set_gen7_3DSTATE_DEPTH_BUFFER(zs, dev, info);
+ else
+ ret &= zs_set_gen6_3DSTATE_DEPTH_BUFFER(zs, dev, info);
+ } else {
+ ret &= zs_set_gen6_null_3DSTATE_DEPTH_BUFFER(zs, dev);
+ }
+
+ if (info->s_img)
+ ret &= zs_set_gen6_3DSTATE_STENCIL_BUFFER(zs, dev, info);
+ else
+ ret &= zs_set_gen6_null_3DSTATE_STENCIL_BUFFER(zs, dev);
+
+ if (info->z_img && info->hiz_enable)
+ ret &= zs_set_gen6_3DSTATE_HIER_DEPTH_BUFFER(zs, dev, info);
+ else
+ ret &= zs_set_gen6_null_3DSTATE_HIER_DEPTH_BUFFER(zs, dev);
+
+ zs->z_readonly = info->z_readonly;
+ zs->s_readonly = info->s_readonly;
+
+ assert(ret);
+
+ return ret;
+}
+
+bool
+ilo_state_zs_init_for_null(struct ilo_state_zs *zs,
+ const struct ilo_dev *dev)
+{
+ struct ilo_state_zs_info info;
+
+ memset(&info, 0, sizeof(info));
+
+ return ilo_state_zs_init(zs, dev, &info);
+}
+
+bool
+ilo_state_zs_disable_hiz(struct ilo_state_zs *zs,
+ const struct ilo_dev *dev)
+{
+ ILO_DEV_ASSERT(dev, 6, 8);
+
+ /*
+ * Separate stencil must be disabled simultaneously on Gen6. We can make
+ * it work when there is no stencil buffer, but it is probably not worth
+ * it.
+ */
+ assert(ilo_dev_gen(dev) >= ILO_GEN(7));
+
+ zs->depth[0] &= ~GEN7_DEPTH_DW1_HIZ_ENABLE;
+ zs_set_gen6_null_3DSTATE_HIER_DEPTH_BUFFER(zs, dev);
+
+ return true;
+}
diff --git a/src/gallium/drivers/ilo/core/ilo_state_zs.h b/src/gallium/drivers/ilo/core/ilo_state_zs.h
new file mode 100644
index 00000000000..98212daf74f
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_zs.h
@@ -0,0 +1,93 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Chia-I Wu <[email protected]>
+ */
+
+#ifndef ILO_STATE_ZS_H
+#define ILO_STATE_ZS_H
+
+#include "genhw/genhw.h"
+#include "intel_winsys.h"
+
+#include "ilo_core.h"
+#include "ilo_dev.h"
+
+struct ilo_image;
+
+struct ilo_state_zs_info {
+ /* both are optional */
+ const struct ilo_image *z_img;
+ const struct ilo_image *s_img;
+
+ /* ignored prior to Gen7 */
+ bool z_readonly;
+ bool s_readonly;
+
+ bool hiz_enable;
+ bool is_cube_map;
+
+ uint8_t level;
+ uint16_t slice_base;
+ uint16_t slice_count;
+};
+
+struct ilo_state_zs {
+ uint32_t depth[5];
+ uint32_t stencil[3];
+ uint32_t hiz[3];
+
+ /* TODO move this to ilo_image */
+ enum gen_depth_format depth_format;
+
+ bool z_readonly;
+ bool s_readonly;
+
+ /* managed by users */
+ struct intel_bo *depth_bo;
+ struct intel_bo *stencil_bo;
+ struct intel_bo *hiz_bo;
+};
+
+bool
+ilo_state_zs_init(struct ilo_state_zs *zs,
+ const struct ilo_dev *dev,
+ const struct ilo_state_zs_info *info);
+
+bool
+ilo_state_zs_init_for_null(struct ilo_state_zs *zs,
+ const struct ilo_dev *dev);
+
+bool
+ilo_state_zs_disable_hiz(struct ilo_state_zs *zs,
+ const struct ilo_dev *dev);
+
+static inline enum gen_depth_format
+ilo_state_zs_get_depth_format(const struct ilo_state_zs *zs,
+ const struct ilo_dev *dev)
+{
+ return zs->depth_format;
+}
+
+#endif /* ILO_STATE_ZS_H */
diff --git a/src/gallium/drivers/ilo/genhw/gen_mi.xml.h b/src/gallium/drivers/ilo/genhw/gen_mi.xml.h
index 24d726adcb3..5a0bb4f8d77 100644
--- a/src/gallium/drivers/ilo/genhw/gen_mi.xml.h
+++ b/src/gallium/drivers/ilo/genhw/gen_mi.xml.h
@@ -97,6 +97,9 @@ enum gen_mi_alu_operand {
#define GEN6_MI_LENGTH__MASK 0x0000003f
#define GEN6_MI_LENGTH__SHIFT 0
#define GEN6_MI_NOOP__SIZE 1
+#define GEN6_MI_NOOP_DW0_WRITE_NOPID (0x1 << 22)
+#define GEN6_MI_NOOP_DW0_VALUE__MASK 0x003fffff
+#define GEN6_MI_NOOP_DW0_VALUE__SHIFT 0
#define GEN75_MI_SET_PREDICATE__SIZE 1
#define GEN75_MI_SET_PREDICATE_DW0_PREDICATE__MASK 0x00000003
diff --git a/src/gallium/drivers/ilo/genhw/gen_regs.xml.h b/src/gallium/drivers/ilo/genhw/gen_regs.xml.h
index 2bdd72b29bc..c51e4f78bc0 100644
--- a/src/gallium/drivers/ilo/genhw/gen_regs.xml.h
+++ b/src/gallium/drivers/ilo/genhw/gen_regs.xml.h
@@ -35,6 +35,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#define GEN6_REG_MASK__MASK 0xffff0000
#define GEN6_REG_MASK__SHIFT 16
#define GEN6_REG__SIZE 0x400000
+#define GEN6_REG_NOPID 0x2094
+
#define GEN7_REG_HS_INVOCATION_COUNT 0x2300
#define GEN7_REG_DS_INVOCATION_COUNT 0x2308
diff --git a/src/gallium/drivers/ilo/genhw/gen_render_3d.xml.h b/src/gallium/drivers/ilo/genhw/gen_render_3d.xml.h
index d25542e8cc2..52173fe5d07 100644
--- a/src/gallium/drivers/ilo/genhw/gen_render_3d.xml.h
+++ b/src/gallium/drivers/ilo/genhw/gen_render_3d.xml.h
@@ -32,7 +32,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
-enum gen_prim_type {
+enum gen_3dprim_type {
GEN6_3DPRIM_POINTLIST = 0x1,
GEN6_3DPRIM_LINELIST = 0x2,
GEN6_3DPRIM_LINESTRIP = 0x3,
@@ -105,6 +105,12 @@ enum gen_state_alignment {
GEN8_ALIGNMENT_SURFACE_STATE = 0x40,
};
+enum gen_index_format {
+ GEN6_INDEX_BYTE = 0x0,
+ GEN6_INDEX_WORD = 0x1,
+ GEN6_INDEX_DWORD = 0x2,
+};
+
enum gen_vf_component {
GEN6_VFCOMP_NOSTORE = 0x0,
GEN6_VFCOMP_STORE_SRC = 0x1,
@@ -123,6 +129,87 @@ enum gen_depth_format {
GEN6_ZFORMAT_D16_UNORM = 0x5,
};
+enum gen_reorder_mode {
+ GEN7_REORDER_LEADING = 0x0,
+ GEN7_REORDER_TRAILING = 0x1,
+};
+
+enum gen_clip_mode {
+ GEN6_CLIPMODE_NORMAL = 0x0,
+ GEN6_CLIPMODE_REJECT_ALL = 0x3,
+ GEN6_CLIPMODE_ACCEPT_ALL = 0x4,
+};
+
+enum gen_front_winding {
+ GEN6_FRONTWINDING_CW = 0x0,
+ GEN6_FRONTWINDING_CCW = 0x1,
+};
+
+enum gen_fill_mode {
+ GEN6_FILLMODE_SOLID = 0x0,
+ GEN6_FILLMODE_WIREFRAME = 0x1,
+ GEN6_FILLMODE_POINT = 0x2,
+};
+
+enum gen_cull_mode {
+ GEN6_CULLMODE_BOTH = 0x0,
+ GEN6_CULLMODE_NONE = 0x1,
+ GEN6_CULLMODE_FRONT = 0x2,
+ GEN6_CULLMODE_BACK = 0x3,
+};
+
+enum gen_pixel_location {
+ GEN6_PIXLOC_CENTER = 0x0,
+ GEN6_PIXLOC_UL_CORNER = 0x1,
+};
+
+enum gen_sample_count {
+ GEN6_NUMSAMPLES_1 = 0x0,
+ GEN8_NUMSAMPLES_2 = 0x1,
+ GEN6_NUMSAMPLES_4 = 0x2,
+ GEN7_NUMSAMPLES_8 = 0x3,
+ GEN8_NUMSAMPLES_16 = 0x4,
+};
+
+enum gen_inputattr_select {
+ GEN6_INPUTATTR_NORMAL = 0x0,
+ GEN6_INPUTATTR_FACING = 0x1,
+ GEN6_INPUTATTR_W = 0x2,
+ GEN6_INPUTATTR_FACING_W = 0x3,
+};
+
+enum gen_zw_interp {
+ GEN6_ZW_INTERP_PIXEL = 0x0,
+ GEN6_ZW_INTERP_CENTROID = 0x2,
+ GEN6_ZW_INTERP_SAMPLE = 0x3,
+};
+
+enum gen_position_offset {
+ GEN6_POSOFFSET_NONE = 0x0,
+ GEN6_POSOFFSET_CENTROID = 0x2,
+ GEN6_POSOFFSET_SAMPLE = 0x3,
+};
+
+enum gen_edsc_mode {
+ GEN7_EDSC_NORMAL = 0x0,
+ GEN7_EDSC_PSEXEC = 0x1,
+ GEN7_EDSC_PREPS = 0x2,
+};
+
+enum gen_pscdepth_mode {
+ GEN7_PSCDEPTH_OFF = 0x0,
+ GEN7_PSCDEPTH_ON = 0x1,
+ GEN7_PSCDEPTH_ON_GE = 0x2,
+ GEN7_PSCDEPTH_ON_LE = 0x3,
+};
+
+enum gen_msrast_mode {
+ GEN6_MSRASTMODE_OFF_PIXEL = 0x0,
+ GEN6_MSRASTMODE_OFF_PATTERN = 0x1,
+ GEN6_MSRASTMODE_ON_PIXEL = 0x2,
+ GEN6_MSRASTMODE_ON_PATTERN = 0x3,
+};
+
#define GEN6_INTERP_NONPERSPECTIVE_SAMPLE (0x1 << 5)
#define GEN6_INTERP_NONPERSPECTIVE_CENTROID (0x1 << 4)
#define GEN6_INTERP_NONPERSPECTIVE_PIXEL (0x1 << 3)
@@ -285,9 +372,6 @@ enum gen_depth_format {
#define GEN6_IB_DW0_CUT_INDEX_ENABLE (0x1 << 10)
#define GEN6_IB_DW0_FORMAT__MASK 0x00000300
#define GEN6_IB_DW0_FORMAT__SHIFT 8
-#define GEN6_IB_DW0_FORMAT_BYTE (0x0 << 8)
-#define GEN6_IB_DW0_FORMAT_WORD (0x1 << 8)
-#define GEN6_IB_DW0_FORMAT_DWORD (0x2 << 8)
@@ -295,9 +379,6 @@ enum gen_depth_format {
#define GEN8_IB_DW1_FORMAT__MASK 0x00000300
#define GEN8_IB_DW1_FORMAT__SHIFT 8
-#define GEN8_IB_DW1_FORMAT_BYTE (0x0 << 8)
-#define GEN8_IB_DW1_FORMAT_WORD (0x1 << 8)
-#define GEN8_IB_DW1_FORMAT_DWORD (0x2 << 8)
#define GEN8_IB_DW1_MOCS__MASK 0x0000007f
#define GEN8_IB_DW1_MOCS__SHIFT 0
@@ -313,8 +394,8 @@ enum gen_depth_format {
#define GEN8_INSTANCING_DW1_ENABLE (0x1 << 8)
-#define GEN8_INSTANCING_DW1_VB_INDEX__MASK 0x0000003f
-#define GEN8_INSTANCING_DW1_VB_INDEX__SHIFT 0
+#define GEN8_INSTANCING_DW1_VE_INDEX__MASK 0x0000003f
+#define GEN8_INSTANCING_DW1_VE_INDEX__SHIFT 0
#define GEN8_3DSTATE_VF_SGVS__SIZE 2
@@ -614,7 +695,7 @@ enum gen_depth_format {
#define GEN6_GS_DW5_SO_STATISTICS (0x1 << 9)
#define GEN6_GS_DW5_RENDER_ENABLE (0x1 << 8)
-#define GEN6_GS_DW6_REORDER_ENABLE (0x1 << 30)
+#define GEN6_GS_DW6_REORDER_LEADING_ENABLE (0x1 << 30)
#define GEN6_GS_DW6_DISCARD_ADJACENCY (0x1 << 29)
#define GEN6_GS_DW6_SVBI_PAYLOAD_ENABLE (0x1 << 28)
#define GEN6_GS_DW6_SVBI_POST_INC_ENABLE (0x1 << 27)
@@ -666,11 +747,9 @@ enum gen_depth_format {
#define GEN7_GS_DW5_INVOCATION_INCR__SHIFT 5
#define GEN7_GS_DW5_INCLUDE_PRIMITIVE_ID (0x1 << 4)
#define GEN7_GS_DW5_HINT (0x1 << 3)
-#define GEN7_GS_DW5_REORDER_ENABLE (0x1 << 2)
-#define GEN75_GS_DW5_REORDER__MASK 0x00000004
-#define GEN75_GS_DW5_REORDER__SHIFT 2
-#define GEN75_GS_DW5_REORDER_LEADING (0x0 << 2)
-#define GEN75_GS_DW5_REORDER_TRAILING (0x1 << 2)
+#define GEN7_GS_DW5_REORDER_LEADING_ENABLE (0x1 << 2)
+#define GEN75_GS_DW5_REORDER_MODE__MASK 0x00000004
+#define GEN75_GS_DW5_REORDER_MODE__SHIFT 2
#define GEN7_GS_DW5_DISCARD_ADJACENCY (0x1 << 1)
#define GEN7_GS_DW5_GS_ENABLE (0x1 << 0)
@@ -727,10 +806,8 @@ enum gen_depth_format {
#define GEN8_GS_DW7_INVOCATION_INCR__SHIFT 5
#define GEN8_GS_DW7_INCLUDE_PRIMITIVE_ID (0x1 << 4)
#define GEN8_GS_DW7_HINT (0x1 << 3)
-#define GEN8_GS_DW7_REORDER__MASK 0x00000004
-#define GEN8_GS_DW7_REORDER__SHIFT 2
-#define GEN8_GS_DW7_REORDER_LEADING (0x0 << 2)
-#define GEN8_GS_DW7_REORDER_TRAILING (0x1 << 2)
+#define GEN8_GS_DW7_REORDER_MODE__MASK 0x00000004
+#define GEN8_GS_DW7_REORDER_MODE__SHIFT 2
#define GEN8_GS_DW7_DISCARD_ADJACENCY (0x1 << 1)
#define GEN8_GS_DW7_GS_ENABLE (0x1 << 0)
@@ -758,10 +835,8 @@ enum gen_depth_format {
#define GEN7_SO_DW1_RENDER_DISABLE (0x1 << 30)
#define GEN7_SO_DW1_RENDER_STREAM_SELECT__MASK 0x18000000
#define GEN7_SO_DW1_RENDER_STREAM_SELECT__SHIFT 27
-#define GEN7_SO_DW1_REORDER__MASK 0x04000000
-#define GEN7_SO_DW1_REORDER__SHIFT 26
-#define GEN7_SO_DW1_REORDER_LEADING (0x0 << 26)
-#define GEN7_SO_DW1_REORDER_TRAILING (0x1 << 26)
+#define GEN7_SO_DW1_REORDER_MODE__MASK 0x04000000
+#define GEN7_SO_DW1_REORDER_MODE__SHIFT 26
#define GEN7_SO_DW1_STATISTICS (0x1 << 25)
#define GEN7_SO_DW1_BUFFER_ENABLES__MASK 0x00000f00
#define GEN7_SO_DW1_BUFFER_ENABLES__SHIFT 8
@@ -862,21 +937,15 @@ enum gen_depth_format {
#define GEN6_3DSTATE_CLIP__SIZE 4
-#define GEN7_CLIP_DW1_FRONTWINDING__MASK 0x00100000
-#define GEN7_CLIP_DW1_FRONTWINDING__SHIFT 20
-#define GEN7_CLIP_DW1_FRONTWINDING_CW (0x0 << 20)
-#define GEN7_CLIP_DW1_FRONTWINDING_CCW (0x1 << 20)
+#define GEN7_CLIP_DW1_FRONT_WINDING__MASK 0x00100000
+#define GEN7_CLIP_DW1_FRONT_WINDING__SHIFT 20
#define GEN7_CLIP_DW1_SUBPIXEL__MASK 0x00080000
#define GEN7_CLIP_DW1_SUBPIXEL__SHIFT 19
#define GEN7_CLIP_DW1_SUBPIXEL_8BITS (0x0 << 19)
#define GEN7_CLIP_DW1_SUBPIXEL_4BITS (0x1 << 19)
#define GEN7_CLIP_DW1_EARLY_CULL_ENABLE (0x1 << 18)
-#define GEN7_CLIP_DW1_CULLMODE__MASK 0x00030000
-#define GEN7_CLIP_DW1_CULLMODE__SHIFT 16
-#define GEN7_CLIP_DW1_CULLMODE_BOTH (0x0 << 16)
-#define GEN7_CLIP_DW1_CULLMODE_NONE (0x1 << 16)
-#define GEN7_CLIP_DW1_CULLMODE_FRONT (0x2 << 16)
-#define GEN7_CLIP_DW1_CULLMODE_BACK (0x3 << 16)
+#define GEN7_CLIP_DW1_CULL_MODE__MASK 0x00030000
+#define GEN7_CLIP_DW1_CULL_MODE__SHIFT 16
#define GEN6_CLIP_DW1_STATISTICS (0x1 << 10)
#define GEN6_CLIP_DW1_UCP_CULL_ENABLES__MASK 0x000000ff
#define GEN6_CLIP_DW1_UCP_CULL_ENABLES__SHIFT 0
@@ -891,11 +960,8 @@ enum gen_depth_format {
#define GEN6_CLIP_DW2_GB_TEST_ENABLE (0x1 << 26)
#define GEN6_CLIP_DW2_UCP_CLIP_ENABLES__MASK 0x00ff0000
#define GEN6_CLIP_DW2_UCP_CLIP_ENABLES__SHIFT 16
-#define GEN6_CLIP_DW2_CLIPMODE__MASK 0x0000e000
-#define GEN6_CLIP_DW2_CLIPMODE__SHIFT 13
-#define GEN6_CLIP_DW2_CLIPMODE_NORMAL (0x0 << 13)
-#define GEN6_CLIP_DW2_CLIPMODE_REJECT_ALL (0x3 << 13)
-#define GEN6_CLIP_DW2_CLIPMODE_ACCEPT_ALL (0x4 << 13)
+#define GEN6_CLIP_DW2_CLIP_MODE__MASK 0x0000e000
+#define GEN6_CLIP_DW2_CLIP_MODE__SHIFT 13
#define GEN6_CLIP_DW2_PERSPECTIVE_DIVIDE_DISABLE (0x1 << 9)
#define GEN6_CLIP_DW2_NONPERSPECTIVE_BARYCENTRIC_ENABLE (0x1 << 8)
#define GEN6_CLIP_DW2_TRI_PROVOKE__MASK 0x00000030
@@ -911,7 +977,7 @@ enum gen_depth_format {
#define GEN6_CLIP_DW3_MAX_POINT_WIDTH__MASK 0x0001ffc0
#define GEN6_CLIP_DW3_MAX_POINT_WIDTH__SHIFT 6
#define GEN6_CLIP_DW3_MAX_POINT_WIDTH__RADIX 3
-#define GEN6_CLIP_DW3_RTAINDEX_FORCED_ZERO (0x1 << 5)
+#define GEN6_CLIP_DW3_FORCE_RTAINDEX_ZERO (0x1 << 5)
#define GEN6_CLIP_DW3_MAX_VPINDEX__MASK 0x0000000f
#define GEN6_CLIP_DW3_MAX_VPINDEX__SHIFT 0
@@ -927,29 +993,17 @@ enum gen_depth_format {
#define GEN7_SF_DW1_DEPTH_OFFSET_SOLID (0x1 << 9)
#define GEN7_SF_DW1_DEPTH_OFFSET_WIREFRAME (0x1 << 8)
#define GEN7_SF_DW1_DEPTH_OFFSET_POINT (0x1 << 7)
-#define GEN7_SF_DW1_FRONTFACE__MASK 0x00000060
-#define GEN7_SF_DW1_FRONTFACE__SHIFT 5
-#define GEN7_SF_DW1_FRONTFACE_SOLID (0x0 << 5)
-#define GEN7_SF_DW1_FRONTFACE_WIREFRAME (0x1 << 5)
-#define GEN7_SF_DW1_FRONTFACE_POINT (0x2 << 5)
-#define GEN7_SF_DW1_BACKFACE__MASK 0x00000018
-#define GEN7_SF_DW1_BACKFACE__SHIFT 3
-#define GEN7_SF_DW1_BACKFACE_SOLID (0x0 << 3)
-#define GEN7_SF_DW1_BACKFACE_WIREFRAME (0x1 << 3)
-#define GEN7_SF_DW1_BACKFACE_POINT (0x2 << 3)
-#define GEN7_SF_DW1_VIEWPORT_ENABLE (0x1 << 1)
-#define GEN7_SF_DW1_FRONTWINDING__MASK 0x00000001
-#define GEN7_SF_DW1_FRONTWINDING__SHIFT 0
-#define GEN7_SF_DW1_FRONTWINDING_CW 0x0
-#define GEN7_SF_DW1_FRONTWINDING_CCW 0x1
+#define GEN7_SF_DW1_FILL_MODE_FRONT__MASK 0x00000060
+#define GEN7_SF_DW1_FILL_MODE_FRONT__SHIFT 5
+#define GEN7_SF_DW1_FILL_MODE_BACK__MASK 0x00000018
+#define GEN7_SF_DW1_FILL_MODE_BACK__SHIFT 3
+#define GEN7_SF_DW1_VIEWPORT_TRANSFORM (0x1 << 1)
+#define GEN7_SF_DW1_FRONT_WINDING__MASK 0x00000001
+#define GEN7_SF_DW1_FRONT_WINDING__SHIFT 0
#define GEN7_SF_DW2_AA_LINE_ENABLE (0x1 << 31)
-#define GEN7_SF_DW2_CULLMODE__MASK 0x60000000
-#define GEN7_SF_DW2_CULLMODE__SHIFT 29
-#define GEN7_SF_DW2_CULLMODE_BOTH (0x0 << 29)
-#define GEN7_SF_DW2_CULLMODE_NONE (0x1 << 29)
-#define GEN7_SF_DW2_CULLMODE_FRONT (0x2 << 29)
-#define GEN7_SF_DW2_CULLMODE_BACK (0x3 << 29)
+#define GEN7_SF_DW2_CULL_MODE__MASK 0x60000000
+#define GEN7_SF_DW2_CULL_MODE__SHIFT 29
#define GEN7_SF_DW2_LINE_WIDTH__MASK 0x0ffc0000
#define GEN7_SF_DW2_LINE_WIDTH__SHIFT 18
#define GEN7_SF_DW2_LINE_WIDTH__RADIX 7
@@ -963,10 +1017,6 @@ enum gen_depth_format {
#define GEN7_SF_DW2_SCISSOR_ENABLE (0x1 << 11)
#define GEN7_SF_DW2_MSRASTMODE__MASK 0x00000300
#define GEN7_SF_DW2_MSRASTMODE__SHIFT 8
-#define GEN7_SF_DW2_MSRASTMODE_OFF_PIXEL (0x0 << 8)
-#define GEN7_SF_DW2_MSRASTMODE_OFF_PATTERN (0x1 << 8)
-#define GEN7_SF_DW2_MSRASTMODE_ON_PIXEL (0x2 << 8)
-#define GEN7_SF_DW2_MSRASTMODE_ON_PATTERN (0x3 << 8)
#define GEN7_SF_DW3_LINE_LAST_PIXEL_ENABLE (0x1 << 31)
#define GEN7_SF_DW3_TRI_PROVOKE__MASK 0x60000000
@@ -1021,14 +1071,10 @@ enum gen_depth_format {
#define GEN8_SBE_SWIZ_CONST_0001_FLOAT (0x1 << 9)
#define GEN8_SBE_SWIZ_CONST_1111_FLOAT (0x2 << 9)
#define GEN8_SBE_SWIZ_CONST_PRIM_ID (0x3 << 9)
-#define GEN8_SBE_SWIZ_INPUTATTR__MASK 0x000000c0
-#define GEN8_SBE_SWIZ_INPUTATTR__SHIFT 6
-#define GEN8_SBE_SWIZ_INPUTATTR_NORMAL (0x0 << 6)
-#define GEN8_SBE_SWIZ_INPUTATTR_FACING (0x1 << 6)
-#define GEN8_SBE_SWIZ_INPUTATTR_W (0x2 << 6)
-#define GEN8_SBE_SWIZ_INPUTATTR_FACING_W (0x3 << 6)
-#define GEN8_SBE_SWIZ_URB_ENTRY_OFFSET__MASK 0x0000001f
-#define GEN8_SBE_SWIZ_URB_ENTRY_OFFSET__SHIFT 0
+#define GEN8_SBE_SWIZ_SWIZZLE_SELECT__MASK 0x000000c0
+#define GEN8_SBE_SWIZ_SWIZZLE_SELECT__SHIFT 6
+#define GEN8_SBE_SWIZ_SRC_ATTR__MASK 0x0000001f
+#define GEN8_SBE_SWIZ_SRC_ATTR__SHIFT 0
#define GEN6_3DSTATE_SF__SIZE 20
@@ -1080,31 +1126,19 @@ enum gen_depth_format {
#define GEN9_RASTER_DW1_Z_TEST_FAR_ENABLE (0x1 << 26)
-#define GEN8_RASTER_DW1_FRONTWINDING__MASK 0x00200000
-#define GEN8_RASTER_DW1_FRONTWINDING__SHIFT 21
-#define GEN8_RASTER_DW1_FRONTWINDING_CW (0x0 << 21)
-#define GEN8_RASTER_DW1_FRONTWINDING_CCW (0x1 << 21)
-#define GEN8_RASTER_DW1_CULLMODE__MASK 0x00030000
-#define GEN8_RASTER_DW1_CULLMODE__SHIFT 16
-#define GEN8_RASTER_DW1_CULLMODE_BOTH (0x0 << 16)
-#define GEN8_RASTER_DW1_CULLMODE_NONE (0x1 << 16)
-#define GEN8_RASTER_DW1_CULLMODE_FRONT (0x2 << 16)
-#define GEN8_RASTER_DW1_CULLMODE_BACK (0x3 << 16)
+#define GEN8_RASTER_DW1_FRONT_WINDING__MASK 0x00200000
+#define GEN8_RASTER_DW1_FRONT_WINDING__SHIFT 21
+#define GEN8_RASTER_DW1_CULL_MODE__MASK 0x00030000
+#define GEN8_RASTER_DW1_CULL_MODE__SHIFT 16
#define GEN8_RASTER_DW1_SMOOTH_POINT_ENABLE (0x1 << 13)
#define GEN8_RASTER_DW1_API_MULTISAMPLE_ENABLE (0x1 << 12)
#define GEN8_RASTER_DW1_DEPTH_OFFSET_SOLID (0x1 << 9)
#define GEN8_RASTER_DW1_DEPTH_OFFSET_WIREFRAME (0x1 << 8)
#define GEN8_RASTER_DW1_DEPTH_OFFSET_POINT (0x1 << 7)
-#define GEN8_RASTER_DW1_FRONTFACE__MASK 0x00000060
-#define GEN8_RASTER_DW1_FRONTFACE__SHIFT 5
-#define GEN8_RASTER_DW1_FRONTFACE_SOLID (0x0 << 5)
-#define GEN8_RASTER_DW1_FRONTFACE_WIREFRAME (0x1 << 5)
-#define GEN8_RASTER_DW1_FRONTFACE_POINT (0x2 << 5)
-#define GEN8_RASTER_DW1_BACKFACE__MASK 0x00000018
-#define GEN8_RASTER_DW1_BACKFACE__SHIFT 3
-#define GEN8_RASTER_DW1_BACKFACE_SOLID (0x0 << 3)
-#define GEN8_RASTER_DW1_BACKFACE_WIREFRAME (0x1 << 3)
-#define GEN8_RASTER_DW1_BACKFACE_POINT (0x2 << 3)
+#define GEN8_RASTER_DW1_FILL_MODE_FRONT__MASK 0x00000060
+#define GEN8_RASTER_DW1_FILL_MODE_FRONT__SHIFT 5
+#define GEN8_RASTER_DW1_FILL_MODE_BACK__MASK 0x00000018
+#define GEN8_RASTER_DW1_FILL_MODE_BACK__SHIFT 3
#define GEN8_RASTER_DW1_AA_LINE_ENABLE (0x1 << 2)
#define GEN8_RASTER_DW1_SCISSOR_ENABLE (0x1 << 1)
#define GEN8_RASTER_DW1_Z_TEST_ENABLE (0x1 << 0)
@@ -1164,14 +1198,8 @@ enum gen_depth_format {
#define GEN6_WM_DW6_SF_ATTR_COUNT__SHIFT 20
#define GEN6_WM_DW6_PS_POSOFFSET__MASK 0x000c0000
#define GEN6_WM_DW6_PS_POSOFFSET__SHIFT 18
-#define GEN6_WM_DW6_PS_POSOFFSET_NONE (0x0 << 18)
-#define GEN6_WM_DW6_PS_POSOFFSET_CENTROID (0x2 << 18)
-#define GEN6_WM_DW6_PS_POSOFFSET_SAMPLE (0x3 << 18)
#define GEN6_WM_DW6_ZW_INTERP__MASK 0x00030000
#define GEN6_WM_DW6_ZW_INTERP__SHIFT 16
-#define GEN6_WM_DW6_ZW_INTERP_PIXEL (0x0 << 16)
-#define GEN6_WM_DW6_ZW_INTERP_CENTROID (0x2 << 16)
-#define GEN6_WM_DW6_ZW_INTERP_SAMPLE (0x3 << 16)
#define GEN6_WM_DW6_BARYCENTRIC_INTERP__MASK 0x0000fc00
#define GEN6_WM_DW6_BARYCENTRIC_INTERP__SHIFT 10
#define GEN6_WM_DW6_POINT_RASTRULE__MASK 0x00000200
@@ -1180,10 +1208,6 @@ enum gen_depth_format {
#define GEN6_WM_DW6_POINT_RASTRULE_UPPER_RIGHT (0x1 << 9)
#define GEN6_WM_DW6_MSRASTMODE__MASK 0x00000006
#define GEN6_WM_DW6_MSRASTMODE__SHIFT 1
-#define GEN6_WM_DW6_MSRASTMODE_OFF_PIXEL (0x0 << 1)
-#define GEN6_WM_DW6_MSRASTMODE_OFF_PATTERN (0x1 << 1)
-#define GEN6_WM_DW6_MSRASTMODE_ON_PIXEL (0x2 << 1)
-#define GEN6_WM_DW6_MSRASTMODE_ON_PATTERN (0x3 << 1)
#define GEN6_WM_DW6_MSDISPMODE__MASK 0x00000001
#define GEN6_WM_DW6_MSDISPMODE__SHIFT 0
#define GEN6_WM_DW6_MSDISPMODE_PERSAMPLE 0x0
@@ -1207,22 +1231,12 @@ enum gen_depth_format {
#define GEN7_WM_DW1_PS_KILL_PIXEL (0x1 << 25)
#define GEN7_WM_DW1_PSCDEPTH__MASK 0x01800000
#define GEN7_WM_DW1_PSCDEPTH__SHIFT 23
-#define GEN7_WM_DW1_PSCDEPTH_OFF (0x0 << 23)
-#define GEN7_WM_DW1_PSCDEPTH_ON (0x1 << 23)
-#define GEN7_WM_DW1_PSCDEPTH_ON_GE (0x2 << 23)
-#define GEN7_WM_DW1_PSCDEPTH_ON_LE (0x3 << 23)
#define GEN7_WM_DW1_EDSC__MASK 0x00600000
#define GEN7_WM_DW1_EDSC__SHIFT 21
-#define GEN7_WM_DW1_EDSC_NORMAL (0x0 << 21)
-#define GEN7_WM_DW1_EDSC_PSEXEC (0x1 << 21)
-#define GEN7_WM_DW1_EDSC_PREPS (0x2 << 21)
#define GEN7_WM_DW1_PS_USE_DEPTH (0x1 << 20)
#define GEN7_WM_DW1_PS_USE_W (0x1 << 19)
#define GEN7_WM_DW1_ZW_INTERP__MASK 0x00060000
#define GEN7_WM_DW1_ZW_INTERP__SHIFT 17
-#define GEN7_WM_DW1_ZW_INTERP_PIXEL (0x0 << 17)
-#define GEN7_WM_DW1_ZW_INTERP_CENTROID (0x2 << 17)
-#define GEN7_WM_DW1_ZW_INTERP_SAMPLE (0x3 << 17)
#define GEN7_WM_DW1_BARYCENTRIC_INTERP__MASK 0x0001f800
#define GEN7_WM_DW1_BARYCENTRIC_INTERP__SHIFT 11
#define GEN7_WM_DW1_PS_USE_COVERAGE_MASK (0x1 << 10)
@@ -1247,10 +1261,6 @@ enum gen_depth_format {
#define GEN7_WM_DW1_POINT_RASTRULE_UPPER_RIGHT (0x1 << 2)
#define GEN7_WM_DW1_MSRASTMODE__MASK 0x00000003
#define GEN7_WM_DW1_MSRASTMODE__SHIFT 0
-#define GEN7_WM_DW1_MSRASTMODE_OFF_PIXEL 0x0
-#define GEN7_WM_DW1_MSRASTMODE_OFF_PATTERN 0x1
-#define GEN7_WM_DW1_MSRASTMODE_ON_PIXEL 0x2
-#define GEN7_WM_DW1_MSRASTMODE_ON_PATTERN 0x3
#define GEN7_WM_DW2_MSDISPMODE__MASK 0x80000000
#define GEN7_WM_DW2_MSDISPMODE__SHIFT 31
@@ -1265,12 +1275,12 @@ enum gen_depth_format {
#define GEN8_3DSTATE_WM_DEPTH_STENCIL__SIZE 4
-#define GEN8_ZS_DW1_STENCIL0_FAIL_OP__MASK 0xe0000000
-#define GEN8_ZS_DW1_STENCIL0_FAIL_OP__SHIFT 29
-#define GEN8_ZS_DW1_STENCIL0_ZFAIL_OP__MASK 0x1c000000
-#define GEN8_ZS_DW1_STENCIL0_ZFAIL_OP__SHIFT 26
-#define GEN8_ZS_DW1_STENCIL0_ZPASS_OP__MASK 0x03800000
-#define GEN8_ZS_DW1_STENCIL0_ZPASS_OP__SHIFT 23
+#define GEN8_ZS_DW1_STENCIL_FAIL_OP__MASK 0xe0000000
+#define GEN8_ZS_DW1_STENCIL_FAIL_OP__SHIFT 29
+#define GEN8_ZS_DW1_STENCIL_ZFAIL_OP__MASK 0x1c000000
+#define GEN8_ZS_DW1_STENCIL_ZFAIL_OP__SHIFT 26
+#define GEN8_ZS_DW1_STENCIL_ZPASS_OP__MASK 0x03800000
+#define GEN8_ZS_DW1_STENCIL_ZPASS_OP__SHIFT 23
#define GEN8_ZS_DW1_STENCIL1_FUNC__MASK 0x00700000
#define GEN8_ZS_DW1_STENCIL1_FUNC__SHIFT 20
#define GEN8_ZS_DW1_STENCIL1_FAIL_OP__MASK 0x000e0000
@@ -1279,8 +1289,8 @@ enum gen_depth_format {
#define GEN8_ZS_DW1_STENCIL1_ZFAIL_OP__SHIFT 14
#define GEN8_ZS_DW1_STENCIL1_ZPASS_OP__MASK 0x00003800
#define GEN8_ZS_DW1_STENCIL1_ZPASS_OP__SHIFT 11
-#define GEN8_ZS_DW1_STENCIL0_FUNC__MASK 0x00000700
-#define GEN8_ZS_DW1_STENCIL0_FUNC__SHIFT 8
+#define GEN8_ZS_DW1_STENCIL_FUNC__MASK 0x00000700
+#define GEN8_ZS_DW1_STENCIL_FUNC__SHIFT 8
#define GEN8_ZS_DW1_DEPTH_FUNC__MASK 0x000000e0
#define GEN8_ZS_DW1_DEPTH_FUNC__SHIFT 5
#define GEN8_ZS_DW1_STENCIL1_ENABLE (0x1 << 4)
@@ -1289,17 +1299,17 @@ enum gen_depth_format {
#define GEN8_ZS_DW1_DEPTH_TEST_ENABLE (0x1 << 1)
#define GEN8_ZS_DW1_DEPTH_WRITE_ENABLE (0x1 << 0)
-#define GEN8_ZS_DW2_STENCIL0_VALUEMASK__MASK 0xff000000
-#define GEN8_ZS_DW2_STENCIL0_VALUEMASK__SHIFT 24
-#define GEN8_ZS_DW2_STENCIL0_WRITEMASK__MASK 0x00ff0000
-#define GEN8_ZS_DW2_STENCIL0_WRITEMASK__SHIFT 16
-#define GEN8_ZS_DW2_STENCIL1_VALUEMASK__MASK 0x0000ff00
-#define GEN8_ZS_DW2_STENCIL1_VALUEMASK__SHIFT 8
-#define GEN8_ZS_DW2_STENCIL1_WRITEMASK__MASK 0x000000ff
-#define GEN8_ZS_DW2_STENCIL1_WRITEMASK__SHIFT 0
-
-#define GEN9_ZS_DW3_STENCIL0_REF__MASK 0x0000ff00
-#define GEN9_ZS_DW3_STENCIL0_REF__SHIFT 8
+#define GEN8_ZS_DW2_STENCIL_TEST_MASK__MASK 0xff000000
+#define GEN8_ZS_DW2_STENCIL_TEST_MASK__SHIFT 24
+#define GEN8_ZS_DW2_STENCIL_WRITE_MASK__MASK 0x00ff0000
+#define GEN8_ZS_DW2_STENCIL_WRITE_MASK__SHIFT 16
+#define GEN8_ZS_DW2_STENCIL1_TEST_MASK__MASK 0x0000ff00
+#define GEN8_ZS_DW2_STENCIL1_TEST_MASK__SHIFT 8
+#define GEN8_ZS_DW2_STENCIL1_WRITE_MASK__MASK 0x000000ff
+#define GEN8_ZS_DW2_STENCIL1_WRITE_MASK__SHIFT 0
+
+#define GEN9_ZS_DW3_STENCIL_REF__MASK 0x0000ff00
+#define GEN9_ZS_DW3_STENCIL_REF__SHIFT 8
#define GEN9_ZS_DW3_STENCIL1_REF__MASK 0x000000ff
#define GEN9_ZS_DW3_STENCIL1_REF__SHIFT 0
@@ -1314,13 +1324,8 @@ enum gen_depth_format {
#define GEN8_WM_HZ_DW1_FULL_SURFACE_DEPTH_CLEAR (0x1 << 25)
#define GEN8_WM_HZ_DW1_STENCIL_CLEAR_VALUE__MASK 0x00ff0000
#define GEN8_WM_HZ_DW1_STENCIL_CLEAR_VALUE__SHIFT 16
-#define GEN8_WM_HZ_DW1_NUMSAMPLES__MASK 0x0000e000
-#define GEN8_WM_HZ_DW1_NUMSAMPLES__SHIFT 13
-#define GEN8_WM_HZ_DW1_NUMSAMPLES_1 (0x0 << 13)
-#define GEN8_WM_HZ_DW1_NUMSAMPLES_2 (0x1 << 13)
-#define GEN8_WM_HZ_DW1_NUMSAMPLES_4 (0x2 << 13)
-#define GEN8_WM_HZ_DW1_NUMSAMPLES_8 (0x3 << 13)
-#define GEN8_WM_HZ_DW1_NUMSAMPLES_16 (0x4 << 13)
+#define GEN8_WM_HZ_DW1_NUM_SAMPLES__MASK 0x0000e000
+#define GEN8_WM_HZ_DW1_NUM_SAMPLES__SHIFT 13
#define GEN8_WM_HZ_DW2_RECT_MIN_Y__MASK 0xffff0000
#define GEN8_WM_HZ_DW2_RECT_MIN_Y__SHIFT 16
@@ -1359,9 +1364,6 @@ enum gen_depth_format {
#define GEN75_PS_DW4_ACCESS_UAV (0x1 << 5)
#define GEN7_PS_DW4_POSOFFSET__MASK 0x00000018
#define GEN7_PS_DW4_POSOFFSET__SHIFT 3
-#define GEN7_PS_DW4_POSOFFSET_NONE (0x0 << 3)
-#define GEN7_PS_DW4_POSOFFSET_CENTROID (0x2 << 3)
-#define GEN7_PS_DW4_POSOFFSET_SAMPLE (0x3 << 3)
#define GEN7_PS_DW4_DISPATCH_MODE__MASK 0x00000007
#define GEN7_PS_DW4_DISPATCH_MODE__SHIFT 0
@@ -1397,9 +1399,6 @@ enum gen_depth_format {
#define GEN8_PS_DW6_RT_RESOLVE (0x1 << 6)
#define GEN8_PS_DW6_POSOFFSET__MASK 0x00000018
#define GEN8_PS_DW6_POSOFFSET__SHIFT 3
-#define GEN8_PS_DW6_POSOFFSET_NONE (0x0 << 3)
-#define GEN8_PS_DW6_POSOFFSET_CENTROID (0x2 << 3)
-#define GEN8_PS_DW6_POSOFFSET_SAMPLE (0x3 << 3)
#define GEN8_PS_DW6_DISPATCH_MODE__MASK 0x00000007
#define GEN8_PS_DW6_DISPATCH_MODE__SHIFT 0
@@ -1423,16 +1422,12 @@ enum gen_depth_format {
#define GEN8_3DSTATE_PS_EXTRA__SIZE 2
-#define GEN8_PSX_DW1_DISPATCH_ENABLE (0x1 << 31)
+#define GEN8_PSX_DW1_VALID (0x1 << 31)
#define GEN8_PSX_DW1_UAV_ONLY (0x1 << 30)
#define GEN8_PSX_DW1_COMPUTE_OMASK (0x1 << 29)
#define GEN8_PSX_DW1_KILL_PIXEL (0x1 << 28)
#define GEN8_PSX_DW1_PSCDEPTH__MASK 0x0c000000
#define GEN8_PSX_DW1_PSCDEPTH__SHIFT 26
-#define GEN8_PSX_DW1_PSCDEPTH_OFF (0x0 << 26)
-#define GEN8_PSX_DW1_PSCDEPTH_ON (0x1 << 26)
-#define GEN8_PSX_DW1_PSCDEPTH_ON_GE (0x2 << 26)
-#define GEN8_PSX_DW1_PSCDEPTH_ON_LE (0x3 << 26)
#define GEN8_PSX_DW1_FORCE_COMPUTE_DEPTH (0x1 << 25)
#define GEN8_PSX_DW1_USE_DEPTH (0x1 << 24)
#define GEN8_PSX_DW1_USE_W (0x1 << 23)
@@ -1696,17 +1691,10 @@ enum gen_depth_format {
#define GEN75_MULTISAMPLE_DW1_DX9_MULTISAMPLE_ENABLE (0x1 << 5)
-#define GEN6_MULTISAMPLE_DW1_PIXLOC__MASK 0x00000010
-#define GEN6_MULTISAMPLE_DW1_PIXLOC__SHIFT 4
-#define GEN6_MULTISAMPLE_DW1_PIXLOC_CENTER (0x0 << 4)
-#define GEN6_MULTISAMPLE_DW1_PIXLOC_UL_CORNER (0x1 << 4)
-#define GEN6_MULTISAMPLE_DW1_NUMSAMPLES__MASK 0x0000000e
-#define GEN6_MULTISAMPLE_DW1_NUMSAMPLES__SHIFT 1
-#define GEN6_MULTISAMPLE_DW1_NUMSAMPLES_1 (0x0 << 1)
-#define GEN8_MULTISAMPLE_DW1_NUMSAMPLES_2 (0x1 << 1)
-#define GEN6_MULTISAMPLE_DW1_NUMSAMPLES_4 (0x2 << 1)
-#define GEN7_MULTISAMPLE_DW1_NUMSAMPLES_8 (0x3 << 1)
-#define GEN8_MULTISAMPLE_DW1_NUMSAMPLES_16 (0x4 << 1)
+#define GEN6_MULTISAMPLE_DW1_PIXEL_LOCATION__MASK 0x00000010
+#define GEN6_MULTISAMPLE_DW1_PIXEL_LOCATION__SHIFT 4
+#define GEN6_MULTISAMPLE_DW1_NUM_SAMPLES__MASK 0x0000000e
+#define GEN6_MULTISAMPLE_DW1_NUM_SAMPLES__SHIFT 1
diff --git a/src/gallium/drivers/ilo/genhw/gen_render_dynamic.xml.h b/src/gallium/drivers/ilo/genhw/gen_render_dynamic.xml.h
index 6d815beecb3..b65b704adc6 100644
--- a/src/gallium/drivers/ilo/genhw/gen_render_dynamic.xml.h
+++ b/src/gallium/drivers/ilo/genhw/gen_render_dynamic.xml.h
@@ -84,7 +84,7 @@ enum gen_blend_function {
GEN6_BLENDFUNCTION_MAX = 0x4,
};
-enum gen_logicop_function {
+enum gen_logic_op {
GEN6_LOGICOP_CLEAR = 0x0,
GEN6_LOGICOP_NOR = 0x1,
GEN6_LOGICOP_AND_INVERTED = 0x2,
@@ -103,20 +103,31 @@ enum gen_logicop_function {
GEN6_LOGICOP_SET = 0xf,
};
-enum gen_sampler_mip_filter {
+enum gen_mip_filter {
GEN6_MIPFILTER_NONE = 0x0,
GEN6_MIPFILTER_NEAREST = 0x1,
GEN6_MIPFILTER_LINEAR = 0x3,
};
-enum gen_sampler_map_filter {
+enum gen_map_filter {
GEN6_MAPFILTER_NEAREST = 0x0,
GEN6_MAPFILTER_LINEAR = 0x1,
GEN6_MAPFILTER_ANISOTROPIC = 0x2,
GEN6_MAPFILTER_MONO = 0x6,
};
-enum gen_sampler_aniso_ratio {
+enum gen_prefilter_op {
+ GEN6_PREFILTEROP_ALWAYS = 0x0,
+ GEN6_PREFILTEROP_NEVER = 0x1,
+ GEN6_PREFILTEROP_LESS = 0x2,
+ GEN6_PREFILTEROP_EQUAL = 0x3,
+ GEN6_PREFILTEROP_LEQUAL = 0x4,
+ GEN6_PREFILTEROP_GREATER = 0x5,
+ GEN6_PREFILTEROP_NOTEQUAL = 0x6,
+ GEN6_PREFILTEROP_GEQUAL = 0x7,
+};
+
+enum gen_aniso_ratio {
GEN6_ANISORATIO_2 = 0x0,
GEN6_ANISORATIO_4 = 0x1,
GEN6_ANISORATIO_6 = 0x2,
@@ -127,7 +138,7 @@ enum gen_sampler_aniso_ratio {
GEN6_ANISORATIO_16 = 0x7,
};
-enum gen_sampler_texcoord_mode {
+enum gen_texcoord_mode {
GEN6_TEXCOORDMODE_WRAP = 0x0,
GEN6_TEXCOORDMODE_MIRROR = 0x1,
GEN6_TEXCOORDMODE_CLAMP = 0x2,
@@ -137,15 +148,15 @@ enum gen_sampler_texcoord_mode {
GEN8_TEXCOORDMODE_HALF_BORDER = 0x6,
};
-enum gen_sampler_key_filter {
+enum gen_key_filter {
GEN6_KEYFILTER_KILL_ON_ANY_MATCH = 0x0,
GEN6_KEYFILTER_REPLACE_BLACK = 0x1,
};
#define GEN6_COLOR_CALC_STATE__SIZE 6
-#define GEN6_CC_DW0_STENCIL0_REF__MASK 0xff000000
-#define GEN6_CC_DW0_STENCIL0_REF__SHIFT 24
+#define GEN6_CC_DW0_STENCIL_REF__MASK 0xff000000
+#define GEN6_CC_DW0_STENCIL_REF__SHIFT 24
#define GEN6_CC_DW0_STENCIL1_REF__MASK 0x00ff0000
#define GEN6_CC_DW0_STENCIL1_REF__SHIFT 16
#define GEN6_CC_DW0_ROUND_DISABLE_DISABLE (0x1 << 15)
@@ -162,14 +173,14 @@ enum gen_sampler_key_filter {
#define GEN6_DEPTH_STENCIL_STATE__SIZE 3
#define GEN6_ZS_DW0_STENCIL_TEST_ENABLE (0x1 << 31)
-#define GEN6_ZS_DW0_STENCIL0_FUNC__MASK 0x70000000
-#define GEN6_ZS_DW0_STENCIL0_FUNC__SHIFT 28
-#define GEN6_ZS_DW0_STENCIL0_FAIL_OP__MASK 0x0e000000
-#define GEN6_ZS_DW0_STENCIL0_FAIL_OP__SHIFT 25
-#define GEN6_ZS_DW0_STENCIL0_ZFAIL_OP__MASK 0x01c00000
-#define GEN6_ZS_DW0_STENCIL0_ZFAIL_OP__SHIFT 22
-#define GEN6_ZS_DW0_STENCIL0_ZPASS_OP__MASK 0x00380000
-#define GEN6_ZS_DW0_STENCIL0_ZPASS_OP__SHIFT 19
+#define GEN6_ZS_DW0_STENCIL_FUNC__MASK 0x70000000
+#define GEN6_ZS_DW0_STENCIL_FUNC__SHIFT 28
+#define GEN6_ZS_DW0_STENCIL_FAIL_OP__MASK 0x0e000000
+#define GEN6_ZS_DW0_STENCIL_FAIL_OP__SHIFT 25
+#define GEN6_ZS_DW0_STENCIL_ZFAIL_OP__MASK 0x01c00000
+#define GEN6_ZS_DW0_STENCIL_ZFAIL_OP__SHIFT 22
+#define GEN6_ZS_DW0_STENCIL_ZPASS_OP__MASK 0x00380000
+#define GEN6_ZS_DW0_STENCIL_ZPASS_OP__SHIFT 19
#define GEN6_ZS_DW0_STENCIL_WRITE_ENABLE (0x1 << 18)
#define GEN6_ZS_DW0_STENCIL1_ENABLE (0x1 << 15)
#define GEN6_ZS_DW0_STENCIL1_FUNC__MASK 0x00007000
@@ -181,14 +192,14 @@ enum gen_sampler_key_filter {
#define GEN6_ZS_DW0_STENCIL1_ZPASS_OP__MASK 0x00000038
#define GEN6_ZS_DW0_STENCIL1_ZPASS_OP__SHIFT 3
-#define GEN6_ZS_DW1_STENCIL0_VALUEMASK__MASK 0xff000000
-#define GEN6_ZS_DW1_STENCIL0_VALUEMASK__SHIFT 24
-#define GEN6_ZS_DW1_STENCIL0_WRITEMASK__MASK 0x00ff0000
-#define GEN6_ZS_DW1_STENCIL0_WRITEMASK__SHIFT 16
-#define GEN6_ZS_DW1_STENCIL1_VALUEMASK__MASK 0x0000ff00
-#define GEN6_ZS_DW1_STENCIL1_VALUEMASK__SHIFT 8
-#define GEN6_ZS_DW1_STENCIL1_WRITEMASK__MASK 0x000000ff
-#define GEN6_ZS_DW1_STENCIL1_WRITEMASK__SHIFT 0
+#define GEN6_ZS_DW1_STENCIL_TEST_MASK__MASK 0xff000000
+#define GEN6_ZS_DW1_STENCIL_TEST_MASK__SHIFT 24
+#define GEN6_ZS_DW1_STENCIL_WRITE_MASK__MASK 0x00ff0000
+#define GEN6_ZS_DW1_STENCIL_WRITE_MASK__SHIFT 16
+#define GEN6_ZS_DW1_STENCIL1_TEST_MASK__MASK 0x0000ff00
+#define GEN6_ZS_DW1_STENCIL1_TEST_MASK__SHIFT 8
+#define GEN6_ZS_DW1_STENCIL1_WRITE_MASK__MASK 0x000000ff
+#define GEN6_ZS_DW1_STENCIL1_WRITE_MASK__SHIFT 0
#define GEN6_ZS_DW2_DEPTH_TEST_ENABLE (0x1 << 31)
#define GEN6_ZS_DW2_DEPTH_FUNC__MASK 0x38000000
@@ -216,10 +227,12 @@ enum gen_sampler_key_filter {
#define GEN6_RT_DW1_ALPHA_TO_COVERAGE (0x1 << 31)
#define GEN6_RT_DW1_ALPHA_TO_ONE (0x1 << 30)
#define GEN6_RT_DW1_ALPHA_TO_COVERAGE_DITHER (0x1 << 29)
-#define GEN6_RT_DW1_WRITE_DISABLE_A (0x1 << 27)
-#define GEN6_RT_DW1_WRITE_DISABLE_R (0x1 << 26)
-#define GEN6_RT_DW1_WRITE_DISABLE_G (0x1 << 25)
-#define GEN6_RT_DW1_WRITE_DISABLE_B (0x1 << 24)
+#define GEN6_RT_DW1_WRITE_DISABLES__MASK 0x0f000000
+#define GEN6_RT_DW1_WRITE_DISABLES__SHIFT 24
+#define GEN6_RT_DW1_WRITE_DISABLES_A (0x1 << 27)
+#define GEN6_RT_DW1_WRITE_DISABLES_R (0x1 << 26)
+#define GEN6_RT_DW1_WRITE_DISABLES_G (0x1 << 25)
+#define GEN6_RT_DW1_WRITE_DISABLES_B (0x1 << 24)
#define GEN6_RT_DW1_LOGICOP_ENABLE (0x1 << 22)
#define GEN6_RT_DW1_LOGICOP_FUNC__MASK 0x003c0000
#define GEN6_RT_DW1_LOGICOP_FUNC__SHIFT 18
@@ -267,10 +280,12 @@ enum gen_sampler_key_filter {
#define GEN8_RT_DW0_DST_ALPHA_FACTOR__SHIFT 8
#define GEN8_RT_DW0_ALPHA_FUNC__MASK 0x000000e0
#define GEN8_RT_DW0_ALPHA_FUNC__SHIFT 5
-#define GEN8_RT_DW0_WRITE_DISABLE_A (0x1 << 3)
-#define GEN8_RT_DW0_WRITE_DISABLE_R (0x1 << 2)
-#define GEN8_RT_DW0_WRITE_DISABLE_G (0x1 << 1)
-#define GEN8_RT_DW0_WRITE_DISABLE_B (0x1 << 0)
+#define GEN8_RT_DW0_WRITE_DISABLES__MASK 0x0000000f
+#define GEN8_RT_DW0_WRITE_DISABLES__SHIFT 0
+#define GEN8_RT_DW0_WRITE_DISABLES_A (0x1 << 3)
+#define GEN8_RT_DW0_WRITE_DISABLES_R (0x1 << 2)
+#define GEN8_RT_DW0_WRITE_DISABLES_G (0x1 << 1)
+#define GEN8_RT_DW0_WRITE_DISABLES_B (0x1 << 0)
#define GEN8_RT_DW1_LOGICOP_ENABLE (0x1 << 31)
#define GEN8_RT_DW1_LOGICOP_FUNC__MASK 0x78000000
@@ -419,6 +434,7 @@ enum gen_sampler_key_filter {
#define GEN8_SAMPLER_DW0_LOD_PRECLAMP_ENABLE__SHIFT 27
#define GEN6_SAMPLER_DW0_BASE_LOD__MASK 0x07c00000
#define GEN6_SAMPLER_DW0_BASE_LOD__SHIFT 22
+#define GEN6_SAMPLER_DW0_BASE_LOD__RADIX 1
#define GEN6_SAMPLER_DW0_MIP_FILTER__MASK 0x00300000
#define GEN6_SAMPLER_DW0_MIP_FILTER__SHIFT 20
#define GEN6_SAMPLER_DW0_MAG_FILTER__MASK 0x000e0000
diff --git a/src/gallium/drivers/ilo/genhw/gen_render_surface.xml.h b/src/gallium/drivers/ilo/genhw/gen_render_surface.xml.h
index 7c2349f2447..b5d09f64429 100644
--- a/src/gallium/drivers/ilo/genhw/gen_render_surface.xml.h
+++ b/src/gallium/drivers/ilo/genhw/gen_render_surface.xml.h
@@ -299,7 +299,10 @@ enum gen_surface_scs {
#define GEN6_SURFACE_DW0_MIPLAYOUT__SHIFT 10
#define GEN6_SURFACE_DW0_MIPLAYOUT_BELOW (0x0 << 10)
#define GEN6_SURFACE_DW0_MIPLAYOUT_RIGHT (0x1 << 10)
-#define GEN6_SURFACE_DW0_CUBE_MAP_CORNER_MODE (0x1 << 9)
+#define GEN6_SURFACE_DW0_CUBE_MAP_CORNER_MODE__MASK 0x00000200
+#define GEN6_SURFACE_DW0_CUBE_MAP_CORNER_MODE__SHIFT 9
+#define GEN6_SURFACE_DW0_CUBE_MAP_CORNER_MODE_REPLICATE (0x0 << 9)
+#define GEN6_SURFACE_DW0_CUBE_MAP_CORNER_MODE_AVERAGE (0x1 << 9)
#define GEN6_SURFACE_DW0_RENDER_CACHE_RW (0x1 << 8)
#define GEN6_SURFACE_DW0_MEDIA_BOUNDARY_PIXEL_MODE__MASK 0x000000c0
#define GEN6_SURFACE_DW0_MEDIA_BOUNDARY_PIXEL_MODE__SHIFT 6
@@ -485,6 +488,8 @@ enum gen_surface_scs {
#define GEN7_SURFACE_DW7_CC_B__SHIFT 29
#define GEN7_SURFACE_DW7_CC_A__MASK 0x10000000
#define GEN7_SURFACE_DW7_CC_A__SHIFT 28
+#define GEN75_SURFACE_DW7_SCS__MASK 0x0fff0000
+#define GEN75_SURFACE_DW7_SCS__SHIFT 16
#define GEN75_SURFACE_DW7_SCS_R__MASK 0x0e000000
#define GEN75_SURFACE_DW7_SCS_R__SHIFT 25
#define GEN75_SURFACE_DW7_SCS_G__MASK 0x01c00000
diff --git a/src/gallium/drivers/ilo/genhw/genhw.h b/src/gallium/drivers/ilo/genhw/genhw.h
index 9e05bf5beca..3a777a18c2a 100644
--- a/src/gallium/drivers/ilo/genhw/genhw.h
+++ b/src/gallium/drivers/ilo/genhw/genhw.h
@@ -1,6 +1,4 @@
/*
- * Mesa 3-D graphics library
- *
* Copyright (C) 2014 LunarG, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
@@ -25,8 +23,9 @@
#ifndef GENHW_H
#define GENHW_H
-#include "pipe/p_compiler.h"
-#include "util/u_debug.h"
+#include <stdbool.h>
+#include <stdint.h>
+#include <assert.h>
#include "gen_regs.xml.h"
#include "gen_mi.xml.h"
diff --git a/src/gallium/drivers/ilo/ilo_blitter.h b/src/gallium/drivers/ilo/ilo_blitter.h
index 4284f415c1c..4eba8481c28 100644
--- a/src/gallium/drivers/ilo/ilo_blitter.h
+++ b/src/gallium/drivers/ilo/ilo_blitter.h
@@ -39,12 +39,6 @@ enum ilo_blitter_uses {
ILO_BLITTER_USE_FB_STENCIL = 1 << 4,
};
-enum ilo_blitter_rectlist_op {
- ILO_BLITTER_RECTLIST_CLEAR_ZS,
- ILO_BLITTER_RECTLIST_RESOLVE_Z,
- ILO_BLITTER_RECTLIST_RESOLVE_HIZ,
-};
-
struct blitter_context;
struct pipe_resource;
struct pipe_surface;
@@ -57,30 +51,42 @@ struct ilo_blitter {
/*
* A minimal context with the goal to send RECTLISTs down the pipeline.
*/
- enum ilo_blitter_rectlist_op op;
+ enum ilo_state_raster_earlyz_op earlyz_op;
+ bool earlyz_stencil_clear;
uint32_t uses;
bool initialized;
float vertices[3][2];
- struct ilo_ve_state ve;
- struct pipe_draw_info draw;
+ struct gen6_3dprimitive_info draw_info;
- struct ilo_viewport_cso viewport;
- struct ilo_dsa_state dsa;
+ uint32_t vf_data[4];
+ struct ilo_state_vf vf;
- struct {
- struct pipe_stencil_ref stencil_ref;
- ubyte alpha_ref;
- struct pipe_blend_color blend_color;
- } cc;
+ struct ilo_state_vs vs;
+ struct ilo_state_hs hs;
+ struct ilo_state_ds ds;
+ struct ilo_state_gs gs;
+
+ struct ilo_state_sol sol;
+
+ struct ilo_state_viewport vp;
+ uint32_t vp_data[20];
+
+ struct ilo_state_sbe sbe;
+ struct ilo_state_ps ps;
+ struct ilo_state_cc cc;
uint32_t depth_clear_value;
+ struct ilo_state_urb urb;
+
struct {
struct ilo_surface_cso dst;
unsigned width, height;
unsigned num_samples;
+
+ struct ilo_state_raster rs;
} fb;
};
diff --git a/src/gallium/drivers/ilo/ilo_blitter_pipe.c b/src/gallium/drivers/ilo/ilo_blitter_pipe.c
index c4c02bd3e53..0bfe7827f11 100644
--- a/src/gallium/drivers/ilo/ilo_blitter_pipe.c
+++ b/src/gallium/drivers/ilo/ilo_blitter_pipe.c
@@ -63,7 +63,7 @@ ilo_blitter_pipe_begin(struct ilo_blitter *blitter,
util_blitter_save_viewport(b, &vec->viewport.viewport0);
if (scissor_enable)
- util_blitter_save_scissor(b, &vec->scissor.scissor0);
+ util_blitter_save_scissor(b, &vec->viewport.scissor0);
switch (op) {
case ILO_BLITTER_PIPE_BLIT:
diff --git a/src/gallium/drivers/ilo/ilo_blitter_rectlist.c b/src/gallium/drivers/ilo/ilo_blitter_rectlist.c
index 6d8afed9dca..13c8f500680 100644
--- a/src/gallium/drivers/ilo/ilo_blitter_rectlist.c
+++ b/src/gallium/drivers/ilo/ilo_blitter_rectlist.c
@@ -25,7 +25,6 @@
* Chia-I Wu <[email protected]>
*/
-#include "core/ilo_state_3d.h"
#include "util/u_draw.h"
#include "util/u_pack_color.h"
@@ -40,45 +39,48 @@
static bool
ilo_blitter_set_invariants(struct ilo_blitter *blitter)
{
- struct pipe_vertex_element velem;
- struct pipe_viewport_state vp;
+ struct ilo_state_vf_element_info elem;
if (blitter->initialized)
return true;
+ /* a rectangle has 3 vertices in a RECTLIST */
+ blitter->draw_info.topology = GEN6_3DPRIM_RECTLIST;
+ blitter->draw_info.vertex_count = 3;
+ blitter->draw_info.instance_count = 1;
+
+ memset(&elem, 0, sizeof(elem));
/* only vertex X and Y */
- memset(&velem, 0, sizeof(velem));
- velem.src_format = PIPE_FORMAT_R32G32_FLOAT;
- ilo_gpe_init_ve(blitter->ilo->dev, 1, &velem, &blitter->ve);
-
- /* generate VUE header */
- ilo_gpe_init_ve_nosrc(blitter->ilo->dev,
- GEN6_VFCOMP_STORE_0, /* Reserved */
- GEN6_VFCOMP_STORE_0, /* Render Target Array Index */
- GEN6_VFCOMP_STORE_0, /* Viewport Index */
- GEN6_VFCOMP_STORE_0, /* Point Width */
- &blitter->ve.nosrc_cso);
- blitter->ve.prepend_nosrc_cso = true;
+ elem.format = GEN6_FORMAT_R32G32_FLOAT;
+ elem.format_size = 8;
+ elem.component_count = 2;
- /* a rectangle has 3 vertices in a RECTLIST */
- util_draw_init_info(&blitter->draw);
- blitter->draw.mode = ILO_PRIM_RECTANGLES;
- blitter->draw.count = 3;
+ ilo_state_vf_init_for_rectlist(&blitter->vf, blitter->ilo->dev,
+ blitter->vf_data, sizeof(blitter->vf_data), &elem, 1);
+
+ ilo_state_vs_init_disabled(&blitter->vs, blitter->ilo->dev);
+ ilo_state_hs_init_disabled(&blitter->hs, blitter->ilo->dev);
+ ilo_state_ds_init_disabled(&blitter->ds, blitter->ilo->dev);
+ ilo_state_gs_init_disabled(&blitter->gs, blitter->ilo->dev);
+ ilo_state_sol_init_disabled(&blitter->sol, blitter->ilo->dev, false);
/**
* From the Haswell PRM, volume 7, page 615:
*
* "The clear value must be between the min and max depth values
- * (inclusive) defined in the CC_VIEWPORT."
+ * (inclusive) defined in the CC_VIEWPORT."
*
* Even though clipping and viewport transformation will be disabled, we
* still need to set up the viewport states.
*/
- memset(&vp, 0, sizeof(vp));
- vp.scale[0] = 1.0f;
- vp.scale[1] = 1.0f;
- vp.scale[2] = 1.0f;
- ilo_gpe_set_viewport_cso(blitter->ilo->dev, &vp, &blitter->viewport);
+ ilo_state_viewport_init_for_rectlist(&blitter->vp, blitter->ilo->dev,
+ blitter->vp_data, sizeof(blitter->vp_data));
+
+ ilo_state_sbe_init_for_rectlist(&blitter->sbe, blitter->ilo->dev, 0, 0);
+ ilo_state_ps_init_disabled(&blitter->ps, blitter->ilo->dev);
+
+ ilo_state_urb_init_for_rectlist(&blitter->urb, blitter->ilo->dev,
+ ilo_state_vf_get_attr_count(&blitter->vf));
blitter->initialized = true;
@@ -86,10 +88,12 @@ ilo_blitter_set_invariants(struct ilo_blitter *blitter)
}
static void
-ilo_blitter_set_op(struct ilo_blitter *blitter,
- enum ilo_blitter_rectlist_op op)
+ilo_blitter_set_earlyz_op(struct ilo_blitter *blitter,
+ enum ilo_state_raster_earlyz_op op,
+ bool earlyz_stencil_clear)
{
- blitter->op = op;
+ blitter->earlyz_op = op;
+ blitter->earlyz_stencil_clear = earlyz_stencil_clear;
}
/**
@@ -117,18 +121,27 @@ ilo_blitter_set_rectlist(struct ilo_blitter *blitter,
}
static void
-ilo_blitter_set_clear_values(struct ilo_blitter *blitter,
- uint32_t depth, ubyte stencil)
+ilo_blitter_set_depth_clear_value(struct ilo_blitter *blitter,
+ uint32_t depth)
{
blitter->depth_clear_value = depth;
- blitter->cc.stencil_ref.ref_value[0] = stencil;
}
static void
-ilo_blitter_set_dsa(struct ilo_blitter *blitter,
- const struct pipe_depth_stencil_alpha_state *state)
+ilo_blitter_set_cc(struct ilo_blitter *blitter,
+ const struct ilo_state_cc_info *info)
+{
+ memset(&blitter->cc, 0, sizeof(blitter->cc));
+ ilo_state_cc_init(&blitter->cc, blitter->ilo->dev, info);
+}
+
+static void
+ilo_blitter_set_fb_rs(struct ilo_blitter *blitter)
{
- ilo_gpe_init_dsa(blitter->ilo->dev, state, &blitter->dsa);
+ memset(&blitter->fb.rs, 0, sizeof(blitter->fb.rs));
+ ilo_state_raster_init_for_rectlist(&blitter->fb.rs, blitter->ilo->dev,
+ blitter->fb.num_samples, blitter->earlyz_op,
+ blitter->earlyz_stencil_clear);
}
static void
@@ -146,6 +159,8 @@ ilo_blitter_set_fb(struct ilo_blitter *blitter,
blitter->fb.num_samples = 1;
memcpy(&blitter->fb.dst, cso, sizeof(*cso));
+
+ ilo_blitter_set_fb_rs(blitter);
}
static void
@@ -191,9 +206,9 @@ hiz_align_fb(struct ilo_blitter *blitter)
{
unsigned align_w, align_h;
- switch (blitter->op) {
- case ILO_BLITTER_RECTLIST_CLEAR_ZS:
- case ILO_BLITTER_RECTLIST_RESOLVE_Z:
+ switch (blitter->earlyz_op) {
+ case ILO_STATE_RASTER_EARLYZ_DEPTH_CLEAR:
+ case ILO_STATE_RASTER_EARLYZ_DEPTH_RESOLVE:
break;
default:
return;
@@ -328,7 +343,7 @@ ilo_blitter_rectlist_clear_zs(struct ilo_blitter *blitter,
double depth, unsigned stencil)
{
struct ilo_texture *tex = ilo_texture(zs->texture);
- struct pipe_depth_stencil_alpha_state dsa_state;
+ struct ilo_state_cc_info info;
uint32_t uses, clear_value;
if (!ilo_image_can_enable_aux(&tex->image, zs->u.tex.level))
@@ -368,17 +383,20 @@ ilo_blitter_rectlist_clear_zs(struct ilo_blitter *blitter,
* - [DevSNB] errata: For stencil buffer only clear, the previous
* depth clear value must be delivered during the clear."
*/
- memset(&dsa_state, 0, sizeof(dsa_state));
+ memset(&info, 0, sizeof(info));
- if (clear_flags & PIPE_CLEAR_DEPTH)
- dsa_state.depth.writemask = true;
+ if (clear_flags & PIPE_CLEAR_DEPTH) {
+ info.depth.cv_has_buffer = true;
+ info.depth.write_enable = true;
+ }
if (clear_flags & PIPE_CLEAR_STENCIL) {
- dsa_state.stencil[0].enabled = true;
- dsa_state.stencil[0].func = PIPE_FUNC_ALWAYS;
- dsa_state.stencil[0].fail_op = PIPE_STENCIL_OP_KEEP;
- dsa_state.stencil[0].zpass_op = PIPE_STENCIL_OP_REPLACE;
- dsa_state.stencil[0].zfail_op = PIPE_STENCIL_OP_KEEP;
+ info.stencil.cv_has_buffer = true;
+ info.stencil.test_enable = true;
+ info.stencil.front.test_func = GEN6_COMPAREFUNCTION_ALWAYS;
+ info.stencil.front.fail_op = GEN6_STENCILOP_KEEP;
+ info.stencil.front.zfail_op = GEN6_STENCILOP_KEEP;
+ info.stencil.front.zpass_op = GEN6_STENCILOP_REPLACE;
/*
* From the Ivy Bridge PRM, volume 2 part 1, page 277:
@@ -389,18 +407,21 @@ ilo_blitter_rectlist_clear_zs(struct ilo_blitter *blitter,
* - DEPTH_STENCIL_STATE::Stencil Test Mask must be 0xFF
* - DEPTH_STENCIL_STATE::Back Face Stencil Write Mask must be 0xFF
* - DEPTH_STENCIL_STATE::Back Face Stencil Test Mask must be 0xFF"
+ *
+ * Back face masks will be copied from front face masks.
*/
- dsa_state.stencil[0].valuemask = 0xff;
- dsa_state.stencil[0].writemask = 0xff;
- dsa_state.stencil[1].valuemask = 0xff;
- dsa_state.stencil[1].writemask = 0xff;
+ info.params.stencil_front.test_ref = (uint8_t) stencil;
+ info.params.stencil_front.test_mask = 0xff;
+ info.params.stencil_front.write_mask = 0xff;
}
ilo_blitter_set_invariants(blitter);
- ilo_blitter_set_op(blitter, ILO_BLITTER_RECTLIST_CLEAR_ZS);
+ ilo_blitter_set_earlyz_op(blitter,
+ ILO_STATE_RASTER_EARLYZ_DEPTH_CLEAR,
+ clear_flags & PIPE_CLEAR_STENCIL);
- ilo_blitter_set_dsa(blitter, &dsa_state);
- ilo_blitter_set_clear_values(blitter, clear_value, (ubyte) stencil);
+ ilo_blitter_set_cc(blitter, &info);
+ ilo_blitter_set_depth_clear_value(blitter, clear_value);
ilo_blitter_set_fb_from_surface(blitter, zs);
uses = ILO_BLITTER_USE_DSA;
@@ -421,7 +442,7 @@ ilo_blitter_rectlist_resolve_z(struct ilo_blitter *blitter,
unsigned level, unsigned slice)
{
struct ilo_texture *tex = ilo_texture(res);
- struct pipe_depth_stencil_alpha_state dsa_state;
+ struct ilo_state_cc_info info;
const struct ilo_texture_slice *s =
ilo_texture_get_slice(tex, level, slice);
@@ -435,16 +456,18 @@ ilo_blitter_rectlist_resolve_z(struct ilo_blitter *blitter,
* to NEVER. Depth Buffer Write Enable must be enabled. Stencil Test
* Enable and Stencil Buffer Write Enable must be disabled."
*/
- memset(&dsa_state, 0, sizeof(dsa_state));
- dsa_state.depth.writemask = true;
- dsa_state.depth.enabled = true;
- dsa_state.depth.func = PIPE_FUNC_NEVER;
+ memset(&info, 0, sizeof(info));
+ info.depth.cv_has_buffer = true;
+ info.depth.test_enable = true;
+ info.depth.write_enable = true;
+ info.depth.test_func = GEN6_COMPAREFUNCTION_NEVER;
ilo_blitter_set_invariants(blitter);
- ilo_blitter_set_op(blitter, ILO_BLITTER_RECTLIST_RESOLVE_Z);
+ ilo_blitter_set_earlyz_op(blitter,
+ ILO_STATE_RASTER_EARLYZ_DEPTH_RESOLVE, false);
- ilo_blitter_set_dsa(blitter, &dsa_state);
- ilo_blitter_set_clear_values(blitter, s->clear_value, 0);
+ ilo_blitter_set_cc(blitter, &info);
+ ilo_blitter_set_depth_clear_value(blitter, s->clear_value);
ilo_blitter_set_fb_from_resource(blitter, res, res->format, level, slice);
ilo_blitter_set_uses(blitter,
ILO_BLITTER_USE_DSA | ILO_BLITTER_USE_FB_DEPTH);
@@ -458,7 +481,7 @@ ilo_blitter_rectlist_resolve_hiz(struct ilo_blitter *blitter,
unsigned level, unsigned slice)
{
struct ilo_texture *tex = ilo_texture(res);
- struct pipe_depth_stencil_alpha_state dsa_state;
+ struct ilo_state_cc_info info;
if (!ilo_image_can_enable_aux(&tex->image, level))
return;
@@ -470,13 +493,15 @@ ilo_blitter_rectlist_resolve_hiz(struct ilo_blitter *blitter,
* disabled. Depth Buffer Write Enable must be enabled. Stencil Test
* Enable and Stencil Buffer Write Enable must be disabled."
*/
- memset(&dsa_state, 0, sizeof(dsa_state));
- dsa_state.depth.writemask = true;
+ memset(&info, 0, sizeof(info));
+ info.depth.cv_has_buffer = true;
+ info.depth.write_enable = true;
ilo_blitter_set_invariants(blitter);
- ilo_blitter_set_op(blitter, ILO_BLITTER_RECTLIST_RESOLVE_HIZ);
+ ilo_blitter_set_earlyz_op(blitter,
+ ILO_STATE_RASTER_EARLYZ_HIZ_RESOLVE, false);
- ilo_blitter_set_dsa(blitter, &dsa_state);
+ ilo_blitter_set_cc(blitter, &info);
ilo_blitter_set_fb_from_resource(blitter, res, res->format, level, slice);
ilo_blitter_set_uses(blitter,
ILO_BLITTER_USE_DSA | ILO_BLITTER_USE_FB_DEPTH);
diff --git a/src/gallium/drivers/ilo/ilo_draw.c b/src/gallium/drivers/ilo/ilo_draw.c
index fc91fd312d2..e8e1a4cd14c 100644
--- a/src/gallium/drivers/ilo/ilo_draw.c
+++ b/src/gallium/drivers/ilo/ilo_draw.c
@@ -452,12 +452,12 @@ draw_vbo_with_sw_restart(struct ilo_context *ilo,
} u;
/* we will draw with IB mapped */
- if (ib->buffer) {
- u.ptr = intel_bo_map(ilo_buffer(ib->buffer)->bo, false);
+ if (ib->state.buffer) {
+ u.ptr = intel_bo_map(ilo_buffer(ib->state.buffer)->bo, false);
if (u.ptr)
- u.u8 += ib->offset;
+ u.u8 += ib->state.offset;
} else {
- u.ptr = ib->user_buffer;
+ u.ptr = ib->state.user_buffer;
}
if (!u.ptr)
@@ -483,7 +483,7 @@ draw_vbo_with_sw_restart(struct ilo_context *ilo,
(pipe)->draw_vbo(pipe, &subinfo); \
} while (0)
- switch (ib->index_size) {
+ switch (ib->state.index_size) {
case 1:
DRAW_VBO_WITH_SW_RESTART(&ilo->base, info, u.u8);
break;
@@ -500,8 +500,8 @@ draw_vbo_with_sw_restart(struct ilo_context *ilo,
#undef DRAW_VBO_WITH_SW_RESTART
- if (ib->buffer)
- intel_bo_unmap(ilo_buffer(ib->buffer)->bo);
+ if (ib->state.buffer)
+ intel_bo_unmap(ilo_buffer(ib->state.buffer)->bo);
}
static bool
@@ -511,9 +511,9 @@ draw_vbo_need_sw_restart(const struct ilo_context *ilo,
/* the restart index is fixed prior to GEN7.5 */
if (ilo_dev_gen(ilo->dev) < ILO_GEN(7.5)) {
const unsigned cut_index =
- (ilo->state_vector.ib.index_size == 1) ? 0xff :
- (ilo->state_vector.ib.index_size == 2) ? 0xffff :
- (ilo->state_vector.ib.index_size == 4) ? 0xffffffff : 0;
+ (ilo->state_vector.ib.state.index_size == 1) ? 0xff :
+ (ilo->state_vector.ib.state.index_size == 2) ? 0xffff :
+ (ilo->state_vector.ib.state.index_size == 4) ? 0xffffffff : 0;
if (info->restart_index < cut_index)
return true;
diff --git a/src/gallium/drivers/ilo/ilo_format.c b/src/gallium/drivers/ilo/ilo_format.c
new file mode 100644
index 00000000000..ca7e6b55ca1
--- /dev/null
+++ b/src/gallium/drivers/ilo/ilo_format.c
@@ -0,0 +1,356 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2012-2013 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Chia-I Wu <[email protected]>
+ */
+
+#include "genhw/genhw.h"
+#include "core/ilo_state_surface.h"
+#include "core/ilo_state_vf.h"
+#include "ilo_format.h"
+
+bool
+ilo_format_support_vb(const struct ilo_dev *dev,
+ enum pipe_format format)
+{
+ const int idx = ilo_format_translate(dev, format, PIPE_BIND_VERTEX_BUFFER);
+
+ return (idx >= 0 && ilo_state_vf_valid_element_format(dev, idx));
+}
+
+bool
+ilo_format_support_sol(const struct ilo_dev *dev,
+ enum pipe_format format)
+{
+ const int idx = ilo_format_translate(dev, format, PIPE_BIND_STREAM_OUTPUT);
+
+ return (idx >= 0 && ilo_state_surface_valid_format(dev,
+ ILO_STATE_SURFACE_ACCESS_DP_SVB, idx));
+}
+
+bool
+ilo_format_support_sampler(const struct ilo_dev *dev,
+ enum pipe_format format)
+{
+ const int idx = ilo_format_translate(dev, format, PIPE_BIND_SAMPLER_VIEW);
+
+ return (idx >= 0 && ilo_state_surface_valid_format(dev,
+ ILO_STATE_SURFACE_ACCESS_SAMPLER, idx));
+}
+
+bool
+ilo_format_support_rt(const struct ilo_dev *dev,
+ enum pipe_format format)
+{
+ const int idx = ilo_format_translate(dev, format, PIPE_BIND_RENDER_TARGET);
+
+ return (idx >= 0 && ilo_state_surface_valid_format(dev,
+ ILO_STATE_SURFACE_ACCESS_DP_RENDER, idx));
+}
+
+bool
+ilo_format_support_zs(const struct ilo_dev *dev,
+ enum pipe_format format)
+{
+ switch (format) {
+ case PIPE_FORMAT_Z16_UNORM:
+ case PIPE_FORMAT_Z24X8_UNORM:
+ case PIPE_FORMAT_Z32_FLOAT:
+ case PIPE_FORMAT_Z24_UNORM_S8_UINT:
+ case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+ return true;
+ case PIPE_FORMAT_S8_UINT:
+ /* TODO separate stencil */
+ default:
+ return false;
+ }
+}
+
+/**
+ * Translate a color (non-depth/stencil) pipe format to the matching hardware
+ * format. Return -1 on errors.
+ */
+int
+ilo_format_translate_color(const struct ilo_dev *dev,
+ enum pipe_format format)
+{
+ static const int format_mapping[PIPE_FORMAT_COUNT] = {
+ [PIPE_FORMAT_NONE] = 0,
+ [PIPE_FORMAT_B8G8R8A8_UNORM] = GEN6_FORMAT_B8G8R8A8_UNORM,
+ [PIPE_FORMAT_B8G8R8X8_UNORM] = GEN6_FORMAT_B8G8R8X8_UNORM,
+ [PIPE_FORMAT_A8R8G8B8_UNORM] = 0,
+ [PIPE_FORMAT_X8R8G8B8_UNORM] = 0,
+ [PIPE_FORMAT_B5G5R5A1_UNORM] = GEN6_FORMAT_B5G5R5A1_UNORM,
+ [PIPE_FORMAT_B4G4R4A4_UNORM] = GEN6_FORMAT_B4G4R4A4_UNORM,
+ [PIPE_FORMAT_B5G6R5_UNORM] = GEN6_FORMAT_B5G6R5_UNORM,
+ [PIPE_FORMAT_R10G10B10A2_UNORM] = GEN6_FORMAT_R10G10B10A2_UNORM,
+ [PIPE_FORMAT_L8_UNORM] = GEN6_FORMAT_L8_UNORM,
+ [PIPE_FORMAT_A8_UNORM] = GEN6_FORMAT_A8_UNORM,
+ [PIPE_FORMAT_I8_UNORM] = GEN6_FORMAT_I8_UNORM,
+ [PIPE_FORMAT_L8A8_UNORM] = GEN6_FORMAT_L8A8_UNORM,
+ [PIPE_FORMAT_L16_UNORM] = GEN6_FORMAT_L16_UNORM,
+ [PIPE_FORMAT_UYVY] = GEN6_FORMAT_YCRCB_SWAPUVY,
+ [PIPE_FORMAT_YUYV] = GEN6_FORMAT_YCRCB_NORMAL,
+ [PIPE_FORMAT_Z16_UNORM] = 0,
+ [PIPE_FORMAT_Z32_UNORM] = 0,
+ [PIPE_FORMAT_Z32_FLOAT] = 0,
+ [PIPE_FORMAT_Z24_UNORM_S8_UINT] = 0,
+ [PIPE_FORMAT_S8_UINT_Z24_UNORM] = 0,
+ [PIPE_FORMAT_Z24X8_UNORM] = 0,
+ [PIPE_FORMAT_X8Z24_UNORM] = 0,
+ [PIPE_FORMAT_S8_UINT] = 0,
+ [PIPE_FORMAT_R64_FLOAT] = GEN6_FORMAT_R64_FLOAT,
+ [PIPE_FORMAT_R64G64_FLOAT] = GEN6_FORMAT_R64G64_FLOAT,
+ [PIPE_FORMAT_R64G64B64_FLOAT] = GEN6_FORMAT_R64G64B64_FLOAT,
+ [PIPE_FORMAT_R64G64B64A64_FLOAT] = GEN6_FORMAT_R64G64B64A64_FLOAT,
+ [PIPE_FORMAT_R32_FLOAT] = GEN6_FORMAT_R32_FLOAT,
+ [PIPE_FORMAT_R32G32_FLOAT] = GEN6_FORMAT_R32G32_FLOAT,
+ [PIPE_FORMAT_R32G32B32_FLOAT] = GEN6_FORMAT_R32G32B32_FLOAT,
+ [PIPE_FORMAT_R32G32B32A32_FLOAT] = GEN6_FORMAT_R32G32B32A32_FLOAT,
+ [PIPE_FORMAT_R32_UNORM] = GEN6_FORMAT_R32_UNORM,
+ [PIPE_FORMAT_R32G32_UNORM] = GEN6_FORMAT_R32G32_UNORM,
+ [PIPE_FORMAT_R32G32B32_UNORM] = GEN6_FORMAT_R32G32B32_UNORM,
+ [PIPE_FORMAT_R32G32B32A32_UNORM] = GEN6_FORMAT_R32G32B32A32_UNORM,
+ [PIPE_FORMAT_R32_USCALED] = GEN6_FORMAT_R32_USCALED,
+ [PIPE_FORMAT_R32G32_USCALED] = GEN6_FORMAT_R32G32_USCALED,
+ [PIPE_FORMAT_R32G32B32_USCALED] = GEN6_FORMAT_R32G32B32_USCALED,
+ [PIPE_FORMAT_R32G32B32A32_USCALED] = GEN6_FORMAT_R32G32B32A32_USCALED,
+ [PIPE_FORMAT_R32_SNORM] = GEN6_FORMAT_R32_SNORM,
+ [PIPE_FORMAT_R32G32_SNORM] = GEN6_FORMAT_R32G32_SNORM,
+ [PIPE_FORMAT_R32G32B32_SNORM] = GEN6_FORMAT_R32G32B32_SNORM,
+ [PIPE_FORMAT_R32G32B32A32_SNORM] = GEN6_FORMAT_R32G32B32A32_SNORM,
+ [PIPE_FORMAT_R32_SSCALED] = GEN6_FORMAT_R32_SSCALED,
+ [PIPE_FORMAT_R32G32_SSCALED] = GEN6_FORMAT_R32G32_SSCALED,
+ [PIPE_FORMAT_R32G32B32_SSCALED] = GEN6_FORMAT_R32G32B32_SSCALED,
+ [PIPE_FORMAT_R32G32B32A32_SSCALED] = GEN6_FORMAT_R32G32B32A32_SSCALED,
+ [PIPE_FORMAT_R16_UNORM] = GEN6_FORMAT_R16_UNORM,
+ [PIPE_FORMAT_R16G16_UNORM] = GEN6_FORMAT_R16G16_UNORM,
+ [PIPE_FORMAT_R16G16B16_UNORM] = GEN6_FORMAT_R16G16B16_UNORM,
+ [PIPE_FORMAT_R16G16B16A16_UNORM] = GEN6_FORMAT_R16G16B16A16_UNORM,
+ [PIPE_FORMAT_R16_USCALED] = GEN6_FORMAT_R16_USCALED,
+ [PIPE_FORMAT_R16G16_USCALED] = GEN6_FORMAT_R16G16_USCALED,
+ [PIPE_FORMAT_R16G16B16_USCALED] = GEN6_FORMAT_R16G16B16_USCALED,
+ [PIPE_FORMAT_R16G16B16A16_USCALED] = GEN6_FORMAT_R16G16B16A16_USCALED,
+ [PIPE_FORMAT_R16_SNORM] = GEN6_FORMAT_R16_SNORM,
+ [PIPE_FORMAT_R16G16_SNORM] = GEN6_FORMAT_R16G16_SNORM,
+ [PIPE_FORMAT_R16G16B16_SNORM] = GEN6_FORMAT_R16G16B16_SNORM,
+ [PIPE_FORMAT_R16G16B16A16_SNORM] = GEN6_FORMAT_R16G16B16A16_SNORM,
+ [PIPE_FORMAT_R16_SSCALED] = GEN6_FORMAT_R16_SSCALED,
+ [PIPE_FORMAT_R16G16_SSCALED] = GEN6_FORMAT_R16G16_SSCALED,
+ [PIPE_FORMAT_R16G16B16_SSCALED] = GEN6_FORMAT_R16G16B16_SSCALED,
+ [PIPE_FORMAT_R16G16B16A16_SSCALED] = GEN6_FORMAT_R16G16B16A16_SSCALED,
+ [PIPE_FORMAT_R8_UNORM] = GEN6_FORMAT_R8_UNORM,
+ [PIPE_FORMAT_R8G8_UNORM] = GEN6_FORMAT_R8G8_UNORM,
+ [PIPE_FORMAT_R8G8B8_UNORM] = GEN6_FORMAT_R8G8B8_UNORM,
+ [PIPE_FORMAT_R8G8B8A8_UNORM] = GEN6_FORMAT_R8G8B8A8_UNORM,
+ [PIPE_FORMAT_X8B8G8R8_UNORM] = 0,
+ [PIPE_FORMAT_R8_USCALED] = GEN6_FORMAT_R8_USCALED,
+ [PIPE_FORMAT_R8G8_USCALED] = GEN6_FORMAT_R8G8_USCALED,
+ [PIPE_FORMAT_R8G8B8_USCALED] = GEN6_FORMAT_R8G8B8_USCALED,
+ [PIPE_FORMAT_R8G8B8A8_USCALED] = GEN6_FORMAT_R8G8B8A8_USCALED,
+ [PIPE_FORMAT_R8_SNORM] = GEN6_FORMAT_R8_SNORM,
+ [PIPE_FORMAT_R8G8_SNORM] = GEN6_FORMAT_R8G8_SNORM,
+ [PIPE_FORMAT_R8G8B8_SNORM] = GEN6_FORMAT_R8G8B8_SNORM,
+ [PIPE_FORMAT_R8G8B8A8_SNORM] = GEN6_FORMAT_R8G8B8A8_SNORM,
+ [PIPE_FORMAT_R8_SSCALED] = GEN6_FORMAT_R8_SSCALED,
+ [PIPE_FORMAT_R8G8_SSCALED] = GEN6_FORMAT_R8G8_SSCALED,
+ [PIPE_FORMAT_R8G8B8_SSCALED] = GEN6_FORMAT_R8G8B8_SSCALED,
+ [PIPE_FORMAT_R8G8B8A8_SSCALED] = GEN6_FORMAT_R8G8B8A8_SSCALED,
+ [PIPE_FORMAT_R32_FIXED] = GEN6_FORMAT_R32_SFIXED,
+ [PIPE_FORMAT_R32G32_FIXED] = GEN6_FORMAT_R32G32_SFIXED,
+ [PIPE_FORMAT_R32G32B32_FIXED] = GEN6_FORMAT_R32G32B32_SFIXED,
+ [PIPE_FORMAT_R32G32B32A32_FIXED] = GEN6_FORMAT_R32G32B32A32_SFIXED,
+ [PIPE_FORMAT_R16_FLOAT] = GEN6_FORMAT_R16_FLOAT,
+ [PIPE_FORMAT_R16G16_FLOAT] = GEN6_FORMAT_R16G16_FLOAT,
+ [PIPE_FORMAT_R16G16B16_FLOAT] = GEN6_FORMAT_R16G16B16_FLOAT,
+ [PIPE_FORMAT_R16G16B16A16_FLOAT] = GEN6_FORMAT_R16G16B16A16_FLOAT,
+ [PIPE_FORMAT_L8_SRGB] = GEN6_FORMAT_L8_UNORM_SRGB,
+ [PIPE_FORMAT_L8A8_SRGB] = GEN6_FORMAT_L8A8_UNORM_SRGB,
+ [PIPE_FORMAT_R8G8B8_SRGB] = GEN6_FORMAT_R8G8B8_UNORM_SRGB,
+ [PIPE_FORMAT_A8B8G8R8_SRGB] = 0,
+ [PIPE_FORMAT_X8B8G8R8_SRGB] = 0,
+ [PIPE_FORMAT_B8G8R8A8_SRGB] = GEN6_FORMAT_B8G8R8A8_UNORM_SRGB,
+ [PIPE_FORMAT_B8G8R8X8_SRGB] = GEN6_FORMAT_B8G8R8X8_UNORM_SRGB,
+ [PIPE_FORMAT_A8R8G8B8_SRGB] = 0,
+ [PIPE_FORMAT_X8R8G8B8_SRGB] = 0,
+ [PIPE_FORMAT_R8G8B8A8_SRGB] = GEN6_FORMAT_R8G8B8A8_UNORM_SRGB,
+ [PIPE_FORMAT_DXT1_RGB] = GEN6_FORMAT_DXT1_RGB,
+ [PIPE_FORMAT_DXT1_RGBA] = GEN6_FORMAT_BC1_UNORM,
+ [PIPE_FORMAT_DXT3_RGBA] = GEN6_FORMAT_BC2_UNORM,
+ [PIPE_FORMAT_DXT5_RGBA] = GEN6_FORMAT_BC3_UNORM,
+ [PIPE_FORMAT_DXT1_SRGB] = GEN6_FORMAT_DXT1_RGB_SRGB,
+ [PIPE_FORMAT_DXT1_SRGBA] = GEN6_FORMAT_BC1_UNORM_SRGB,
+ [PIPE_FORMAT_DXT3_SRGBA] = GEN6_FORMAT_BC2_UNORM_SRGB,
+ [PIPE_FORMAT_DXT5_SRGBA] = GEN6_FORMAT_BC3_UNORM_SRGB,
+ [PIPE_FORMAT_RGTC1_UNORM] = GEN6_FORMAT_BC4_UNORM,
+ [PIPE_FORMAT_RGTC1_SNORM] = GEN6_FORMAT_BC4_SNORM,
+ [PIPE_FORMAT_RGTC2_UNORM] = GEN6_FORMAT_BC5_UNORM,
+ [PIPE_FORMAT_RGTC2_SNORM] = GEN6_FORMAT_BC5_SNORM,
+ [PIPE_FORMAT_R8G8_B8G8_UNORM] = 0,
+ [PIPE_FORMAT_G8R8_G8B8_UNORM] = 0,
+ [PIPE_FORMAT_R8SG8SB8UX8U_NORM] = 0,
+ [PIPE_FORMAT_R5SG5SB6U_NORM] = 0,
+ [PIPE_FORMAT_A8B8G8R8_UNORM] = 0,
+ [PIPE_FORMAT_B5G5R5X1_UNORM] = GEN6_FORMAT_B5G5R5X1_UNORM,
+ [PIPE_FORMAT_R10G10B10A2_USCALED] = GEN6_FORMAT_R10G10B10A2_USCALED,
+ [PIPE_FORMAT_R11G11B10_FLOAT] = GEN6_FORMAT_R11G11B10_FLOAT,
+ [PIPE_FORMAT_R9G9B9E5_FLOAT] = GEN6_FORMAT_R9G9B9E5_SHAREDEXP,
+ [PIPE_FORMAT_Z32_FLOAT_S8X24_UINT] = 0,
+ [PIPE_FORMAT_R1_UNORM] = GEN6_FORMAT_R1_UNORM,
+ [PIPE_FORMAT_R10G10B10X2_USCALED] = GEN6_FORMAT_R10G10B10X2_USCALED,
+ [PIPE_FORMAT_R10G10B10X2_SNORM] = 0,
+ [PIPE_FORMAT_L4A4_UNORM] = 0,
+ [PIPE_FORMAT_B10G10R10A2_UNORM] = GEN6_FORMAT_B10G10R10A2_UNORM,
+ [PIPE_FORMAT_R10SG10SB10SA2U_NORM] = 0,
+ [PIPE_FORMAT_R8G8Bx_SNORM] = 0,
+ [PIPE_FORMAT_R8G8B8X8_UNORM] = GEN6_FORMAT_R8G8B8X8_UNORM,
+ [PIPE_FORMAT_B4G4R4X4_UNORM] = 0,
+ [PIPE_FORMAT_X24S8_UINT] = 0,
+ [PIPE_FORMAT_S8X24_UINT] = 0,
+ [PIPE_FORMAT_X32_S8X24_UINT] = 0,
+ [PIPE_FORMAT_B2G3R3_UNORM] = 0,
+ [PIPE_FORMAT_L16A16_UNORM] = GEN6_FORMAT_L16A16_UNORM,
+ [PIPE_FORMAT_A16_UNORM] = GEN6_FORMAT_A16_UNORM,
+ [PIPE_FORMAT_I16_UNORM] = GEN6_FORMAT_I16_UNORM,
+ [PIPE_FORMAT_LATC1_UNORM] = 0,
+ [PIPE_FORMAT_LATC1_SNORM] = 0,
+ [PIPE_FORMAT_LATC2_UNORM] = 0,
+ [PIPE_FORMAT_LATC2_SNORM] = 0,
+ [PIPE_FORMAT_A8_SNORM] = 0,
+ [PIPE_FORMAT_L8_SNORM] = 0,
+ [PIPE_FORMAT_L8A8_SNORM] = 0,
+ [PIPE_FORMAT_I8_SNORM] = 0,
+ [PIPE_FORMAT_A16_SNORM] = 0,
+ [PIPE_FORMAT_L16_SNORM] = 0,
+ [PIPE_FORMAT_L16A16_SNORM] = 0,
+ [PIPE_FORMAT_I16_SNORM] = 0,
+ [PIPE_FORMAT_A16_FLOAT] = GEN6_FORMAT_A16_FLOAT,
+ [PIPE_FORMAT_L16_FLOAT] = GEN6_FORMAT_L16_FLOAT,
+ [PIPE_FORMAT_L16A16_FLOAT] = GEN6_FORMAT_L16A16_FLOAT,
+ [PIPE_FORMAT_I16_FLOAT] = GEN6_FORMAT_I16_FLOAT,
+ [PIPE_FORMAT_A32_FLOAT] = GEN6_FORMAT_A32_FLOAT,
+ [PIPE_FORMAT_L32_FLOAT] = GEN6_FORMAT_L32_FLOAT,
+ [PIPE_FORMAT_L32A32_FLOAT] = GEN6_FORMAT_L32A32_FLOAT,
+ [PIPE_FORMAT_I32_FLOAT] = GEN6_FORMAT_I32_FLOAT,
+ [PIPE_FORMAT_YV12] = 0,
+ [PIPE_FORMAT_YV16] = 0,
+ [PIPE_FORMAT_IYUV] = 0,
+ [PIPE_FORMAT_NV12] = 0,
+ [PIPE_FORMAT_NV21] = 0,
+ [PIPE_FORMAT_A4R4_UNORM] = 0,
+ [PIPE_FORMAT_R4A4_UNORM] = 0,
+ [PIPE_FORMAT_R8A8_UNORM] = 0,
+ [PIPE_FORMAT_A8R8_UNORM] = 0,
+ [PIPE_FORMAT_R10G10B10A2_SSCALED] = GEN6_FORMAT_R10G10B10A2_SSCALED,
+ [PIPE_FORMAT_R10G10B10A2_SNORM] = GEN6_FORMAT_R10G10B10A2_SNORM,
+ [PIPE_FORMAT_B10G10R10A2_USCALED] = GEN6_FORMAT_B10G10R10A2_USCALED,
+ [PIPE_FORMAT_B10G10R10A2_SSCALED] = GEN6_FORMAT_B10G10R10A2_SSCALED,
+ [PIPE_FORMAT_B10G10R10A2_SNORM] = GEN6_FORMAT_B10G10R10A2_SNORM,
+ [PIPE_FORMAT_R8_UINT] = GEN6_FORMAT_R8_UINT,
+ [PIPE_FORMAT_R8G8_UINT] = GEN6_FORMAT_R8G8_UINT,
+ [PIPE_FORMAT_R8G8B8_UINT] = GEN6_FORMAT_R8G8B8_UINT,
+ [PIPE_FORMAT_R8G8B8A8_UINT] = GEN6_FORMAT_R8G8B8A8_UINT,
+ [PIPE_FORMAT_R8_SINT] = GEN6_FORMAT_R8_SINT,
+ [PIPE_FORMAT_R8G8_SINT] = GEN6_FORMAT_R8G8_SINT,
+ [PIPE_FORMAT_R8G8B8_SINT] = GEN6_FORMAT_R8G8B8_SINT,
+ [PIPE_FORMAT_R8G8B8A8_SINT] = GEN6_FORMAT_R8G8B8A8_SINT,
+ [PIPE_FORMAT_R16_UINT] = GEN6_FORMAT_R16_UINT,
+ [PIPE_FORMAT_R16G16_UINT] = GEN6_FORMAT_R16G16_UINT,
+ [PIPE_FORMAT_R16G16B16_UINT] = GEN6_FORMAT_R16G16B16_UINT,
+ [PIPE_FORMAT_R16G16B16A16_UINT] = GEN6_FORMAT_R16G16B16A16_UINT,
+ [PIPE_FORMAT_R16_SINT] = GEN6_FORMAT_R16_SINT,
+ [PIPE_FORMAT_R16G16_SINT] = GEN6_FORMAT_R16G16_SINT,
+ [PIPE_FORMAT_R16G16B16_SINT] = GEN6_FORMAT_R16G16B16_SINT,
+ [PIPE_FORMAT_R16G16B16A16_SINT] = GEN6_FORMAT_R16G16B16A16_SINT,
+ [PIPE_FORMAT_R32_UINT] = GEN6_FORMAT_R32_UINT,
+ [PIPE_FORMAT_R32G32_UINT] = GEN6_FORMAT_R32G32_UINT,
+ [PIPE_FORMAT_R32G32B32_UINT] = GEN6_FORMAT_R32G32B32_UINT,
+ [PIPE_FORMAT_R32G32B32A32_UINT] = GEN6_FORMAT_R32G32B32A32_UINT,
+ [PIPE_FORMAT_R32_SINT] = GEN6_FORMAT_R32_SINT,
+ [PIPE_FORMAT_R32G32_SINT] = GEN6_FORMAT_R32G32_SINT,
+ [PIPE_FORMAT_R32G32B32_SINT] = GEN6_FORMAT_R32G32B32_SINT,
+ [PIPE_FORMAT_R32G32B32A32_SINT] = GEN6_FORMAT_R32G32B32A32_SINT,
+ [PIPE_FORMAT_A8_UINT] = 0,
+ [PIPE_FORMAT_I8_UINT] = GEN6_FORMAT_I8_UINT,
+ [PIPE_FORMAT_L8_UINT] = GEN6_FORMAT_L8_UINT,
+ [PIPE_FORMAT_L8A8_UINT] = GEN6_FORMAT_L8A8_UINT,
+ [PIPE_FORMAT_A8_SINT] = 0,
+ [PIPE_FORMAT_I8_SINT] = GEN6_FORMAT_I8_SINT,
+ [PIPE_FORMAT_L8_SINT] = GEN6_FORMAT_L8_SINT,
+ [PIPE_FORMAT_L8A8_SINT] = GEN6_FORMAT_L8A8_SINT,
+ [PIPE_FORMAT_A16_UINT] = 0,
+ [PIPE_FORMAT_I16_UINT] = 0,
+ [PIPE_FORMAT_L16_UINT] = 0,
+ [PIPE_FORMAT_L16A16_UINT] = 0,
+ [PIPE_FORMAT_A16_SINT] = 0,
+ [PIPE_FORMAT_I16_SINT] = 0,
+ [PIPE_FORMAT_L16_SINT] = 0,
+ [PIPE_FORMAT_L16A16_SINT] = 0,
+ [PIPE_FORMAT_A32_UINT] = 0,
+ [PIPE_FORMAT_I32_UINT] = 0,
+ [PIPE_FORMAT_L32_UINT] = 0,
+ [PIPE_FORMAT_L32A32_UINT] = 0,
+ [PIPE_FORMAT_A32_SINT] = 0,
+ [PIPE_FORMAT_I32_SINT] = 0,
+ [PIPE_FORMAT_L32_SINT] = 0,
+ [PIPE_FORMAT_L32A32_SINT] = 0,
+ [PIPE_FORMAT_B10G10R10A2_UINT] = GEN6_FORMAT_B10G10R10A2_UINT,
+ [PIPE_FORMAT_ETC1_RGB8] = GEN6_FORMAT_ETC1_RGB8,
+ [PIPE_FORMAT_R8G8_R8B8_UNORM] = 0,
+ [PIPE_FORMAT_G8R8_B8R8_UNORM] = 0,
+ [PIPE_FORMAT_R8G8B8X8_SNORM] = 0,
+ [PIPE_FORMAT_R8G8B8X8_SRGB] = 0,
+ [PIPE_FORMAT_R8G8B8X8_UINT] = 0,
+ [PIPE_FORMAT_R8G8B8X8_SINT] = 0,
+ [PIPE_FORMAT_B10G10R10X2_UNORM] = GEN6_FORMAT_B10G10R10X2_UNORM,
+ [PIPE_FORMAT_R16G16B16X16_UNORM] = GEN6_FORMAT_R16G16B16X16_UNORM,
+ [PIPE_FORMAT_R16G16B16X16_SNORM] = 0,
+ [PIPE_FORMAT_R16G16B16X16_FLOAT] = GEN6_FORMAT_R16G16B16X16_FLOAT,
+ [PIPE_FORMAT_R16G16B16X16_UINT] = 0,
+ [PIPE_FORMAT_R16G16B16X16_SINT] = 0,
+ [PIPE_FORMAT_R32G32B32X32_FLOAT] = GEN6_FORMAT_R32G32B32X32_FLOAT,
+ [PIPE_FORMAT_R32G32B32X32_UINT] = 0,
+ [PIPE_FORMAT_R32G32B32X32_SINT] = 0,
+ [PIPE_FORMAT_R8A8_SNORM] = 0,
+ [PIPE_FORMAT_R16A16_UNORM] = 0,
+ [PIPE_FORMAT_R16A16_SNORM] = 0,
+ [PIPE_FORMAT_R16A16_FLOAT] = 0,
+ [PIPE_FORMAT_R32A32_FLOAT] = 0,
+ [PIPE_FORMAT_R8A8_UINT] = 0,
+ [PIPE_FORMAT_R8A8_SINT] = 0,
+ [PIPE_FORMAT_R16A16_UINT] = 0,
+ [PIPE_FORMAT_R16A16_SINT] = 0,
+ [PIPE_FORMAT_R32A32_UINT] = 0,
+ [PIPE_FORMAT_R32A32_SINT] = 0,
+ [PIPE_FORMAT_R10G10B10A2_UINT] = GEN6_FORMAT_R10G10B10A2_UINT,
+ [PIPE_FORMAT_B5G6R5_SRGB] = GEN6_FORMAT_B5G6R5_UNORM_SRGB,
+ };
+ int sfmt = format_mapping[format];
+
+ /* GEN6_FORMAT_R32G32B32A32_FLOAT happens to be 0 */
+ if (!sfmt && format != PIPE_FORMAT_R32G32B32A32_FLOAT)
+ sfmt = -1;
+
+ return sfmt;
+}
diff --git a/src/gallium/drivers/ilo/core/ilo_format.h b/src/gallium/drivers/ilo/ilo_format.h
index 6b73ea1dad7..4e955c09c14 100644
--- a/src/gallium/drivers/ilo/core/ilo_format.h
+++ b/src/gallium/drivers/ilo/ilo_format.h
@@ -29,8 +29,8 @@
#define ILO_FORMAT_H
#include "genhw/genhw.h"
-#include "ilo_core.h"
-#include "ilo_dev.h"
+
+#include "ilo_common.h"
bool
ilo_format_support_vb(const struct ilo_dev *dev,
diff --git a/src/gallium/drivers/ilo/ilo_render.c b/src/gallium/drivers/ilo/ilo_render.c
index f5be3360f05..21f75de11a0 100644
--- a/src/gallium/drivers/ilo/ilo_render.c
+++ b/src/gallium/drivers/ilo/ilo_render.c
@@ -35,76 +35,10 @@
#include "ilo_query.h"
#include "ilo_render_gen.h"
-/* in S1.3 */
-struct sample_position {
- int8_t x, y;
-};
-
-static const struct sample_position ilo_sample_pattern_1x[1] = {
- { 0, 0 },
-};
-
-static const struct sample_position ilo_sample_pattern_2x[2] = {
- { -4, -4 },
- { 4, 4 },
-};
-
-static const struct sample_position ilo_sample_pattern_4x[4] = {
- { -2, -6 },
- { 6, -2 },
- { -6, 2 },
- { 2, 6 },
-};
-
-/* \see brw_multisample_positions_8x */
-static const struct sample_position ilo_sample_pattern_8x[8] = {
- { -1, 1 },
- { 1, 5 },
- { 3, -5 },
- { 5, 3 },
- { -7, -1 },
- { -3, -7 },
- { 7, -3 },
- { -5, 7 },
-};
-
-static const struct sample_position ilo_sample_pattern_16x[16] = {
- { 0, 2 },
- { 3, 0 },
- { -3, -2 },
- { -2, -4 },
- { 4, 3 },
- { 5, 1 },
- { 6, -1 },
- { 2, -6 },
- { -4, 5 },
- { -5, -5 },
- { -1, -7 },
- { 7, -3 },
- { -7, 4 },
- { 1, -8 },
- { -6, 6 },
- { -8, 7 },
-};
-
-static uint8_t
-pack_sample_position(const struct sample_position *pos)
-{
- return (pos->x + 8) << 4 | (pos->y + 8);
-}
-
-static void
-get_sample_position(const struct sample_position *pos, float *x, float *y)
-{
- *x = (float) (pos->x + 8) / 16.0f;
- *y = (float) (pos->y + 8) / 16.0f;
-}
-
struct ilo_render *
ilo_render_create(struct ilo_builder *builder)
{
struct ilo_render *render;
- int i;
render = CALLOC_STRUCT(ilo_render);
if (!render)
@@ -121,29 +55,8 @@ ilo_render_create(struct ilo_builder *builder)
return NULL;
}
- /* pack into dwords */
- render->sample_pattern_1x = pack_sample_position(ilo_sample_pattern_1x);
- render->sample_pattern_2x =
- pack_sample_position(&ilo_sample_pattern_2x[1]) << 8 |
- pack_sample_position(&ilo_sample_pattern_2x[0]);
- for (i = 0; i < 4; i++) {
- render->sample_pattern_4x |=
- pack_sample_position(&ilo_sample_pattern_4x[i]) << (8 * i);
-
- render->sample_pattern_8x[0] |=
- pack_sample_position(&ilo_sample_pattern_8x[i]) << (8 * i);
- render->sample_pattern_8x[1] |=
- pack_sample_position(&ilo_sample_pattern_8x[i + 4]) << (8 * i);
-
- render->sample_pattern_16x[0] |=
- pack_sample_position(&ilo_sample_pattern_16x[i]) << (8 * i);
- render->sample_pattern_16x[1] |=
- pack_sample_position(&ilo_sample_pattern_16x[i + 4]) << (8 * i);
- render->sample_pattern_16x[2] |=
- pack_sample_position(&ilo_sample_pattern_16x[i + 8]) << (8 * i);
- render->sample_pattern_16x[3] |=
- pack_sample_position(&ilo_sample_pattern_16x[i + 12]) << (8 * i);
- }
+ ilo_state_sample_pattern_init_default(&render->sample_pattern,
+ render->dev);
ilo_render_invalidate_hw(render);
ilo_render_invalidate_builder(render);
@@ -164,38 +77,13 @@ ilo_render_get_sample_position(const struct ilo_render *render,
unsigned sample_index,
float *x, float *y)
{
- const struct sample_position *pattern;
+ uint8_t off_x, off_y;
- switch (sample_count) {
- case 1:
- assert(sample_index < Elements(ilo_sample_pattern_1x));
- pattern = ilo_sample_pattern_1x;
- break;
- case 2:
- assert(sample_index < Elements(ilo_sample_pattern_2x));
- pattern = ilo_sample_pattern_2x;
- break;
- case 4:
- assert(sample_index < Elements(ilo_sample_pattern_4x));
- pattern = ilo_sample_pattern_4x;
- break;
- case 8:
- assert(sample_index < Elements(ilo_sample_pattern_8x));
- pattern = ilo_sample_pattern_8x;
- break;
- case 16:
- assert(sample_index < Elements(ilo_sample_pattern_16x));
- pattern = ilo_sample_pattern_16x;
- break;
- default:
- assert(!"unknown sample count");
- *x = 0.5f;
- *y = 0.5f;
- return;
- break;
- }
+ ilo_state_sample_pattern_get_offset(&render->sample_pattern, render->dev,
+ sample_count, sample_index, &off_x, &off_y);
- get_sample_position(&pattern[sample_index], x, y);
+ *x = (float) off_x / 16.0f;
+ *y = (float) off_y / 16.0f;
}
void
@@ -446,12 +334,44 @@ draw_session_prepare(struct ilo_render *render,
render->instruction_bo_changed = true;
session->prim_changed = true;
- session->primitive_restart_changed = true;
+
+ ilo_state_urb_full_delta(&vec->urb, render->dev, &session->urb_delta);
+ ilo_state_vf_full_delta(&vec->ve->vf, render->dev, &session->vf_delta);
+
+ ilo_state_raster_full_delta(&vec->rasterizer->rs, render->dev,
+ &session->rs_delta);
+
+ ilo_state_viewport_full_delta(&vec->viewport.vp, render->dev,
+ &session->vp_delta);
+
+ ilo_state_cc_full_delta(&vec->blend->cc, render->dev,
+ &session->cc_delta);
} else {
session->prim_changed =
(render->state.reduced_prim != session->reduced_prim);
- session->primitive_restart_changed =
- (render->state.primitive_restart != vec->draw->primitive_restart);
+
+ ilo_state_urb_get_delta(&vec->urb, render->dev,
+ &render->state.urb, &session->urb_delta);
+
+ if (vec->dirty & ILO_DIRTY_VE) {
+ ilo_state_vf_full_delta(&vec->ve->vf, render->dev,
+ &session->vf_delta);
+ }
+
+ if (vec->dirty & ILO_DIRTY_RASTERIZER) {
+ ilo_state_raster_get_delta(&vec->rasterizer->rs, render->dev,
+ &render->state.rs, &session->rs_delta);
+ }
+
+ if (vec->dirty & ILO_DIRTY_VIEWPORT) {
+ ilo_state_viewport_full_delta(&vec->viewport.vp, render->dev,
+ &session->vp_delta);
+ }
+
+ if (vec->dirty & ILO_DIRTY_BLEND) {
+ ilo_state_cc_get_delta(&vec->blend->cc, render->dev,
+ &render->state.cc, &session->cc_delta);
+ }
}
}
@@ -467,7 +387,10 @@ draw_session_end(struct ilo_render *render,
render->instruction_bo_changed = false;
render->state.reduced_prim = session->reduced_prim;
- render->state.primitive_restart = vec->draw->primitive_restart;
+
+ render->state.urb = vec->urb;
+ render->state.rs = vec->rasterizer->rs;
+ render->state.cc = vec->blend->cc;
}
void
diff --git a/src/gallium/drivers/ilo/ilo_render.h b/src/gallium/drivers/ilo/ilo_render.h
index a85b2800fb1..098af73ec9b 100644
--- a/src/gallium/drivers/ilo/ilo_render.h
+++ b/src/gallium/drivers/ilo/ilo_render.h
@@ -43,9 +43,6 @@ ilo_render_create(struct ilo_builder *builder);
void
ilo_render_destroy(struct ilo_render *render);
-/**
- * Estimate the size of an action.
- */
void
ilo_render_get_sample_position(const struct ilo_render *render,
unsigned sample_count,
diff --git a/src/gallium/drivers/ilo/ilo_render_dynamic.c b/src/gallium/drivers/ilo/ilo_render_dynamic.c
index ef92b12da83..3b4c80227a6 100644
--- a/src/gallium/drivers/ilo/ilo_render_dynamic.c
+++ b/src/gallium/drivers/ilo/ilo_render_dynamic.c
@@ -30,6 +30,7 @@
#include "ilo_common.h"
#include "ilo_blitter.h"
+#include "ilo_shader.h"
#include "ilo_state.h"
#include "ilo_render_gen.h"
@@ -42,16 +43,14 @@ gen6_emit_draw_dynamic_viewports(struct ilo_render *r,
{
ILO_DEV_ASSERT(r->dev, 6, 6);
- /* SF_VIEWPORT, CLIP_VIEWPORT, and CC_VIEWPORT */
- if (DIRTY(VIEWPORT)) {
+ /* CLIP_VIEWPORT, SF_VIEWPORT, and CC_VIEWPORT */
+ if ((session->vp_delta.dirty & (ILO_STATE_VIEWPORT_SF_CLIP_VIEWPORT |
+ ILO_STATE_VIEWPORT_CC_VIEWPORT)) ||
+ r->state_bo_changed) {
r->state.CLIP_VIEWPORT = gen6_CLIP_VIEWPORT(r->builder,
- vec->viewport.cso, vec->viewport.count);
-
- r->state.SF_VIEWPORT = gen6_SF_VIEWPORT(r->builder,
- vec->viewport.cso, vec->viewport.count);
-
- r->state.CC_VIEWPORT = gen6_CC_VIEWPORT(r->builder,
- vec->viewport.cso, vec->viewport.count);
+ &vec->viewport.vp);
+ r->state.SF_VIEWPORT = gen6_SF_VIEWPORT(r->builder, &vec->viewport.vp);
+ r->state.CC_VIEWPORT = gen6_CC_VIEWPORT(r->builder, &vec->viewport.vp);
session->viewport_changed = true;
}
@@ -65,12 +64,12 @@ gen7_emit_draw_dynamic_viewports(struct ilo_render *r,
ILO_DEV_ASSERT(r->dev, 7, 8);
/* SF_CLIP_VIEWPORT and CC_VIEWPORT */
- if (DIRTY(VIEWPORT)) {
+ if ((session->vp_delta.dirty & (ILO_STATE_VIEWPORT_SF_CLIP_VIEWPORT |
+ ILO_STATE_VIEWPORT_CC_VIEWPORT)) ||
+ r->state_bo_changed) {
r->state.SF_CLIP_VIEWPORT = gen7_SF_CLIP_VIEWPORT(r->builder,
- vec->viewport.cso, vec->viewport.count);
-
- r->state.CC_VIEWPORT = gen6_CC_VIEWPORT(r->builder,
- vec->viewport.cso, vec->viewport.count);
+ &vec->viewport.vp);
+ r->state.CC_VIEWPORT = gen6_CC_VIEWPORT(r->builder, &vec->viewport.vp);
session->viewport_changed = true;
}
@@ -84,10 +83,10 @@ gen6_emit_draw_dynamic_scissors(struct ilo_render *r,
ILO_DEV_ASSERT(r->dev, 6, 8);
/* SCISSOR_RECT */
- if (DIRTY(SCISSOR) || DIRTY(VIEWPORT)) {
- /* there should be as many scissors as there are viewports */
+ if ((session->vp_delta.dirty & ILO_STATE_VIEWPORT_SCISSOR_RECT) ||
+ r->state_bo_changed) {
r->state.SCISSOR_RECT = gen6_SCISSOR_RECT(r->builder,
- &vec->scissor, vec->viewport.count);
+ &vec->viewport.vp);
session->scissor_changed = true;
}
@@ -101,32 +100,30 @@ gen6_emit_draw_dynamic_cc(struct ilo_render *r,
ILO_DEV_ASSERT(r->dev, 6, 8);
/* BLEND_STATE */
- if (DIRTY(BLEND) || DIRTY(FB) || DIRTY(DSA)) {
- if (ilo_dev_gen(r->dev) >= ILO_GEN(8)) {
- r->state.BLEND_STATE = gen8_BLEND_STATE(r->builder,
- vec->blend, &vec->fb, vec->dsa);
- } else {
- r->state.BLEND_STATE = gen6_BLEND_STATE(r->builder,
- vec->blend, &vec->fb, vec->dsa);
- }
+ if ((session->cc_delta.dirty & ILO_STATE_CC_BLEND_STATE) ||
+ r->state_bo_changed) {
+ if (ilo_dev_gen(r->dev) >= ILO_GEN(8))
+ r->state.BLEND_STATE = gen8_BLEND_STATE(r->builder, &vec->blend->cc);
+ else
+ r->state.BLEND_STATE = gen6_BLEND_STATE(r->builder, &vec->blend->cc);
session->blend_changed = true;
}
/* COLOR_CALC_STATE */
- if (DIRTY(DSA) || DIRTY(STENCIL_REF) || DIRTY(BLEND_COLOR)) {
+ if ((session->cc_delta.dirty & ILO_STATE_CC_COLOR_CALC_STATE) ||
+ r->state_bo_changed) {
r->state.COLOR_CALC_STATE =
- gen6_COLOR_CALC_STATE(r->builder, &vec->stencil_ref,
- vec->dsa->alpha_ref, &vec->blend_color);
-
+ gen6_COLOR_CALC_STATE(r->builder, &vec->blend->cc);
session->cc_changed = true;
}
/* DEPTH_STENCIL_STATE */
- if (ilo_dev_gen(r->dev) < ILO_GEN(8) && DIRTY(DSA)) {
+ if (ilo_dev_gen(r->dev) < ILO_GEN(8) &&
+ ((session->cc_delta.dirty & ILO_STATE_CC_DEPTH_STENCIL_STATE) ||
+ r->state_bo_changed)) {
r->state.DEPTH_STENCIL_STATE =
- gen6_DEPTH_STENCIL_STATE(r->builder, vec->dsa);
-
+ gen6_DEPTH_STENCIL_STATE(r->builder, &vec->blend->cc);
session->dsa_changed = true;
}
}
@@ -137,12 +134,11 @@ gen6_emit_draw_dynamic_samplers(struct ilo_render *r,
int shader_type,
struct ilo_render_draw_session *session)
{
- const struct ilo_sampler_cso * const *samplers =
- vec->sampler[shader_type].cso;
- const struct pipe_sampler_view * const *views =
- (const struct pipe_sampler_view **) vec->view[shader_type].states;
+ const struct ilo_view_cso * const *views =
+ (const struct ilo_view_cso **) vec->view[shader_type].states;
+ struct ilo_state_sampler samplers[ILO_MAX_SAMPLERS];
uint32_t *sampler_state, *border_color_state;
- int sampler_count;
+ int sampler_count, i;
bool emit_border_color = false;
bool skip = false;
@@ -194,16 +190,28 @@ gen6_emit_draw_dynamic_samplers(struct ilo_render *r,
sampler_count <= Elements(vec->sampler[shader_type].cso));
if (emit_border_color) {
- int i;
-
for (i = 0; i < sampler_count; i++) {
- border_color_state[i] = (samplers[i]) ?
- gen6_SAMPLER_BORDER_COLOR_STATE(r->builder, samplers[i]) : 0;
+ const struct ilo_sampler_cso *cso = vec->sampler[shader_type].cso[i];
+
+ border_color_state[i] = (cso) ?
+ gen6_SAMPLER_BORDER_COLOR_STATE(r->builder, &cso->border) : 0;
+ }
+ }
+
+ for (i = 0; i < sampler_count; i++) {
+ const struct ilo_sampler_cso *cso = vec->sampler[shader_type].cso[i];
+
+ if (cso && views[i]) {
+ samplers[i] = cso->sampler;
+ ilo_state_sampler_set_surface(&samplers[i],
+ r->dev, &views[i]->surface);
+ } else {
+ samplers[i] = vec->disabled_sampler;
}
}
- *sampler_state = gen6_SAMPLER_STATE(r->builder,
- samplers, views, border_color_state, sampler_count);
+ *sampler_state = gen6_SAMPLER_STATE(r->builder, samplers,
+ border_color_state, sampler_count);
}
static void
@@ -234,13 +242,13 @@ gen6_emit_draw_dynamic_pcb(struct ilo_render *r,
const struct ilo_cbuf_state *cbuf =
&vec->cbuf[PIPE_SHADER_VERTEX];
- if (cbuf0_size <= cbuf->cso[0].user_buffer_size) {
+ if (cbuf0_size <= cbuf->cso[0].info.size) {
memcpy(pcb, cbuf->cso[0].user_buffer, cbuf0_size);
} else {
memcpy(pcb, cbuf->cso[0].user_buffer,
- cbuf->cso[0].user_buffer_size);
- memset(pcb + cbuf->cso[0].user_buffer_size, 0,
- cbuf0_size - cbuf->cso[0].user_buffer_size);
+ cbuf->cso[0].info.size);
+ memset(pcb + cbuf->cso[0].info.size, 0,
+ cbuf0_size - cbuf->cso[0].info.size);
}
pcb += cbuf0_size;
@@ -271,13 +279,13 @@ gen6_emit_draw_dynamic_pcb(struct ilo_render *r,
gen6_push_constant_buffer(r->builder, cbuf0_size, &pcb);
r->state.wm.PUSH_CONSTANT_BUFFER_size = cbuf0_size;
- if (cbuf0_size <= cbuf->cso[0].user_buffer_size) {
+ if (cbuf0_size <= cbuf->cso[0].info.size) {
memcpy(pcb, cbuf->cso[0].user_buffer, cbuf0_size);
} else {
memcpy(pcb, cbuf->cso[0].user_buffer,
- cbuf->cso[0].user_buffer_size);
- memset(pcb + cbuf->cso[0].user_buffer_size, 0,
- cbuf0_size - cbuf->cso[0].user_buffer_size);
+ cbuf->cso[0].info.size);
+ memset(pcb + cbuf->cso[0].info.size, 0,
+ cbuf0_size - cbuf->cso[0].info.size);
}
session->pcb_fs_changed = true;
@@ -441,18 +449,17 @@ ilo_render_emit_rectlist_dynamic_states(struct ilo_render *render,
if (blitter->uses & ILO_BLITTER_USE_DSA) {
render->state.DEPTH_STENCIL_STATE =
- gen6_DEPTH_STENCIL_STATE(render->builder, &blitter->dsa);
+ gen6_DEPTH_STENCIL_STATE(render->builder, &blitter->cc);
}
if (blitter->uses & ILO_BLITTER_USE_CC) {
render->state.COLOR_CALC_STATE =
- gen6_COLOR_CALC_STATE(render->builder, &blitter->cc.stencil_ref,
- blitter->cc.alpha_ref, &blitter->cc.blend_color);
+ gen6_COLOR_CALC_STATE(render->builder, &blitter->cc);
}
if (blitter->uses & ILO_BLITTER_USE_VIEWPORT) {
render->state.CC_VIEWPORT =
- gen6_CC_VIEWPORT(render->builder, &blitter->viewport, 1);
+ gen6_CC_VIEWPORT(render->builder, &blitter->vp);
}
assert(ilo_builder_dynamic_used(render->builder) <= dynamic_used +
@@ -466,10 +473,9 @@ gen6_emit_launch_grid_dynamic_samplers(struct ilo_render *r,
{
const unsigned shader_type = PIPE_SHADER_COMPUTE;
const struct ilo_shader_state *cs = vec->cs;
- const struct ilo_sampler_cso * const *samplers =
- vec->sampler[shader_type].cso;
- const struct pipe_sampler_view * const *views =
- (const struct pipe_sampler_view **) vec->view[shader_type].states;
+ const struct ilo_view_cso * const *views =
+ (const struct ilo_view_cso **) vec->view[shader_type].states;
+ struct ilo_state_sampler samplers[ILO_MAX_SAMPLERS];
int sampler_count, i;
ILO_DEV_ASSERT(r->dev, 7, 7.5);
@@ -480,11 +486,25 @@ gen6_emit_launch_grid_dynamic_samplers(struct ilo_render *r,
sampler_count <= Elements(vec->sampler[shader_type].cso));
for (i = 0; i < sampler_count; i++) {
- r->state.cs.SAMPLER_BORDER_COLOR_STATE[i] = (samplers[i]) ?
- gen6_SAMPLER_BORDER_COLOR_STATE(r->builder, samplers[i]) : 0;
+ const struct ilo_sampler_cso *cso = vec->sampler[shader_type].cso[i];
+
+ r->state.cs.SAMPLER_BORDER_COLOR_STATE[i] = (cso) ?
+ gen6_SAMPLER_BORDER_COLOR_STATE(r->builder, &cso->border) : 0;
}
- r->state.cs.SAMPLER_STATE = gen6_SAMPLER_STATE(r->builder, samplers, views,
+ for (i = 0; i < sampler_count; i++) {
+ const struct ilo_sampler_cso *cso = vec->sampler[shader_type].cso[i];
+
+ if (cso && views[i]) {
+ samplers[i] = cso->sampler;
+ ilo_state_sampler_set_surface(&samplers[i],
+ r->dev, &views[i]->surface);
+ } else {
+ samplers[i] = vec->disabled_sampler;
+ }
+ }
+
+ r->state.cs.SAMPLER_STATE = gen6_SAMPLER_STATE(r->builder, samplers,
r->state.cs.SAMPLER_BORDER_COLOR_STATE, sampler_count);
}
@@ -503,20 +523,39 @@ gen6_emit_launch_grid_dynamic_idrt(struct ilo_render *r,
struct ilo_render_launch_grid_session *session)
{
const struct ilo_shader_state *cs = vec->cs;
- struct gen6_idrt_data data;
+ struct ilo_state_compute_interface_info interface;
+ struct ilo_state_compute_info info;
+ uint32_t kernel_offset;
ILO_DEV_ASSERT(r->dev, 7, 7.5);
- memset(&data, 0, sizeof(data));
+ memset(&interface, 0, sizeof(interface));
+
+ interface.sampler_count =
+ ilo_shader_get_kernel_param(cs, ILO_KERNEL_SAMPLER_COUNT);
+ interface.surface_count =
+ ilo_shader_get_kernel_param(cs, ILO_KERNEL_SURFACE_TOTAL_COUNT);
+ interface.thread_group_size = session->thread_group_size;
+ interface.slm_size =
+ ilo_shader_get_kernel_param(cs, ILO_KERNEL_CS_LOCAL_SIZE);
+ interface.curbe_read_length = r->state.cs.PUSH_CONSTANT_BUFFER_size;
+
+ memset(&info, 0, sizeof(info));
+ info.data = session->compute_data;
+ info.data_size = sizeof(session->compute_data);
+ info.interfaces = &interface;
+ info.interface_count = 1;
+ info.cv_urb_alloc_size = r->dev->urb_size;
+ info.curbe_alloc_size = r->state.cs.PUSH_CONSTANT_BUFFER_size;
+
+ ilo_state_compute_init(&session->compute, r->dev, &info);
- data.cs = cs;
- data.sampler_offset = r->state.cs.SAMPLER_STATE;
- data.binding_table_offset = r->state.cs.BINDING_TABLE_STATE;
+ kernel_offset = ilo_shader_get_kernel_offset(cs);
- data.curbe_size = r->state.cs.PUSH_CONSTANT_BUFFER_size;
- data.thread_group_size = session->thread_group_size;
+ session->idrt = gen6_INTERFACE_DESCRIPTOR_DATA(r->builder,
+ &session->compute, &kernel_offset,
+ &r->state.cs.SAMPLER_STATE, &r->state.cs.BINDING_TABLE_STATE);
- session->idrt = gen6_INTERFACE_DESCRIPTOR_DATA(r->builder, &data, 1);
session->idrt_size = 32;
}
diff --git a/src/gallium/drivers/ilo/ilo_render_gen.h b/src/gallium/drivers/ilo/ilo_render_gen.h
index acfe8be3088..6b133750043 100644
--- a/src/gallium/drivers/ilo/ilo_render_gen.h
+++ b/src/gallium/drivers/ilo/ilo_render_gen.h
@@ -31,6 +31,7 @@
#include "core/ilo_builder.h"
#include "core/ilo_builder_3d.h"
#include "core/ilo_builder_render.h"
+#include "core/ilo_state_raster.h"
#include "ilo_common.h"
#include "ilo_state.h"
@@ -50,11 +51,7 @@ struct ilo_render {
struct intel_bo *workaround_bo;
- uint32_t sample_pattern_1x;
- uint32_t sample_pattern_2x;
- uint32_t sample_pattern_4x;
- uint32_t sample_pattern_8x[2];
- uint32_t sample_pattern_16x[4];
+ struct ilo_state_sample_pattern sample_pattern;
bool hw_ctx_changed;
@@ -85,10 +82,13 @@ struct ilo_render {
*/
uint32_t deferred_pipe_control_dw1;
- bool primitive_restart;
int reduced_prim;
int so_max_vertices;
+ struct ilo_state_urb urb;
+ struct ilo_state_raster rs;
+ struct ilo_state_cc cc;
+
uint32_t SF_VIEWPORT;
uint32_t CLIP_VIEWPORT;
uint32_t SF_CLIP_VIEWPORT; /* GEN7+ */
@@ -142,7 +142,12 @@ struct ilo_render_draw_session {
int reduced_prim;
bool prim_changed;
- bool primitive_restart_changed;
+
+ struct ilo_state_urb_delta urb_delta;
+ struct ilo_state_vf_delta vf_delta;
+ struct ilo_state_raster_delta rs_delta;
+ struct ilo_state_viewport_delta vp_delta;
+ struct ilo_state_cc_delta cc_delta;
/* dynamic states */
bool viewport_changed;
@@ -180,6 +185,9 @@ struct ilo_render_launch_grid_session {
uint32_t idrt;
int idrt_size;
+
+ uint32_t compute_data[6];
+ struct ilo_state_compute compute;
};
int
@@ -381,8 +389,7 @@ ilo_render_pipe_control(struct ilo_render *r, uint32_t dw1)
*/
static inline void
ilo_render_3dprimitive(struct ilo_render *r,
- const struct pipe_draw_info *info,
- const struct ilo_ib_state *ib)
+ const struct gen6_3dprimitive_info *info)
{
ILO_DEV_ASSERT(r->dev, 6, 8);
@@ -391,9 +398,9 @@ ilo_render_3dprimitive(struct ilo_render *r,
/* 3DPRIMITIVE */
if (ilo_dev_gen(r->dev) >= ILO_GEN(7))
- gen7_3DPRIMITIVE(r->builder, info, ib);
+ gen7_3DPRIMITIVE(r->builder, info);
else
- gen6_3DPRIMITIVE(r->builder, info, ib);
+ gen6_3DPRIMITIVE(r->builder, info);
r->state.current_pipe_control_dw1 = 0;
assert(!r->state.deferred_pipe_control_dw1);
diff --git a/src/gallium/drivers/ilo/ilo_render_gen6.c b/src/gallium/drivers/ilo/ilo_render_gen6.c
index 47f711e7956..c1f759f3043 100644
--- a/src/gallium/drivers/ilo/ilo_render_gen6.c
+++ b/src/gallium/drivers/ilo/ilo_render_gen6.c
@@ -29,11 +29,11 @@
#include "core/ilo_builder_3d.h"
#include "core/ilo_builder_mi.h"
#include "core/ilo_builder_render.h"
-#include "util/u_dual_blend.h"
#include "util/u_prim.h"
#include "ilo_blitter.h"
#include "ilo_query.h"
+#include "ilo_resource.h"
#include "ilo_shader.h"
#include "ilo_state.h"
#include "ilo_render_gen.h"
@@ -330,64 +330,19 @@ gen6_draw_common_urb(struct ilo_render *r,
const struct ilo_state_vector *vec,
struct ilo_render_draw_session *session)
{
- /* 3DSTATE_URB */
- if (DIRTY(VE) || DIRTY(VS) || DIRTY(GS)) {
- const bool gs_active = (vec->gs || (vec->vs &&
- ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_VS_GEN6_SO)));
- int vs_entry_size, gs_entry_size;
- int vs_total_size, gs_total_size;
-
- vs_entry_size = (vec->vs) ?
- ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_OUTPUT_COUNT) : 0;
+ const bool gs_active = (vec->gs || (vec->vs &&
+ ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_VS_GEN6_SO)));
- /*
- * As indicated by 2e712e41db0c0676e9f30fc73172c0e8de8d84d4, VF and VS
- * share VUE handles. The VUE allocation size must be large enough to
- * store either VF outputs (number of VERTEX_ELEMENTs) and VS outputs.
- *
- * I am not sure if the PRM explicitly states that VF and VS share VUE
- * handles. But here is a citation that implies so:
- *
- * From the Sandy Bridge PRM, volume 2 part 1, page 44:
- *
- * "Once a FF stage that spawn threads has sufficient input to
- * initiate a thread, it must guarantee that it is safe to request
- * the thread initiation. For all these FF stages, this check is
- * based on :
- *
- * - The availability of output URB entries:
- * - VS: As the input URB entries are overwritten with the
- * VS-generated output data, output URB availability isn't a
- * factor."
- */
- if (vs_entry_size < vec->ve->count + vec->ve->prepend_nosrc_cso)
- vs_entry_size = vec->ve->count + vec->ve->prepend_nosrc_cso;
-
- gs_entry_size = (vec->gs) ?
- ilo_shader_get_kernel_param(vec->gs, ILO_KERNEL_OUTPUT_COUNT) :
- (gs_active) ? vs_entry_size : 0;
-
- /* in bytes */
- vs_entry_size *= sizeof(float) * 4;
- gs_entry_size *= sizeof(float) * 4;
- vs_total_size = r->dev->urb_size;
-
- if (gs_active) {
- vs_total_size /= 2;
- gs_total_size = vs_total_size;
- }
- else {
- gs_total_size = 0;
- }
-
- gen6_3DSTATE_URB(r->builder, vs_total_size, gs_total_size,
- vs_entry_size, gs_entry_size);
+ /* 3DSTATE_URB */
+ if (session->urb_delta.dirty & (ILO_STATE_URB_3DSTATE_URB_VS |
+ ILO_STATE_URB_3DSTATE_URB_GS)) {
+ gen6_3DSTATE_URB(r->builder, &vec->urb);
if (r->state.gs.active && !gs_active)
gen6_wa_post_3dstate_urb_no_gs(r);
-
- r->state.gs.active = gs_active;
}
+
+ r->state.gs.active = gs_active;
}
static void
@@ -459,33 +414,30 @@ gen6_draw_vf(struct ilo_render *r,
{
if (ilo_dev_gen(r->dev) >= ILO_GEN(7.5)) {
/* 3DSTATE_INDEX_BUFFER */
- if (DIRTY(IB) || r->batch_bo_changed) {
- gen6_3DSTATE_INDEX_BUFFER(r->builder,
- &vec->ib, false);
- }
+ if ((session->vf_delta.dirty & ILO_STATE_VF_3DSTATE_INDEX_BUFFER) ||
+ DIRTY(IB) || r->batch_bo_changed)
+ gen6_3DSTATE_INDEX_BUFFER(r->builder, &vec->ve->vf, &vec->ib.ib);
/* 3DSTATE_VF */
- if (session->primitive_restart_changed) {
- gen75_3DSTATE_VF(r->builder, vec->draw->primitive_restart,
- vec->draw->restart_index);
- }
- }
- else {
+ if (session->vf_delta.dirty & ILO_STATE_VF_3DSTATE_VF)
+ gen75_3DSTATE_VF(r->builder, &vec->ve->vf);
+ } else {
/* 3DSTATE_INDEX_BUFFER */
- if (DIRTY(IB) || session->primitive_restart_changed ||
- r->batch_bo_changed) {
- gen6_3DSTATE_INDEX_BUFFER(r->builder,
- &vec->ib, vec->draw->primitive_restart);
- }
+ if ((session->vf_delta.dirty & ILO_STATE_VF_3DSTATE_INDEX_BUFFER) ||
+ DIRTY(IB) || r->batch_bo_changed)
+ gen6_3DSTATE_INDEX_BUFFER(r->builder, &vec->ve->vf, &vec->ib.ib);
}
/* 3DSTATE_VERTEX_BUFFERS */
- if (DIRTY(VB) || DIRTY(VE) || r->batch_bo_changed)
- gen6_3DSTATE_VERTEX_BUFFERS(r->builder, vec->ve, &vec->vb);
+ if ((session->vf_delta.dirty & ILO_STATE_VF_3DSTATE_VERTEX_BUFFERS) ||
+ DIRTY(VB) || DIRTY(VE) || r->batch_bo_changed) {
+ gen6_3DSTATE_VERTEX_BUFFERS(r->builder, &vec->ve->vf,
+ vec->vb.vb, vec->ve->vb_count);
+ }
/* 3DSTATE_VERTEX_ELEMENTS */
- if (DIRTY(VE))
- gen6_3DSTATE_VERTEX_ELEMENTS(r->builder, vec->ve);
+ if (session->vf_delta.dirty & ILO_STATE_VF_3DSTATE_VERTEX_ELEMENTS)
+ gen6_3DSTATE_VERTEX_ELEMENTS(r->builder, &vec->ve->vf);
}
void
@@ -516,10 +468,17 @@ gen6_draw_vs(struct ilo_render *r,
/* 3DSTATE_VS */
if (DIRTY(VS) || r->instruction_bo_changed) {
+ const union ilo_shader_cso *cso = ilo_shader_get_kernel_cso(vec->vs);
+ const uint32_t kernel_offset = ilo_shader_get_kernel_offset(vec->vs);
+
if (ilo_dev_gen(r->dev) == ILO_GEN(6))
gen6_wa_pre_3dstate_vs_toggle(r);
- gen6_3DSTATE_VS(r->builder, vec->vs);
+ if (ilo_dev_gen(r->dev) == ILO_GEN(6) &&
+ ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_VS_GEN6_SO))
+ gen6_3DSTATE_VS(r->builder, &cso->vs_sol.vs, kernel_offset);
+ else
+ gen6_3DSTATE_VS(r->builder, &cso->vs, kernel_offset);
}
}
@@ -535,14 +494,39 @@ gen6_draw_gs(struct ilo_render *r,
/* 3DSTATE_GS */
if (DIRTY(GS) || DIRTY(VS) ||
session->prim_changed || r->instruction_bo_changed) {
+ const union ilo_shader_cso *cso;
+ uint32_t kernel_offset;
+
if (vec->gs) {
- gen6_3DSTATE_GS(r->builder, vec->gs);
- } else if (vec->vs &&
+ cso = ilo_shader_get_kernel_cso(vec->gs);
+ kernel_offset = ilo_shader_get_kernel_offset(vec->gs);
+
+ gen6_3DSTATE_GS(r->builder, &cso->gs, kernel_offset);
+ } else if (ilo_dev_gen(r->dev) == ILO_GEN(6) &&
ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_VS_GEN6_SO)) {
- const int verts_per_prim = u_vertices_per_prim(session->reduced_prim);
- gen6_so_3DSTATE_GS(r->builder, vec->vs, verts_per_prim);
+ const int verts_per_prim =
+ u_vertices_per_prim(session->reduced_prim);
+ enum ilo_kernel_param param;
+
+ switch (verts_per_prim) {
+ case 1:
+ param = ILO_KERNEL_VS_GEN6_SO_POINT_OFFSET;
+ break;
+ case 2:
+ param = ILO_KERNEL_VS_GEN6_SO_LINE_OFFSET;
+ break;
+ default:
+ param = ILO_KERNEL_VS_GEN6_SO_TRI_OFFSET;
+ break;
+ }
+
+ cso = ilo_shader_get_kernel_cso(vec->vs);
+ kernel_offset = ilo_shader_get_kernel_offset(vec->vs) +
+ ilo_shader_get_kernel_param(vec->vs, param);
+
+ gen6_3DSTATE_GS(r->builder, &cso->vs_sol.sol, kernel_offset);
} else {
- gen6_disable_3DSTATE_GS(r->builder);
+ gen6_3DSTATE_GS(r->builder, &vec->disabled_gs, 0);
}
}
}
@@ -633,30 +617,8 @@ gen6_draw_clip(struct ilo_render *r,
struct ilo_render_draw_session *session)
{
/* 3DSTATE_CLIP */
- if (DIRTY(RASTERIZER) || DIRTY(FS) || DIRTY(VIEWPORT) || DIRTY(FB)) {
- bool enable_guardband = true;
- unsigned i;
-
- /*
- * Gen8+ has viewport extent test. Guard band test can be enabled on
- * prior Gens only when the viewport is larger than the framebuffer,
- * unless we emulate viewport extent test on them.
- */
- if (ilo_dev_gen(r->dev) < ILO_GEN(8)) {
- for (i = 0; i < vec->viewport.count; i++) {
- const struct ilo_viewport_cso *vp = &vec->viewport.cso[i];
-
- if (vp->min_x > 0.0f || vp->max_x < vec->fb.state.width ||
- vp->min_y > 0.0f || vp->max_y < vec->fb.state.height) {
- enable_guardband = false;
- break;
- }
- }
- }
-
- gen6_3DSTATE_CLIP(r->builder, vec->rasterizer,
- vec->fs, enable_guardband, 1);
- }
+ if (session->rs_delta.dirty & ILO_STATE_RASTER_3DSTATE_CLIP)
+ gen6_3DSTATE_CLIP(r->builder, &vec->rasterizer->rs);
}
static void
@@ -665,9 +627,9 @@ gen6_draw_sf(struct ilo_render *r,
struct ilo_render_draw_session *session)
{
/* 3DSTATE_SF */
- if (DIRTY(RASTERIZER) || DIRTY(FS) || DIRTY(FB)) {
- gen6_3DSTATE_SF(r->builder, vec->rasterizer, vec->fs,
- vec->fb.num_samples);
+ if ((session->rs_delta.dirty & ILO_STATE_RASTER_3DSTATE_SF) || DIRTY(FS)) {
+ const struct ilo_state_sbe *sbe = ilo_shader_get_kernel_sbe(vec->fs);
+ gen6_3DSTATE_SF(r->builder, &vec->rasterizer->rs, sbe);
}
}
@@ -700,17 +662,17 @@ gen6_draw_wm(struct ilo_render *r,
}
/* 3DSTATE_WM */
- if (DIRTY(FS) || DIRTY(BLEND) || DIRTY(DSA) ||
- DIRTY(RASTERIZER) || r->instruction_bo_changed) {
- const bool dual_blend = vec->blend->dual_blend;
- const bool cc_may_kill = (vec->dsa->dw_blend_alpha ||
- vec->blend->alpha_to_coverage);
+ if (DIRTY(FS) ||
+ (session->rs_delta.dirty & ILO_STATE_RASTER_3DSTATE_WM) ||
+ r->instruction_bo_changed) {
+ const union ilo_shader_cso *cso = ilo_shader_get_kernel_cso(vec->fs);
+ const uint32_t kernel_offset = ilo_shader_get_kernel_offset(vec->fs);
if (ilo_dev_gen(r->dev) == ILO_GEN(6) && r->hw_ctx_changed)
gen6_wa_pre_3dstate_wm_max_threads(r);
- gen6_3DSTATE_WM(r->builder, vec->fs,
- vec->rasterizer, dual_blend, cc_may_kill);
+ gen6_3DSTATE_WM(r->builder, &vec->rasterizer->rs,
+ &cso->ps, kernel_offset);
}
}
@@ -719,25 +681,23 @@ gen6_draw_wm_multisample(struct ilo_render *r,
const struct ilo_state_vector *vec,
struct ilo_render_draw_session *session)
{
- /* 3DSTATE_MULTISAMPLE and 3DSTATE_SAMPLE_MASK */
- if (DIRTY(SAMPLE_MASK) || DIRTY(FB)) {
- const uint32_t *pattern;
-
- pattern = (vec->fb.num_samples > 1) ?
- &r->sample_pattern_4x : &r->sample_pattern_1x;
+ /* 3DSTATE_MULTISAMPLE */
+ if (DIRTY(FB) || (session->rs_delta.dirty &
+ ILO_STATE_RASTER_3DSTATE_MULTISAMPLE)) {
+ const uint8_t sample_count = (vec->fb.num_samples > 1) ? 4 : 1;
if (ilo_dev_gen(r->dev) == ILO_GEN(6)) {
gen6_wa_pre_non_pipelined(r);
gen6_wa_pre_3dstate_multisample(r);
}
- gen6_3DSTATE_MULTISAMPLE(r->builder,
- vec->fb.num_samples, pattern,
- vec->rasterizer->state.half_pixel_center);
-
- gen6_3DSTATE_SAMPLE_MASK(r->builder,
- (vec->fb.num_samples > 1) ? vec->sample_mask : 0x1);
+ gen6_3DSTATE_MULTISAMPLE(r->builder, &vec->rasterizer->rs,
+ &r->sample_pattern, sample_count);
}
+
+ /* 3DSTATE_SAMPLE_MASK */
+ if (session->rs_delta.dirty & ILO_STATE_RASTER_3DSTATE_SAMPLE_MASK)
+ gen6_3DSTATE_SAMPLE_MASK(r->builder, &vec->rasterizer->rs);
}
static void
@@ -747,7 +707,7 @@ gen6_draw_wm_depth(struct ilo_render *r,
{
/* 3DSTATE_DEPTH_BUFFER and 3DSTATE_CLEAR_PARAMS */
if (DIRTY(FB) || r->batch_bo_changed) {
- const struct ilo_zs_surface *zs;
+ const struct ilo_state_zs *zs;
uint32_t clear_params;
if (vec->fb.state.zsbuf) {
@@ -772,7 +732,7 @@ gen6_draw_wm_depth(struct ilo_render *r,
gen6_wa_pre_depth(r);
}
- gen6_3DSTATE_DEPTH_BUFFER(r->builder, zs, false);
+ gen6_3DSTATE_DEPTH_BUFFER(r->builder, zs);
gen6_3DSTATE_HIER_DEPTH_BUFFER(r->builder, zs);
gen6_3DSTATE_STENCIL_BUFFER(r->builder, zs);
gen6_3DSTATE_CLEAR_PARAMS(r->builder, clear_params);
@@ -790,10 +750,8 @@ gen6_draw_wm_raster(struct ilo_render *r,
if (ilo_dev_gen(r->dev) == ILO_GEN(6))
gen6_wa_pre_non_pipelined(r);
- gen6_3DSTATE_POLY_STIPPLE_PATTERN(r->builder,
- &vec->poly_stipple);
-
- gen6_3DSTATE_POLY_STIPPLE_OFFSET(r->builder, 0, 0);
+ gen6_3DSTATE_POLY_STIPPLE_PATTERN(r->builder, &vec->poly_stipple);
+ gen6_3DSTATE_POLY_STIPPLE_OFFSET(r->builder, &vec->poly_stipple);
}
/* 3DSTATE_LINE_STIPPLE */
@@ -801,17 +759,16 @@ gen6_draw_wm_raster(struct ilo_render *r,
if (ilo_dev_gen(r->dev) == ILO_GEN(6))
gen6_wa_pre_non_pipelined(r);
- gen6_3DSTATE_LINE_STIPPLE(r->builder,
- vec->rasterizer->state.line_stipple_pattern,
- vec->rasterizer->state.line_stipple_factor + 1);
+ gen6_3DSTATE_LINE_STIPPLE(r->builder, &vec->line_stipple);
}
/* 3DSTATE_AA_LINE_PARAMETERS */
- if (DIRTY(RASTERIZER) && vec->rasterizer->state.line_smooth) {
+ if (session->rs_delta.dirty &
+ ILO_STATE_RASTER_3DSTATE_AA_LINE_PARAMETERS) {
if (ilo_dev_gen(r->dev) == ILO_GEN(6))
gen6_wa_pre_non_pipelined(r);
- gen6_3DSTATE_AA_LINE_PARAMETERS(r->builder);
+ gen6_3DSTATE_AA_LINE_PARAMETERS(r->builder, &vec->rasterizer->rs);
}
}
@@ -849,7 +806,7 @@ ilo_render_emit_draw_commands_gen6(struct ilo_render *render,
gen6_draw_sf_rect(render, vec, session);
gen6_draw_vf(render, vec, session);
- ilo_render_3dprimitive(render, vec->draw, &vec->ib);
+ ilo_render_3dprimitive(render, &vec->draw_info);
}
static void
@@ -860,40 +817,23 @@ gen6_rectlist_vs_to_sf(struct ilo_render *r,
gen6_wa_post_3dstate_constant_vs(r);
gen6_wa_pre_3dstate_vs_toggle(r);
- gen6_disable_3DSTATE_VS(r->builder);
+ gen6_3DSTATE_VS(r->builder, &blitter->vs, 0);
gen6_3DSTATE_CONSTANT_GS(r->builder, NULL, NULL, 0);
- gen6_disable_3DSTATE_GS(r->builder);
+ gen6_3DSTATE_GS(r->builder, &blitter->gs, 0);
- gen6_disable_3DSTATE_CLIP(r->builder);
- gen6_3DSTATE_SF(r->builder, NULL, NULL, blitter->fb.num_samples);
+ gen6_3DSTATE_CLIP(r->builder, &blitter->fb.rs);
+ gen6_3DSTATE_SF(r->builder, &blitter->fb.rs, &blitter->sbe);
}
static void
gen6_rectlist_wm(struct ilo_render *r,
const struct ilo_blitter *blitter)
{
- uint32_t hiz_op;
-
- switch (blitter->op) {
- case ILO_BLITTER_RECTLIST_CLEAR_ZS:
- hiz_op = GEN6_WM_DW4_DEPTH_CLEAR;
- break;
- case ILO_BLITTER_RECTLIST_RESOLVE_Z:
- hiz_op = GEN6_WM_DW4_DEPTH_RESOLVE;
- break;
- case ILO_BLITTER_RECTLIST_RESOLVE_HIZ:
- hiz_op = GEN6_WM_DW4_HIZ_RESOLVE;
- break;
- default:
- hiz_op = 0;
- break;
- }
-
gen6_3DSTATE_CONSTANT_PS(r->builder, NULL, NULL, 0);
gen6_wa_pre_3dstate_wm_max_threads(r);
- gen6_hiz_3DSTATE_WM(r->builder, hiz_op);
+ gen6_3DSTATE_WM(r->builder, &blitter->fb.rs, &blitter->ps, 0);
}
static void
@@ -903,10 +843,8 @@ gen6_rectlist_wm_depth(struct ilo_render *r,
gen6_wa_pre_depth(r);
if (blitter->uses & (ILO_BLITTER_USE_FB_DEPTH |
- ILO_BLITTER_USE_FB_STENCIL)) {
- gen6_3DSTATE_DEPTH_BUFFER(r->builder,
- &blitter->fb.dst.u.zs, true);
- }
+ ILO_BLITTER_USE_FB_STENCIL))
+ gen6_3DSTATE_DEPTH_BUFFER(r->builder, &blitter->fb.dst.u.zs);
if (blitter->uses & ILO_BLITTER_USE_FB_DEPTH) {
gen6_3DSTATE_HIER_DEPTH_BUFFER(r->builder,
@@ -926,16 +864,12 @@ static void
gen6_rectlist_wm_multisample(struct ilo_render *r,
const struct ilo_blitter *blitter)
{
- const uint32_t *pattern = (blitter->fb.num_samples > 1) ?
- &r->sample_pattern_4x : &r->sample_pattern_1x;
+ const uint8_t sample_count = (blitter->fb.num_samples > 1) ? 4 : 1;
gen6_wa_pre_3dstate_multisample(r);
- gen6_3DSTATE_MULTISAMPLE(r->builder, blitter->fb.num_samples,
- pattern, true);
-
- gen6_3DSTATE_SAMPLE_MASK(r->builder,
- (1 << blitter->fb.num_samples) - 1);
+ gen6_3DSTATE_MULTISAMPLE(r->builder, &blitter->fb.rs, &r->sample_pattern, sample_count);
+ gen6_3DSTATE_SAMPLE_MASK(r->builder, &blitter->fb.rs);
}
int
@@ -964,11 +898,9 @@ ilo_render_emit_rectlist_commands_gen6(struct ilo_render *r,
session->vb_start, session->vb_end,
sizeof(blitter->vertices[0]));
- gen6_3DSTATE_VERTEX_ELEMENTS(r->builder, &blitter->ve);
+ gen6_3DSTATE_VERTEX_ELEMENTS(r->builder, &blitter->vf);
- gen6_3DSTATE_URB(r->builder, r->dev->urb_size, 0,
- (blitter->ve.count + blitter->ve.prepend_nosrc_cso) * 4 * sizeof(float),
- 0);
+ gen6_3DSTATE_URB(r->builder, &blitter->urb);
if (r->state.gs.active) {
gen6_wa_post_3dstate_urb_no_gs(r);
@@ -994,7 +926,7 @@ ilo_render_emit_rectlist_commands_gen6(struct ilo_render *r,
gen6_3DSTATE_DRAWING_RECTANGLE(r->builder, 0, 0,
blitter->fb.width, blitter->fb.height);
- ilo_render_3dprimitive(r, &blitter->draw, NULL);
+ ilo_render_3dprimitive(r, &blitter->draw_info);
}
int
diff --git a/src/gallium/drivers/ilo/ilo_render_gen7.c b/src/gallium/drivers/ilo/ilo_render_gen7.c
index 07fe7c83536..6623a8bcb43 100644
--- a/src/gallium/drivers/ilo/ilo_render_gen7.c
+++ b/src/gallium/drivers/ilo/ilo_render_gen7.c
@@ -28,9 +28,9 @@
#include "genhw/genhw.h"
#include "core/ilo_builder_3d.h"
#include "core/ilo_builder_render.h"
-#include "util/u_dual_blend.h"
#include "ilo_blitter.h"
+#include "ilo_resource.h"
#include "ilo_shader.h"
#include "ilo_state.h"
#include "ilo_render_gen.h"
@@ -201,40 +201,17 @@ gen7_draw_common_urb(struct ilo_render *r,
struct ilo_render_draw_session *session)
{
/* 3DSTATE_URB_{VS,GS,HS,DS} */
- if (DIRTY(VE) || DIRTY(VS)) {
- /* the first 16KB are reserved for VS and PS PCBs */
- const int offset =
- (ilo_dev_gen(r->dev) >= ILO_GEN(8)) ||
- (ilo_dev_gen(r->dev) == ILO_GEN(7.5) && r->dev->gt == 3) ?
- 32768 : 16384;
- int vs_entry_size, vs_total_size;
-
- vs_entry_size = (vec->vs) ?
- ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_OUTPUT_COUNT) : 0;
-
- /*
- * From the Ivy Bridge PRM, volume 2 part 1, page 35:
- *
- * "Programming Restriction: As the VS URB entry serves as both the
- * per-vertex input and output of the VS shader, the VS URB
- * Allocation Size must be sized to the maximum of the vertex input
- * and output structures."
- */
- if (vs_entry_size < vec->ve->count + vec->ve->prepend_nosrc_cso)
- vs_entry_size = vec->ve->count + vec->ve->prepend_nosrc_cso;
-
- vs_entry_size *= sizeof(float) * 4;
- vs_total_size = r->dev->urb_size - offset;
-
+ if (session->urb_delta.dirty & (ILO_STATE_URB_3DSTATE_URB_VS |
+ ILO_STATE_URB_3DSTATE_URB_HS |
+ ILO_STATE_URB_3DSTATE_URB_DS |
+ ILO_STATE_URB_3DSTATE_URB_GS)) {
if (ilo_dev_gen(r->dev) == ILO_GEN(7))
gen7_wa_pre_vs(r);
- gen7_3DSTATE_URB_VS(r->builder,
- offset, vs_total_size, vs_entry_size);
-
- gen7_3DSTATE_URB_GS(r->builder, offset, 0, 0);
- gen7_3DSTATE_URB_HS(r->builder, offset, 0, 0);
- gen7_3DSTATE_URB_DS(r->builder, offset, 0, 0);
+ gen7_3DSTATE_URB_VS(r->builder, &vec->urb);
+ gen7_3DSTATE_URB_GS(r->builder, &vec->urb);
+ gen7_3DSTATE_URB_HS(r->builder, &vec->urb);
+ gen7_3DSTATE_URB_DS(r->builder, &vec->urb);
}
}
@@ -244,22 +221,15 @@ gen7_draw_common_pcb_alloc(struct ilo_render *r,
struct ilo_render_draw_session *session)
{
/* 3DSTATE_PUSH_CONSTANT_ALLOC_{VS,PS} */
- if (r->hw_ctx_changed) {
- /*
- * Push constant buffers are only allowed to take up at most the first
- * 16KB of the URB. Split the space evenly for VS and FS.
- */
- const int max_size =
- (ilo_dev_gen(r->dev) >= ILO_GEN(8)) ||
- (ilo_dev_gen(r->dev) == ILO_GEN(7.5) && r->dev->gt == 3) ?
- 32768 : 16384;
- const int size = max_size / 2;
- int offset = 0;
-
- gen7_3DSTATE_PUSH_CONSTANT_ALLOC_VS(r->builder, offset, size);
- offset += size;
-
- gen7_3DSTATE_PUSH_CONSTANT_ALLOC_PS(r->builder, offset, size);
+ if (session->urb_delta.dirty &
+ (ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_VS |
+ ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_HS |
+ ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_DS |
+ ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_GS |
+ ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_PS)) {
+ gen7_3DSTATE_PUSH_CONSTANT_ALLOC_VS(r->builder, &vec->urb);
+ gen7_3DSTATE_PUSH_CONSTANT_ALLOC_GS(r->builder, &vec->urb);
+ gen7_3DSTATE_PUSH_CONSTANT_ALLOC_PS(r->builder, &vec->urb);
if (ilo_dev_gen(r->dev) == ILO_GEN(7))
gen7_wa_post_3dstate_push_constant_alloc_ps(r);
@@ -344,14 +314,14 @@ gen7_draw_vs(struct ilo_render *r,
}
/* 3DSTATE_VS */
- if (ilo_dev_gen(r->dev) >= ILO_GEN(8)) {
- if (emit_3dstate_vs || DIRTY(RASTERIZER)) {
- gen8_3DSTATE_VS(r->builder, vec->vs,
- vec->rasterizer->state.clip_plane_enable);
- }
- } else {
- if (emit_3dstate_vs)
- gen6_3DSTATE_VS(r->builder, vec->vs);
+ if (emit_3dstate_vs) {
+ const union ilo_shader_cso *cso = ilo_shader_get_kernel_cso(vec->vs);
+ const uint32_t kernel_offset = ilo_shader_get_kernel_offset(vec->vs);
+
+ if (ilo_dev_gen(r->dev) >= ILO_GEN(8))
+ gen8_3DSTATE_VS(r->builder, &cso->vs, kernel_offset);
+ else
+ gen6_3DSTATE_VS(r->builder, &cso->vs, kernel_offset);
}
}
@@ -362,8 +332,15 @@ gen7_draw_hs(struct ilo_render *r,
{
/* 3DSTATE_CONSTANT_HS and 3DSTATE_HS */
if (r->hw_ctx_changed) {
+ const struct ilo_state_hs *hs = &vec->disabled_hs;
+ const uint32_t kernel_offset = 0;
+
gen7_3DSTATE_CONSTANT_HS(r->builder, 0, 0, 0);
- gen7_disable_3DSTATE_HS(r->builder);
+
+ if (ilo_dev_gen(r->dev) >= ILO_GEN(8))
+ gen8_3DSTATE_HS(r->builder, hs, kernel_offset);
+ else
+ gen7_3DSTATE_HS(r->builder, hs, kernel_offset);
}
/* 3DSTATE_BINDING_TABLE_POINTERS_HS */
@@ -377,8 +354,10 @@ gen7_draw_te(struct ilo_render *r,
struct ilo_render_draw_session *session)
{
/* 3DSTATE_TE */
- if (r->hw_ctx_changed)
- gen7_3DSTATE_TE(r->builder);
+ if (r->hw_ctx_changed) {
+ const struct ilo_state_ds *ds = &vec->disabled_ds;
+ gen7_3DSTATE_TE(r->builder, ds);
+ }
}
void
@@ -388,8 +367,15 @@ gen7_draw_ds(struct ilo_render *r,
{
/* 3DSTATE_CONSTANT_DS and 3DSTATE_DS */
if (r->hw_ctx_changed) {
+ const struct ilo_state_ds *ds = &vec->disabled_ds;
+ const uint32_t kernel_offset = 0;
+
gen7_3DSTATE_CONSTANT_DS(r->builder, 0, 0, 0);
- gen7_disable_3DSTATE_DS(r->builder);
+
+ if (ilo_dev_gen(r->dev) >= ILO_GEN(8))
+ gen8_3DSTATE_DS(r->builder, ds, kernel_offset);
+ else
+ gen7_3DSTATE_DS(r->builder, ds, kernel_offset);
}
/* 3DSTATE_BINDING_TABLE_POINTERS_DS */
@@ -405,8 +391,15 @@ gen7_draw_gs(struct ilo_render *r,
{
/* 3DSTATE_CONSTANT_GS and 3DSTATE_GS */
if (r->hw_ctx_changed) {
+ const struct ilo_state_gs *gs = &vec->disabled_gs;
+ const uint32_t kernel_offset = 0;
+
gen7_3DSTATE_CONSTANT_GS(r->builder, 0, 0, 0);
- gen7_disable_3DSTATE_GS(r->builder);
+
+ if (ilo_dev_gen(r->dev) >= ILO_GEN(8))
+ gen8_3DSTATE_GS(r->builder, gs, kernel_offset);
+ else
+ gen7_3DSTATE_GS(r->builder, gs, kernel_offset);
}
/* 3DSTATE_BINDING_TABLE_POINTERS_GS */
@@ -421,7 +414,7 @@ gen7_draw_sol(struct ilo_render *r,
const struct ilo_state_vector *vec,
struct ilo_render_draw_session *session)
{
- const struct pipe_stream_output_info *so_info;
+ const struct ilo_state_sol *sol;
const struct ilo_shader_state *shader;
bool dirty_sh = false;
@@ -434,41 +427,54 @@ gen7_draw_sol(struct ilo_render *r,
dirty_sh = DIRTY(VS);
}
- so_info = ilo_shader_get_kernel_so_info(shader);
+ sol = ilo_shader_get_kernel_sol(shader);
/* 3DSTATE_SO_BUFFER */
if ((DIRTY(SO) || dirty_sh || r->batch_bo_changed) &&
vec->so.enabled) {
int i;
- for (i = 0; i < vec->so.count; i++) {
- const int stride = so_info->stride[i] * 4; /* in bytes */
-
- gen7_3DSTATE_SO_BUFFER(r->builder, i, stride, vec->so.states[i]);
+ for (i = 0; i < ILO_STATE_SOL_MAX_BUFFER_COUNT; i++) {
+ const struct pipe_stream_output_target *target =
+ (i < vec->so.count && vec->so.states[i]) ?
+ vec->so.states[i] : NULL;
+ const struct ilo_state_sol_buffer *sb = (target) ?
+ &((const struct ilo_stream_output_target *) target)->sb :
+ &vec->so.dummy_sb;
+
+ if (ilo_dev_gen(r->dev) >= ILO_GEN(8))
+ gen8_3DSTATE_SO_BUFFER(r->builder, sol, sb, i);
+ else
+ gen7_3DSTATE_SO_BUFFER(r->builder, sol, sb, i);
}
-
- for (; i < 4; i++)
- gen7_disable_3DSTATE_SO_BUFFER(r->builder, i);
}
/* 3DSTATE_SO_DECL_LIST */
if (dirty_sh && vec->so.enabled)
- gen7_3DSTATE_SO_DECL_LIST(r->builder, so_info);
-
- /* 3DSTATE_STREAMOUT */
- if (DIRTY(SO) || DIRTY(RASTERIZER) || dirty_sh) {
- const int output_count = ilo_shader_get_kernel_param(shader,
- ILO_KERNEL_OUTPUT_COUNT);
- int buf_strides[4] = { 0, 0, 0, 0 };
- int i;
+ gen7_3DSTATE_SO_DECL_LIST(r->builder, sol);
- for (i = 0; i < vec->so.count; i++)
- buf_strides[i] = so_info->stride[i] * 4;
+ /*
+ * From the Ivy Bridge PRM, volume 2 part 1, page 196-197:
+ *
+ * "Anytime the SOL unit MMIO registers or non-pipeline state are
+ * written, the SOL unit needs to receive a pipeline state update with
+ * SOL unit dirty state for information programmed in MMIO/NP to get
+ * loaded into the SOL unit.
+ *
+ * The SOL unit incorrectly double buffers MMIO/NP registers and only
+ * moves them into the design for usage when control topology is
+ * received with the SOL unit dirty state.
+ *
+ * If the state does not change, need to resend the same state.
+ *
+ * Because of corruption, software must flush the whole fixed function
+ * pipeline when 3DSTATE_STREAMOUT changes state."
+ *
+ * The first and fourth paragraphs are gone on Gen7.5+.
+ */
- gen7_3DSTATE_STREAMOUT(r->builder, 0,
- vec->rasterizer->state.rasterizer_discard,
- output_count, buf_strides);
- }
+ /* 3DSTATE_STREAMOUT */
+ gen7_3DSTATE_STREAMOUT(r->builder, sol);
}
static void
@@ -477,22 +483,17 @@ gen7_draw_sf(struct ilo_render *r,
struct ilo_render_draw_session *session)
{
/* 3DSTATE_SBE */
- if (DIRTY(RASTERIZER) || DIRTY(FS)) {
- gen7_3DSTATE_SBE(r->builder, vec->fs, (vec->rasterizer) ?
- vec->rasterizer->state.sprite_coord_mode : 0);
+ if (DIRTY(FS)) {
+ const struct ilo_state_sbe *sbe = ilo_shader_get_kernel_sbe(vec->fs);
+ gen7_3DSTATE_SBE(r->builder, sbe);
}
/* 3DSTATE_SF */
- if (DIRTY(RASTERIZER) || DIRTY(FB)) {
- struct pipe_surface *zs = vec->fb.state.zsbuf;
-
+ if (session->rs_delta.dirty & ILO_STATE_RASTER_3DSTATE_SF) {
if (ilo_dev_gen(r->dev) == ILO_GEN(7))
gen7_wa_pre_3dstate_sf_depth_bias(r);
- gen7_3DSTATE_SF(r->builder,
- (vec->rasterizer) ? &vec->rasterizer->sf : NULL,
- (zs) ? zs->format : PIPE_FORMAT_NONE,
- vec->fb.num_samples);
+ gen7_3DSTATE_SF(r->builder, &vec->rasterizer->rs);
}
}
@@ -501,13 +502,12 @@ gen7_draw_wm(struct ilo_render *r,
const struct ilo_state_vector *vec,
struct ilo_render_draw_session *session)
{
- /* 3DSTATE_WM */
- if (DIRTY(FS) || DIRTY(BLEND) || DIRTY(DSA) || DIRTY(RASTERIZER)) {
- const bool cc_may_kill = (vec->dsa->dw_blend_alpha ||
- vec->blend->alpha_to_coverage);
+ const union ilo_shader_cso *cso = ilo_shader_get_kernel_cso(vec->fs);
+ const uint32_t kernel_offset = ilo_shader_get_kernel_offset(vec->fs);
- gen7_3DSTATE_WM(r->builder, vec->fs, vec->rasterizer, cc_may_kill);
- }
+ /* 3DSTATE_WM */
+ if (DIRTY(FS) || (session->rs_delta.dirty & ILO_STATE_RASTER_3DSTATE_WM))
+ gen7_3DSTATE_WM(r->builder, &vec->rasterizer->rs, &cso->ps);
/* 3DSTATE_BINDING_TABLE_POINTERS_PS */
if (session->binding_table_fs_changed) {
@@ -530,13 +530,11 @@ gen7_draw_wm(struct ilo_render *r,
}
/* 3DSTATE_PS */
- if (DIRTY(FS) || DIRTY(BLEND) || r->instruction_bo_changed) {
- const bool dual_blend = vec->blend->dual_blend;
-
+ if (DIRTY(FS) || r->instruction_bo_changed) {
if (r->hw_ctx_changed)
gen7_wa_pre_3dstate_ps_max_threads(r);
- gen7_3DSTATE_PS(r->builder, vec->fs, dual_blend);
+ gen7_3DSTATE_PS(r->builder, &cso->ps, kernel_offset);
}
/* 3DSTATE_SCISSOR_STATE_POINTERS */
@@ -569,7 +567,7 @@ gen7_draw_wm(struct ilo_render *r,
/* 3DSTATE_DEPTH_BUFFER and 3DSTATE_CLEAR_PARAMS */
if (DIRTY(FB) || r->batch_bo_changed) {
- const struct ilo_zs_surface *zs;
+ const struct ilo_state_zs *zs;
uint32_t clear_params;
if (vec->fb.state.zsbuf) {
@@ -588,7 +586,7 @@ gen7_draw_wm(struct ilo_render *r,
clear_params = 0;
}
- gen6_3DSTATE_DEPTH_BUFFER(r->builder, zs, false);
+ gen6_3DSTATE_DEPTH_BUFFER(r->builder, zs);
gen6_3DSTATE_HIER_DEPTH_BUFFER(r->builder, zs);
gen6_3DSTATE_STENCIL_BUFFER(r->builder, zs);
gen7_3DSTATE_CLEAR_PARAMS(r->builder, clear_params);
@@ -600,24 +598,21 @@ gen7_draw_wm_multisample(struct ilo_render *r,
const struct ilo_state_vector *vec,
struct ilo_render_draw_session *session)
{
- /* 3DSTATE_MULTISAMPLE and 3DSTATE_SAMPLE_MASK */
- if (DIRTY(SAMPLE_MASK) || DIRTY(FB)) {
- const uint32_t *pattern;
+ /* 3DSTATE_MULTISAMPLE */
+ if (DIRTY(FB) || (session->rs_delta.dirty &
+ ILO_STATE_RASTER_3DSTATE_MULTISAMPLE)) {
+ const uint8_t sample_count = (vec->fb.num_samples > 4) ? 8 :
+ (vec->fb.num_samples > 1) ? 4 : 1;
gen7_wa_pre_3dstate_multisample(r);
- pattern = (vec->fb.num_samples > 4) ? r->sample_pattern_8x :
- (vec->fb.num_samples > 1) ? &r->sample_pattern_4x :
- &r->sample_pattern_1x;
-
- gen6_3DSTATE_MULTISAMPLE(r->builder,
- vec->fb.num_samples, pattern,
- vec->rasterizer->state.half_pixel_center);
-
- gen7_3DSTATE_SAMPLE_MASK(r->builder,
- (vec->fb.num_samples > 1) ? vec->sample_mask : 0x1,
- vec->fb.num_samples);
+ gen6_3DSTATE_MULTISAMPLE(r->builder, &vec->rasterizer->rs,
+ &r->sample_pattern, sample_count);
}
+
+ /* 3DSTATE_SAMPLE_MASK */
+ if (session->rs_delta.dirty & ILO_STATE_RASTER_3DSTATE_SAMPLE_MASK)
+ gen6_3DSTATE_SAMPLE_MASK(r->builder, &vec->rasterizer->rs);
}
void
@@ -654,28 +649,15 @@ ilo_render_emit_draw_commands_gen7(struct ilo_render *render,
gen6_draw_sf_rect(render, vec, session);
gen6_draw_vf(render, vec, session);
- ilo_render_3dprimitive(render, vec->draw, &vec->ib);
+ ilo_render_3dprimitive(render, &vec->draw_info);
}
static void
gen7_rectlist_pcb_alloc(struct ilo_render *r,
const struct ilo_blitter *blitter)
{
- /*
- * Push constant buffers are only allowed to take up at most the first
- * 16KB of the URB. Split the space evenly for VS and FS.
- */
- const int max_size =
- (ilo_dev_gen(r->dev) >= ILO_GEN(8)) ||
- (ilo_dev_gen(r->dev) == ILO_GEN(7.5) && r->dev->gt == 3) ?
- 32768 : 16384;
- const int size = max_size / 2;
- int offset = 0;
-
- gen7_3DSTATE_PUSH_CONSTANT_ALLOC_VS(r->builder, offset, size);
- offset += size;
-
- gen7_3DSTATE_PUSH_CONSTANT_ALLOC_PS(r->builder, offset, size);
+ gen7_3DSTATE_PUSH_CONSTANT_ALLOC_VS(r->builder, &blitter->urb);
+ gen7_3DSTATE_PUSH_CONSTANT_ALLOC_PS(r->builder, &blitter->urb);
if (ilo_dev_gen(r->dev) == ILO_GEN(7))
gen7_wa_post_3dstate_push_constant_alloc_ps(r);
@@ -685,19 +667,10 @@ static void
gen7_rectlist_urb(struct ilo_render *r,
const struct ilo_blitter *blitter)
{
- /* the first 16KB are reserved for VS and PS PCBs */
- const int offset =
- (ilo_dev_gen(r->dev) >= ILO_GEN(8)) ||
- (ilo_dev_gen(r->dev) == ILO_GEN(7.5) && r->dev->gt == 3) ?
- 32768 : 16384;
-
- gen7_3DSTATE_URB_VS(r->builder, offset, r->dev->urb_size - offset,
- (blitter->ve.count + blitter->ve.prepend_nosrc_cso) *
- 4 * sizeof(float));
-
- gen7_3DSTATE_URB_GS(r->builder, offset, 0, 0);
- gen7_3DSTATE_URB_HS(r->builder, offset, 0, 0);
- gen7_3DSTATE_URB_DS(r->builder, offset, 0, 0);
+ gen7_3DSTATE_URB_VS(r->builder, &blitter->urb);
+ gen7_3DSTATE_URB_GS(r->builder, &blitter->urb);
+ gen7_3DSTATE_URB_HS(r->builder, &blitter->urb);
+ gen7_3DSTATE_URB_DS(r->builder, &blitter->urb);
}
static void
@@ -705,58 +678,40 @@ gen7_rectlist_vs_to_sf(struct ilo_render *r,
const struct ilo_blitter *blitter)
{
gen7_3DSTATE_CONSTANT_VS(r->builder, NULL, NULL, 0);
- gen6_disable_3DSTATE_VS(r->builder);
+ gen6_3DSTATE_VS(r->builder, &blitter->vs, 0);
gen7_3DSTATE_CONSTANT_HS(r->builder, NULL, NULL, 0);
- gen7_disable_3DSTATE_HS(r->builder);
+ gen7_3DSTATE_HS(r->builder, &blitter->hs, 0);
- gen7_3DSTATE_TE(r->builder);
+ gen7_3DSTATE_TE(r->builder, &blitter->ds);
gen7_3DSTATE_CONSTANT_DS(r->builder, NULL, NULL, 0);
- gen7_disable_3DSTATE_DS(r->builder);
+ gen7_3DSTATE_DS(r->builder, &blitter->ds, 0);
gen7_3DSTATE_CONSTANT_GS(r->builder, NULL, NULL, 0);
- gen7_disable_3DSTATE_GS(r->builder);
+ gen7_3DSTATE_GS(r->builder, &blitter->gs, 0);
- gen7_3DSTATE_STREAMOUT(r->builder, 0, false, 0x0, 0);
+ gen7_3DSTATE_STREAMOUT(r->builder, &blitter->sol);
- gen6_disable_3DSTATE_CLIP(r->builder);
+ gen6_3DSTATE_CLIP(r->builder, &blitter->fb.rs);
if (ilo_dev_gen(r->dev) == ILO_GEN(7))
gen7_wa_pre_3dstate_sf_depth_bias(r);
- gen7_3DSTATE_SF(r->builder, NULL, blitter->fb.dst.base.format,
- blitter->fb.num_samples);
- gen7_3DSTATE_SBE(r->builder, NULL, 0);
+ gen7_3DSTATE_SF(r->builder, &blitter->fb.rs);
+ gen7_3DSTATE_SBE(r->builder, &blitter->sbe);
}
static void
gen7_rectlist_wm(struct ilo_render *r,
const struct ilo_blitter *blitter)
{
- uint32_t hiz_op;
-
- switch (blitter->op) {
- case ILO_BLITTER_RECTLIST_CLEAR_ZS:
- hiz_op = GEN7_WM_DW1_DEPTH_CLEAR;
- break;
- case ILO_BLITTER_RECTLIST_RESOLVE_Z:
- hiz_op = GEN7_WM_DW1_DEPTH_RESOLVE;
- break;
- case ILO_BLITTER_RECTLIST_RESOLVE_HIZ:
- hiz_op = GEN7_WM_DW1_HIZ_RESOLVE;
- break;
- default:
- hiz_op = 0;
- break;
- }
-
- gen7_hiz_3DSTATE_WM(r->builder, hiz_op);
+ gen7_3DSTATE_WM(r->builder, &blitter->fb.rs, &blitter->ps);
gen7_3DSTATE_CONSTANT_PS(r->builder, NULL, NULL, 0);
gen7_wa_pre_3dstate_ps_max_threads(r);
- gen7_disable_3DSTATE_PS(r->builder);
+ gen7_3DSTATE_PS(r->builder, &blitter->ps, 0);
}
static void
@@ -766,10 +721,8 @@ gen7_rectlist_wm_depth(struct ilo_render *r,
gen7_wa_pre_depth(r);
if (blitter->uses & (ILO_BLITTER_USE_FB_DEPTH |
- ILO_BLITTER_USE_FB_STENCIL)) {
- gen6_3DSTATE_DEPTH_BUFFER(r->builder,
- &blitter->fb.dst.u.zs, true);
- }
+ ILO_BLITTER_USE_FB_STENCIL))
+ gen6_3DSTATE_DEPTH_BUFFER(r->builder, &blitter->fb.dst.u.zs);
if (blitter->uses & ILO_BLITTER_USE_FB_DEPTH) {
gen6_3DSTATE_HIER_DEPTH_BUFFER(r->builder,
@@ -789,18 +742,15 @@ static void
gen7_rectlist_wm_multisample(struct ilo_render *r,
const struct ilo_blitter *blitter)
{
- const uint32_t *pattern =
- (blitter->fb.num_samples > 4) ? r->sample_pattern_8x :
- (blitter->fb.num_samples > 1) ? &r->sample_pattern_4x :
- &r->sample_pattern_1x;
+ const uint8_t sample_count = (blitter->fb.num_samples > 4) ? 8 :
+ (blitter->fb.num_samples > 1) ? 4 : 1;
gen7_wa_pre_3dstate_multisample(r);
- gen6_3DSTATE_MULTISAMPLE(r->builder, blitter->fb.num_samples,
- pattern, true);
+ gen6_3DSTATE_MULTISAMPLE(r->builder, &blitter->fb.rs,
+ &r->sample_pattern, sample_count);
- gen7_3DSTATE_SAMPLE_MASK(r->builder,
- (1 << blitter->fb.num_samples) - 1, blitter->fb.num_samples);
+ gen6_3DSTATE_SAMPLE_MASK(r->builder, &blitter->fb.rs);
}
void
@@ -818,7 +768,7 @@ ilo_render_emit_rectlist_commands_gen7(struct ilo_render *r,
session->vb_start, session->vb_end,
sizeof(blitter->vertices[0]));
- gen6_3DSTATE_VERTEX_ELEMENTS(r->builder, &blitter->ve);
+ gen6_3DSTATE_VERTEX_ELEMENTS(r->builder, &blitter->vf);
gen7_rectlist_pcb_alloc(r, blitter);
@@ -854,7 +804,7 @@ ilo_render_emit_rectlist_commands_gen7(struct ilo_render *r,
if (ilo_dev_gen(r->dev) == ILO_GEN(7))
gen7_wa_post_ps_and_later(r);
- ilo_render_3dprimitive(r, &blitter->draw, NULL);
+ ilo_render_3dprimitive(r, &blitter->draw_info);
}
int
diff --git a/src/gallium/drivers/ilo/ilo_render_gen8.c b/src/gallium/drivers/ilo/ilo_render_gen8.c
index 715b93611f1..65494b4058a 100644
--- a/src/gallium/drivers/ilo/ilo_render_gen8.c
+++ b/src/gallium/drivers/ilo/ilo_render_gen8.c
@@ -28,9 +28,9 @@
#include "genhw/genhw.h"
#include "core/ilo_builder_3d.h"
#include "core/ilo_builder_render.h"
-#include "util/u_dual_blend.h"
#include "ilo_blitter.h"
+#include "ilo_resource.h"
#include "ilo_shader.h"
#include "ilo_state.h"
#include "ilo_render_gen.h"
@@ -66,26 +66,20 @@ gen8_draw_sf(struct ilo_render *r,
struct ilo_render_draw_session *session)
{
/* 3DSTATE_RASTER */
- if (DIRTY(RASTERIZER)) {
- gen8_3DSTATE_RASTER(r->builder, (vec->rasterizer) ?
- &vec->rasterizer->sf : NULL);
- }
+ if (session->rs_delta.dirty & ILO_STATE_RASTER_3DSTATE_RASTER)
+ gen8_3DSTATE_RASTER(r->builder, &vec->rasterizer->rs);
- /* 3DSTATE_SBE */
- if (DIRTY(RASTERIZER) || DIRTY(FS)) {
- gen8_3DSTATE_SBE(r->builder, vec->fs, (vec->rasterizer) ?
- vec->rasterizer->state.sprite_coord_mode : 0);
- }
+ /* 3DSTATE_SBE and 3DSTATE_SBE_SWIZ */
+ if (DIRTY(FS)) {
+ const struct ilo_state_sbe *sbe = ilo_shader_get_kernel_sbe(vec->fs);
- /* 3DSTATE_SBE_SWIZ */
- if (DIRTY(FS))
- gen8_3DSTATE_SBE_SWIZ(r->builder, vec->fs);
+ gen8_3DSTATE_SBE(r->builder, sbe);
+ gen8_3DSTATE_SBE_SWIZ(r->builder, sbe);
+ }
/* 3DSTATE_SF */
- if (DIRTY(RASTERIZER)) {
- gen8_3DSTATE_SF(r->builder, (vec->rasterizer) ?
- &vec->rasterizer->sf : NULL);
- }
+ if (session->rs_delta.dirty & ILO_STATE_RASTER_3DSTATE_SF)
+ gen7_3DSTATE_SF(r->builder, &vec->rasterizer->rs);
}
static void
@@ -93,12 +87,15 @@ gen8_draw_wm(struct ilo_render *r,
const struct ilo_state_vector *vec,
struct ilo_render_draw_session *session)
{
+ const union ilo_shader_cso *cso = ilo_shader_get_kernel_cso(vec->fs);
+ const uint32_t kernel_offset = ilo_shader_get_kernel_offset(vec->fs);
+
/* 3DSTATE_WM */
- if (DIRTY(FS) || DIRTY(RASTERIZER))
- gen8_3DSTATE_WM(r->builder, vec->fs, vec->rasterizer);
+ if (session->rs_delta.dirty & ILO_STATE_RASTER_3DSTATE_WM)
+ gen8_3DSTATE_WM(r->builder, &vec->rasterizer->rs);
- if (DIRTY(DSA))
- gen8_3DSTATE_WM_DEPTH_STENCIL(r->builder, vec->dsa);
+ if (session->cc_delta.dirty & ILO_STATE_CC_3DSTATE_WM_DEPTH_STENCIL)
+ gen8_3DSTATE_WM_DEPTH_STENCIL(r->builder, &vec->blend->cc);
/* 3DSTATE_WM_HZ_OP and 3DSTATE_WM_CHROMAKEY */
if (r->hw_ctx_changed) {
@@ -128,18 +125,15 @@ gen8_draw_wm(struct ilo_render *r,
/* 3DSTATE_PS */
if (DIRTY(FS) || r->instruction_bo_changed)
- gen8_3DSTATE_PS(r->builder, vec->fs);
+ gen8_3DSTATE_PS(r->builder, &cso->ps, kernel_offset);
/* 3DSTATE_PS_EXTRA */
- if (DIRTY(FS) || DIRTY(DSA) || DIRTY(BLEND)) {
- const bool cc_may_kill = (vec->dsa->dw_blend_alpha ||
- vec->blend->alpha_to_coverage);
- gen8_3DSTATE_PS_EXTRA(r->builder, vec->fs, cc_may_kill, false);
- }
+ if (DIRTY(FS))
+ gen8_3DSTATE_PS_EXTRA(r->builder, &cso->ps);
/* 3DSTATE_PS_BLEND */
- if (DIRTY(BLEND) || DIRTY(FB) || DIRTY(DSA))
- gen8_3DSTATE_PS_BLEND(r->builder, vec->blend, &vec->fb, vec->dsa);
+ if (session->cc_delta.dirty & ILO_STATE_CC_3DSTATE_PS_BLEND)
+ gen8_3DSTATE_PS_BLEND(r->builder, &vec->blend->cc);
/* 3DSTATE_SCISSOR_STATE_POINTERS */
if (session->scissor_changed) {
@@ -149,7 +143,7 @@ gen8_draw_wm(struct ilo_render *r,
/* 3DSTATE_DEPTH_BUFFER and 3DSTATE_CLEAR_PARAMS */
if (DIRTY(FB) || r->batch_bo_changed) {
- const struct ilo_zs_surface *zs;
+ const struct ilo_state_zs *zs;
uint32_t clear_params;
if (vec->fb.state.zsbuf) {
@@ -170,7 +164,7 @@ gen8_draw_wm(struct ilo_render *r,
gen8_wa_pre_depth(r);
- gen6_3DSTATE_DEPTH_BUFFER(r->builder, zs, false);
+ gen6_3DSTATE_DEPTH_BUFFER(r->builder, zs);
gen6_3DSTATE_HIER_DEPTH_BUFFER(r->builder, zs);
gen6_3DSTATE_STENCIL_BUFFER(r->builder, zs);
gen7_3DSTATE_CLEAR_PARAMS(r->builder, clear_params);
@@ -183,14 +177,8 @@ gen8_draw_wm_sample_pattern(struct ilo_render *r,
struct ilo_render_draw_session *session)
{
/* 3DSTATE_SAMPLE_PATTERN */
- if (r->hw_ctx_changed) {
- gen8_3DSTATE_SAMPLE_PATTERN(r->builder,
- &r->sample_pattern_1x,
- &r->sample_pattern_2x,
- &r->sample_pattern_4x,
- r->sample_pattern_8x,
- r->sample_pattern_16x);
- }
+ if (r->hw_ctx_changed)
+ gen8_3DSTATE_SAMPLE_PATTERN(r->builder, &r->sample_pattern);
}
static void
@@ -198,15 +186,13 @@ gen8_draw_wm_multisample(struct ilo_render *r,
const struct ilo_state_vector *vec,
struct ilo_render_draw_session *session)
{
- /* 3DSTATE_MULTISAMPLE and 3DSTATE_SAMPLE_MASK */
- if (DIRTY(SAMPLE_MASK) || DIRTY(FB) || DIRTY(RASTERIZER)) {
- gen8_3DSTATE_MULTISAMPLE(r->builder, vec->fb.num_samples,
- vec->rasterizer->state.half_pixel_center);
-
- gen7_3DSTATE_SAMPLE_MASK(r->builder,
- (vec->fb.num_samples > 1) ? vec->sample_mask : 0x1,
- vec->fb.num_samples);
- }
+ /* 3DSTATE_MULTISAMPLE */
+ if (session->rs_delta.dirty & ILO_STATE_RASTER_3DSTATE_MULTISAMPLE)
+ gen8_3DSTATE_MULTISAMPLE(r->builder, &vec->rasterizer->rs);
+
+ /* 3DSTATE_SAMPLE_MASK */
+ if (session->rs_delta.dirty & ILO_STATE_RASTER_3DSTATE_SAMPLE_MASK)
+ gen6_3DSTATE_SAMPLE_MASK(r->builder, &vec->rasterizer->rs);
}
static void
@@ -214,36 +200,38 @@ gen8_draw_vf(struct ilo_render *r,
const struct ilo_state_vector *vec,
struct ilo_render_draw_session *session)
{
- int i;
-
/* 3DSTATE_INDEX_BUFFER */
- if (DIRTY(IB) || r->batch_bo_changed)
- gen8_3DSTATE_INDEX_BUFFER(r->builder, &vec->ib);
+ if ((session->vf_delta.dirty & ILO_STATE_VF_3DSTATE_INDEX_BUFFER) ||
+ DIRTY(IB) || r->batch_bo_changed)
+ gen8_3DSTATE_INDEX_BUFFER(r->builder, &vec->ve->vf, &vec->ib.ib);
/* 3DSTATE_VF */
- if (session->primitive_restart_changed) {
- gen75_3DSTATE_VF(r->builder, vec->draw->primitive_restart,
- vec->draw->restart_index);
- }
+ if (session->vf_delta.dirty & ILO_STATE_VF_3DSTATE_VF)
+ gen75_3DSTATE_VF(r->builder, &vec->ve->vf);
/* 3DSTATE_VERTEX_BUFFERS */
- if (DIRTY(VB) || DIRTY(VE) || r->batch_bo_changed)
- gen6_3DSTATE_VERTEX_BUFFERS(r->builder, vec->ve, &vec->vb);
+ if ((session->vf_delta.dirty & ILO_STATE_VF_3DSTATE_VERTEX_BUFFERS) ||
+ DIRTY(VB) || DIRTY(VE) || r->batch_bo_changed) {
+ gen6_3DSTATE_VERTEX_BUFFERS(r->builder, &vec->ve->vf,
+ vec->vb.vb, vec->ve->vb_count);
+ }
/* 3DSTATE_VERTEX_ELEMENTS */
- if (DIRTY(VE))
- gen6_3DSTATE_VERTEX_ELEMENTS(r->builder, vec->ve);
+ if (session->vf_delta.dirty & ILO_STATE_VF_3DSTATE_VERTEX_ELEMENTS)
+ gen6_3DSTATE_VERTEX_ELEMENTS(r->builder, &vec->ve->vf);
+
+ gen8_3DSTATE_VF_TOPOLOGY(r->builder, vec->draw_info.topology);
- gen8_3DSTATE_VF_TOPOLOGY(r->builder, vec->draw->mode);
+ if (session->vf_delta.dirty & ILO_STATE_VF_3DSTATE_VF_INSTANCING) {
+ const uint8_t attr_count = ilo_state_vf_get_attr_count(&vec->ve->vf);
+ uint8_t i;
- for (i = 0; i < vec->ve->vb_count; i++) {
- gen8_3DSTATE_VF_INSTANCING(r->builder, i,
- vec->ve->instance_divisors[i]);
+ for (i = 0; i < attr_count; i++)
+ gen8_3DSTATE_VF_INSTANCING(r->builder, &vec->ve->vf, i);
}
- gen8_3DSTATE_VF_SGVS(r->builder,
- false, 0, 0,
- false, 0, 0);
+ if (session->vf_delta.dirty & ILO_STATE_VF_3DSTATE_VF_SGVS)
+ gen8_3DSTATE_VF_SGVS(r->builder, &vec->ve->vf);
}
void
@@ -281,7 +269,7 @@ ilo_render_emit_draw_commands_gen8(struct ilo_render *render,
gen6_draw_sf_rect(render, vec, session);
gen8_draw_vf(render, vec, session);
- ilo_render_3dprimitive(render, vec->draw, &vec->ib);
+ ilo_render_3dprimitive(render, &vec->draw_info);
}
int
@@ -365,17 +353,13 @@ ilo_render_emit_rectlist_commands_gen8(struct ilo_render *r,
const struct ilo_blitter *blitter,
const struct ilo_render_rectlist_session *session)
{
- uint32_t op;
-
ILO_DEV_ASSERT(r->dev, 8, 8);
gen8_wa_pre_depth(r);
if (blitter->uses & (ILO_BLITTER_USE_FB_DEPTH |
- ILO_BLITTER_USE_FB_STENCIL)) {
- gen6_3DSTATE_DEPTH_BUFFER(r->builder,
- &blitter->fb.dst.u.zs, true);
- }
+ ILO_BLITTER_USE_FB_STENCIL))
+ gen6_3DSTATE_DEPTH_BUFFER(r->builder, &blitter->fb.dst.u.zs);
if (blitter->uses & ILO_BLITTER_USE_FB_DEPTH) {
gen6_3DSTATE_HIER_DEPTH_BUFFER(r->builder,
@@ -393,27 +377,8 @@ ilo_render_emit_rectlist_commands_gen8(struct ilo_render *r,
gen6_3DSTATE_DRAWING_RECTANGLE(r->builder, 0, 0,
blitter->fb.width, blitter->fb.height);
- switch (blitter->op) {
- case ILO_BLITTER_RECTLIST_CLEAR_ZS:
- op = 0;
- if (blitter->uses & ILO_BLITTER_USE_FB_DEPTH)
- op |= GEN8_WM_HZ_DW1_DEPTH_CLEAR;
- if (blitter->uses & ILO_BLITTER_USE_FB_STENCIL)
- op |= GEN8_WM_HZ_DW1_STENCIL_CLEAR;
- break;
- case ILO_BLITTER_RECTLIST_RESOLVE_Z:
- op = GEN8_WM_HZ_DW1_DEPTH_RESOLVE;
- break;
- case ILO_BLITTER_RECTLIST_RESOLVE_HIZ:
- op = GEN8_WM_HZ_DW1_HIZ_RESOLVE;
- break;
- default:
- op = 0;
- break;
- }
-
- gen8_3DSTATE_WM_HZ_OP(r->builder, op, blitter->fb.width,
- blitter->fb.height, blitter->fb.num_samples);
+ gen8_3DSTATE_WM_HZ_OP(r->builder, &blitter->fb.rs,
+ blitter->fb.width, blitter->fb.height);
ilo_render_pipe_control(r, GEN6_PIPE_CONTROL_WRITE_IMM);
diff --git a/src/gallium/drivers/ilo/ilo_render_media.c b/src/gallium/drivers/ilo/ilo_render_media.c
index 387920a912c..a0de0024d61 100644
--- a/src/gallium/drivers/ilo/ilo_render_media.c
+++ b/src/gallium/drivers/ilo/ilo_render_media.c
@@ -30,6 +30,7 @@
#include "core/ilo_builder_mi.h"
#include "core/ilo_builder_render.h"
+#include "ilo_shader.h"
#include "ilo_state.h"
#include "ilo_render_gen.h"
@@ -206,7 +207,7 @@ ilo_render_emit_launch_grid_commands(struct ilo_render *render,
gen6_state_base_address(render->builder, true);
- gen6_MEDIA_VFE_STATE(render->builder, pcb_size, use_slm);
+ gen6_MEDIA_VFE_STATE(render->builder, &session->compute);
if (pcb_size)
gen6_MEDIA_CURBE_LOAD(render->builder, pcb, pcb_size);
diff --git a/src/gallium/drivers/ilo/ilo_render_surface.c b/src/gallium/drivers/ilo/ilo_render_surface.c
index b345dfb4fc4..ad053564294 100644
--- a/src/gallium/drivers/ilo/ilo_render_surface.c
+++ b/src/gallium/drivers/ilo/ilo_render_surface.c
@@ -29,11 +29,65 @@
#include "ilo_common.h"
#include "ilo_blitter.h"
+#include "ilo_resource.h"
+#include "ilo_shader.h"
#include "ilo_state.h"
#include "ilo_render_gen.h"
#define DIRTY(state) (session->pipe_dirty & ILO_DIRTY_ ## state)
+static inline uint32_t
+gen6_so_SURFACE_STATE(struct ilo_builder *builder,
+ const struct pipe_stream_output_target *so,
+ const struct pipe_stream_output_info *so_info,
+ int so_index)
+{
+ struct ilo_buffer *buf = ilo_buffer(so->buffer);
+ struct ilo_state_surface_buffer_info info;
+ struct ilo_state_surface surf;
+
+ ILO_DEV_ASSERT(builder->dev, 6, 6);
+
+ memset(&info, 0, sizeof(info));
+ info.buf = buf;
+ info.access = ILO_STATE_SURFACE_ACCESS_DP_SVB;
+
+ switch (so_info->output[so_index].num_components) {
+ case 1:
+ info.format = GEN6_FORMAT_R32_FLOAT;
+ info.format_size = 4;
+ break;
+ case 2:
+ info.format = GEN6_FORMAT_R32G32_FLOAT;
+ info.format_size = 8;
+ break;
+ case 3:
+ info.format = GEN6_FORMAT_R32G32B32_FLOAT;
+ info.format_size = 12;
+ break;
+ case 4:
+ info.format = GEN6_FORMAT_R32G32B32A32_FLOAT;
+ info.format_size = 16;
+ break;
+ default:
+ assert(!"unexpected SO components length");
+ info.format = GEN6_FORMAT_R32_FLOAT;
+ info.format_size = 4;
+ break;
+ }
+
+ info.struct_size =
+ so_info->stride[so_info->output[so_index].output_buffer] * 4;
+ info.offset = so->buffer_offset + so_info->output[so_index].dst_offset * 4;
+ info.size = so->buffer_size - so_info->output[so_index].dst_offset * 4;
+
+ memset(&surf, 0, sizeof(surf));
+ ilo_state_surface_init_for_buffer(&surf, builder->dev, &info);
+ surf.bo = info.buf->bo;
+
+ return gen6_SURFACE_STATE(builder, &surf);
+}
+
static void
gen6_emit_draw_surface_rt(struct ilo_render *r,
const struct ilo_state_vector *vec,
@@ -64,11 +118,9 @@ gen6_emit_draw_surface_rt(struct ilo_render *r,
(const struct ilo_surface_cso *) fb->state.cbufs[i];
assert(surface->is_rt);
- surface_state[i] =
- gen6_SURFACE_STATE(r->builder, &surface->u.rt, true);
+ surface_state[i] = gen6_SURFACE_STATE(r->builder, &surface->u.rt);
} else {
- surface_state[i] =
- gen6_SURFACE_STATE(r->builder, &fb->null_rt, true);
+ surface_state[i] = gen6_SURFACE_STATE(r->builder, &fb->null_rt);
}
}
}
@@ -173,8 +225,7 @@ gen6_emit_draw_surface_view(struct ilo_render *r,
const struct ilo_view_cso *cso =
(const struct ilo_view_cso *) view->states[i];
- surface_state[i] =
- gen6_SURFACE_STATE(r->builder, &cso->surface, false);
+ surface_state[i] = gen6_SURFACE_STATE(r->builder, &cso->surface);
} else {
surface_state[i] = 0;
}
@@ -228,12 +279,10 @@ gen6_emit_draw_surface_const(struct ilo_render *r,
for (i = 0; i < count; i++) {
const struct ilo_cbuf_cso *cso = &cbuf->cso[i];
- if (cso->resource) {
- surface_state[i] = gen6_SURFACE_STATE(r->builder,
- &cso->surface, false);
- } else {
+ if (cso->resource)
+ surface_state[i] = gen6_SURFACE_STATE(r->builder, &cso->surface);
+ else
surface_state[i] = 0;
- }
}
}
@@ -406,8 +455,7 @@ gen6_emit_launch_grid_surface_view(struct ilo_render *r,
const struct ilo_view_cso *cso =
(const struct ilo_view_cso *) view->states[i];
- surface_state[i] =
- gen6_SURFACE_STATE(r->builder, &cso->surface, false);
+ surface_state[i] = gen6_SURFACE_STATE(r->builder, &cso->surface);
} else {
surface_state[i] = 0;
}
@@ -421,7 +469,8 @@ gen6_emit_launch_grid_surface_const(struct ilo_render *r,
{
const struct ilo_shader_state *cs = vec->cs;
uint32_t *surface_state = r->state.cs.SURFACE_STATE;
- struct ilo_view_surface view;
+ struct ilo_state_surface_buffer_info info;
+ struct ilo_state_surface surf;
int base, count;
ILO_DEV_ASSERT(r->dev, 7, 7.5);
@@ -432,15 +481,22 @@ gen6_emit_launch_grid_surface_const(struct ilo_render *r,
if (!count)
return;
- ilo_gpe_init_view_surface_for_buffer(r->dev,
- ilo_buffer(session->input->buffer),
- session->input->buffer_offset,
- session->input->buffer_size,
- 1, PIPE_FORMAT_NONE,
- false, false, &view);
+ memset(&info, 0, sizeof(info));
+ info.buf = ilo_buffer(session->input->buffer);
+ info.access = ILO_STATE_SURFACE_ACCESS_DP_UNTYPED;
+ info.format = GEN6_FORMAT_RAW;
+ info.format_size = 1;
+ info.struct_size = 1;
+ info.readonly = true;
+ info.offset = session->input->buffer_offset;
+ info.size = session->input->buffer_size;
+
+ memset(&surf, 0, sizeof(surf));
+ ilo_state_surface_init_for_buffer(&surf, r->dev, &info);
+ surf.bo = info.buf->bo;
assert(count == 1 && session->input->buffer);
- surface_state[base] = gen6_SURFACE_STATE(r->builder, &view, false);
+ surface_state[base] = gen6_SURFACE_STATE(r->builder, &surf);
}
static void
@@ -483,14 +539,24 @@ gen6_emit_launch_grid_surface_global(struct ilo_render *r,
for (i = 0; i < count; i++) {
if (i < vec->global_binding.count && bindings[i].resource) {
const struct ilo_buffer *buf = ilo_buffer(bindings[i].resource);
- struct ilo_view_surface view;
+ struct ilo_state_surface_buffer_info info;
+ struct ilo_state_surface surf;
assert(bindings[i].resource->target == PIPE_BUFFER);
- ilo_gpe_init_view_surface_for_buffer(r->dev, buf, 0, buf->bo_size,
- 1, PIPE_FORMAT_NONE, true, true, &view);
- surface_state[i] =
- gen6_SURFACE_STATE(r->builder, &view, true);
+ memset(&info, 0, sizeof(info));
+ info.buf = buf;
+ info.access = ILO_STATE_SURFACE_ACCESS_DP_UNTYPED;
+ info.format = GEN6_FORMAT_RAW;
+ info.format_size = 1;
+ info.struct_size = 1;
+ info.size = buf->bo_size;
+
+ memset(&surf, 0, sizeof(surf));
+ ilo_state_surface_init_for_buffer(&surf, r->dev, &info);
+ surf.bo = info.buf->bo;
+
+ surface_state[i] = gen6_SURFACE_STATE(r->builder, &surf);
} else {
surface_state[i] = 0;
}
diff --git a/src/gallium/drivers/ilo/ilo_resource.c b/src/gallium/drivers/ilo/ilo_resource.c
index ad4852278d0..be9fd10a84c 100644
--- a/src/gallium/drivers/ilo/ilo_resource.c
+++ b/src/gallium/drivers/ilo/ilo_resource.c
@@ -178,8 +178,8 @@ tex_create_bo(struct ilo_texture *tex)
if (!bo)
return false;
- ilo_image_set_bo(&tex->image, bo);
- intel_bo_unref(bo);
+ intel_bo_unref(tex->image.bo);
+ tex->image.bo = bo;
return true;
}
@@ -223,7 +223,7 @@ tex_create_hiz(struct ilo_texture *tex)
if (!bo)
return false;
- ilo_image_set_aux_bo(&tex->image, bo);
+ tex->image.aux.bo = bo;
if (tex->imported) {
unsigned lv;
@@ -256,7 +256,7 @@ tex_create_mcs(struct ilo_texture *tex)
if (!bo)
return false;
- ilo_image_set_aux_bo(&tex->image, bo);
+ tex->image.aux.bo = bo;
return true;
}
@@ -267,7 +267,8 @@ tex_destroy(struct ilo_texture *tex)
if (tex->separate_s8)
tex_destroy(tex->separate_s8);
- ilo_image_cleanup(&tex->image);
+ intel_bo_unref(tex->image.bo);
+ intel_bo_unref(tex->image.aux.bo);
tex_free_slices(tex);
FREE(tex);
@@ -287,15 +288,13 @@ tex_alloc_bos(struct ilo_texture *tex)
switch (tex->image.aux.type) {
case ILO_IMAGE_AUX_HIZ:
- if (!tex_create_hiz(tex)) {
- /* Separate Stencil Buffer requires HiZ to be enabled */
- if (ilo_dev_gen(&is->dev) == ILO_GEN(6) &&
- tex->image.separate_stencil)
- return false;
- }
+ if (!tex_create_hiz(tex) &&
+ !ilo_image_disable_aux(&tex->image, &is->dev))
+ return false;
break;
case ILO_IMAGE_AUX_MCS:
- if (!tex_create_mcs(tex))
+ if (!tex_create_mcs(tex) &&
+ !ilo_image_disable_aux(&tex->image, &is->dev))
return false;
break;
default:
@@ -328,8 +327,7 @@ tex_import_handle(struct ilo_texture *tex,
return false;
}
- ilo_image_set_bo(&tex->image, bo);
- intel_bo_unref(bo);
+ tex->image.bo = bo;
tex->imported = true;
@@ -427,8 +425,8 @@ buf_create_bo(struct ilo_buffer_resource *buf)
if (!bo)
return false;
- ilo_buffer_set_bo(&buf->buffer, bo);
- intel_bo_unref(bo);
+ intel_bo_unref(buf->buffer.bo);
+ buf->buffer.bo = bo;
return true;
}
@@ -436,7 +434,7 @@ buf_create_bo(struct ilo_buffer_resource *buf)
static void
buf_destroy(struct ilo_buffer_resource *buf)
{
- ilo_buffer_cleanup(&buf->buffer);
+ intel_bo_unref(buf->buffer.bo);
FREE(buf);
}
@@ -445,6 +443,7 @@ buf_create(struct pipe_screen *screen, const struct pipe_resource *templ)
{
const struct ilo_screen *is = ilo_screen(screen);
struct ilo_buffer_resource *buf;
+ unsigned size;
buf = CALLOC_STRUCT(ilo_buffer_resource);
if (!buf)
@@ -454,8 +453,25 @@ buf_create(struct pipe_screen *screen, const struct pipe_resource *templ)
buf->base.screen = screen;
pipe_reference_init(&buf->base.reference, 1);
- ilo_buffer_init(&buf->buffer, &is->dev,
- templ->width0, templ->bind, templ->flags);
+ size = templ->width0;
+
+ /*
+ * As noted in ilo_format_translate(), we treat some 3-component formats as
+ * 4-component formats to work around hardware limitations. Imagine the
+ * case where the vertex buffer holds a single PIPE_FORMAT_R16G16B16_FLOAT
+ * vertex, and buf->bo_size is 6. The hardware would fail to fetch it at
+ * boundary check because the vertex buffer is expected to hold a
+ * PIPE_FORMAT_R16G16B16A16_FLOAT vertex and that takes at least 8 bytes.
+ *
+ * For the workaround to work, we should add 2 to the bo size. But that
+ * would waste a page when the bo size is already page aligned. Let's
+ * round it to page size for now and revisit this when needed.
+ */
+ if ((templ->bind & PIPE_BIND_VERTEX_BUFFER) &&
+ ilo_dev_gen(&is->dev) < ILO_GEN(7.5))
+ size = align(size, 4096);
+
+ ilo_buffer_init(&buf->buffer, &is->dev, size, templ->bind, templ->flags);
if (buf->buffer.bo_size < templ->width0 ||
buf->buffer.bo_size > ilo_max_resource_size ||
diff --git a/src/gallium/drivers/ilo/ilo_screen.c b/src/gallium/drivers/ilo/ilo_screen.c
index 918af0820de..94105559b80 100644
--- a/src/gallium/drivers/ilo/ilo_screen.c
+++ b/src/gallium/drivers/ilo/ilo_screen.c
@@ -31,11 +31,10 @@
#include "vl/vl_decoder.h"
#include "vl/vl_video_buffer.h"
#include "genhw/genhw.h" /* for GEN6_REG_TIMESTAMP */
-#include "core/ilo_fence.h"
-#include "core/ilo_format.h"
#include "core/intel_winsys.h"
#include "ilo_context.h"
+#include "ilo_format.h"
#include "ilo_resource.h"
#include "ilo_transfer.h" /* for ILO_TRANSFER_MAP_BUFFER_ALIGNMENT */
#include "ilo_public.h"
@@ -43,8 +42,7 @@
struct pipe_fence_handle {
struct pipe_reference reference;
-
- struct ilo_fence fence;
+ struct intel_bo *seqno_bo;
};
static float
@@ -347,7 +345,7 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_INDEP_BLEND_FUNC:
return true;
case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
- return (ilo_dev_gen(&is->dev) >= ILO_GEN(7)) ? 2048 : 512;
+ return (ilo_dev_gen(&is->dev) >= ILO_GEN(7.5)) ? 2048 : 512;
case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
@@ -458,6 +456,7 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_SAMPLER_VIEW_TARGET:
case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
+ case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
return 0;
case PIPE_CAP_VENDOR_ID:
@@ -641,7 +640,7 @@ ilo_screen_fence_reference(struct pipe_screen *screen,
STATIC_ASSERT(&((struct pipe_fence_handle *) NULL)->reference == NULL);
if (pipe_reference(&old->reference, &fence->reference)) {
- ilo_fence_cleanup(&old->fence);
+ intel_bo_unref(old->seqno_bo);
FREE(old);
}
}
@@ -654,10 +653,14 @@ ilo_screen_fence_finish(struct pipe_screen *screen,
const int64_t wait_timeout = (timeout > INT64_MAX) ? -1 : timeout;
bool signaled;
- signaled = ilo_fence_wait(&fence->fence, wait_timeout);
+ signaled = (!fence->seqno_bo ||
+ intel_bo_wait(fence->seqno_bo, wait_timeout) == 0);
+
/* XXX not thread safe */
- if (signaled)
- ilo_fence_set_seq_bo(&fence->fence, NULL);
+ if (signaled && fence->seqno_bo) {
+ intel_bo_unref(fence->seqno_bo);
+ fence->seqno_bo = NULL;
+ }
return signaled;
}
@@ -676,7 +679,6 @@ ilo_screen_fence_signalled(struct pipe_screen *screen,
struct pipe_fence_handle *
ilo_screen_fence_create(struct pipe_screen *screen, struct intel_bo *bo)
{
- struct ilo_screen *is = ilo_screen(screen);
struct pipe_fence_handle *fence;
fence = CALLOC_STRUCT(pipe_fence_handle);
@@ -685,8 +687,7 @@ ilo_screen_fence_create(struct pipe_screen *screen, struct intel_bo *bo)
pipe_reference_init(&fence->reference, 1);
- ilo_fence_init(&fence->fence, &is->dev);
- ilo_fence_set_seq_bo(&fence->fence, bo);
+ fence->seqno_bo = intel_bo_ref(bo);
return fence;
}
@@ -696,7 +697,7 @@ ilo_screen_destroy(struct pipe_screen *screen)
{
struct ilo_screen *is = ilo_screen(screen);
- ilo_dev_cleanup(&is->dev);
+ intel_winsys_destroy(is->dev.winsys);
FREE(is);
}
diff --git a/src/gallium/drivers/ilo/ilo_shader.c b/src/gallium/drivers/ilo/ilo_shader.c
index 799db2cbfcb..5f2b01017e2 100644
--- a/src/gallium/drivers/ilo/ilo_shader.c
+++ b/src/gallium/drivers/ilo/ilo_shader.c
@@ -27,7 +27,6 @@
#include "genhw/genhw.h" /* for SBE setup */
#include "core/ilo_builder.h"
-#include "core/ilo_state_3d.h"
#include "core/intel_winsys.h"
#include "shader/ilo_shader_internal.h"
#include "tgsi/tgsi_parse.h"
@@ -557,39 +556,255 @@ ilo_shader_state_search_variant(struct ilo_shader_state *state,
}
static void
-copy_so_info(struct ilo_shader *sh,
- const struct pipe_stream_output_info *so_info)
+init_shader_urb(const struct ilo_shader *kernel,
+ const struct ilo_shader_state *state,
+ struct ilo_state_shader_urb_info *urb)
{
- unsigned i, attr;
+ urb->cv_input_attr_count = kernel->in.count;
+ urb->read_base = 0;
+ urb->read_count = kernel->in.count;
- if (!so_info->num_outputs)
+ urb->output_attr_count = kernel->out.count;
+ urb->user_cull_enables = 0x0;
+ urb->user_clip_enables = 0x0;
+}
+
+static void
+init_shader_kernel(const struct ilo_shader *kernel,
+ const struct ilo_shader_state *state,
+ struct ilo_state_shader_kernel_info *kern)
+{
+ kern->offset = 0;
+ kern->grf_start = kernel->in.start_grf;
+ kern->pcb_attr_count =
+ (kernel->pcb.cbuf0_size + kernel->pcb.clip_state_size + 15) / 16;
+ kern->scratch_size = 0;
+}
+
+static void
+init_shader_resource(const struct ilo_shader *kernel,
+ const struct ilo_shader_state *state,
+ struct ilo_state_shader_resource_info *resource)
+{
+ resource->sampler_count = state->info.num_samplers;
+ resource->surface_count = 0;
+ resource->has_uav = false;
+}
+
+static void
+init_vs(struct ilo_shader *kernel,
+ const struct ilo_shader_state *state)
+{
+ struct ilo_state_vs_info info;
+
+ memset(&info, 0, sizeof(info));
+
+ init_shader_urb(kernel, state, &info.urb);
+ init_shader_kernel(kernel, state, &info.kernel);
+ init_shader_resource(kernel, state, &info.resource);
+ info.dispatch_enable = true;
+ info.stats_enable = true;
+
+ if (ilo_dev_gen(state->info.dev) == ILO_GEN(6) && kernel->stream_output) {
+ struct ilo_state_gs_info gs_info;
+
+ memset(&gs_info, 0, sizeof(gs_info));
+
+ gs_info.urb.cv_input_attr_count = kernel->out.count;
+ gs_info.urb.read_count = kernel->out.count;
+ gs_info.kernel.grf_start = kernel->gs_start_grf;
+ gs_info.sol.sol_enable = true;
+ gs_info.sol.stats_enable = true;
+ gs_info.sol.render_disable = kernel->variant.u.vs.rasterizer_discard;
+ gs_info.sol.svbi_post_inc = kernel->svbi_post_inc;
+ gs_info.sol.tristrip_reorder = GEN7_REORDER_LEADING;
+ gs_info.dispatch_enable = true;
+ gs_info.stats_enable = true;
+
+ ilo_state_vs_init(&kernel->cso.vs_sol.vs, state->info.dev, &info);
+ ilo_state_gs_init(&kernel->cso.vs_sol.sol, state->info.dev, &gs_info);
+ } else {
+ ilo_state_vs_init(&kernel->cso.vs, state->info.dev, &info);
+ }
+}
+
+static void
+init_gs(struct ilo_shader *kernel,
+ const struct ilo_shader_state *state)
+{
+ const struct pipe_stream_output_info *so_info = &state->info.stream_output;
+ struct ilo_state_gs_info info;
+
+ memset(&info, 0, sizeof(info));
+
+ init_shader_urb(kernel, state, &info.urb);
+ init_shader_kernel(kernel, state, &info.kernel);
+ init_shader_resource(kernel, state, &info.resource);
+ info.dispatch_enable = true;
+ info.stats_enable = true;
+
+ if (so_info->num_outputs > 0) {
+ info.sol.sol_enable = true;
+ info.sol.stats_enable = true;
+ info.sol.render_disable = kernel->variant.u.gs.rasterizer_discard;
+ info.sol.tristrip_reorder = GEN7_REORDER_LEADING;
+ }
+
+ ilo_state_gs_init(&kernel->cso.gs, state->info.dev, &info);
+}
+
+static void
+init_ps(struct ilo_shader *kernel,
+ const struct ilo_shader_state *state)
+{
+ struct ilo_state_ps_info info;
+
+ memset(&info, 0, sizeof(info));
+
+ init_shader_kernel(kernel, state, &info.kernel_8);
+ init_shader_resource(kernel, state, &info.resource);
+
+ info.io.has_rt_write = true;
+ info.io.posoffset = GEN6_POSOFFSET_NONE;
+ info.io.attr_count = kernel->in.count;
+ info.io.use_z = kernel->in.has_pos;
+ info.io.use_w = kernel->in.has_pos;
+ info.io.use_coverage_mask = false;
+ info.io.pscdepth = (kernel->out.has_pos) ?
+ GEN7_PSCDEPTH_ON : GEN7_PSCDEPTH_OFF;
+ info.io.write_pixel_mask = kernel->has_kill;
+ info.io.write_omask = false;
+
+ info.params.sample_mask = 0x1;
+ info.params.earlyz_control_psexec = false;
+ info.params.alpha_may_kill = false;
+ info.params.dual_source_blending = false;
+ info.params.has_writeable_rt = true;
+
+ info.valid_kernels = GEN6_PS_DISPATCH_8;
+
+ /*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 284:
+ *
+ * "(MSDISPMODE_PERSAMPLE) This is the high-quality multisample mode
+ * where (over and above PERPIXEL mode) the PS is run for each covered
+ * sample. This mode is also used for "normal" non-multisample
+ * rendering (aka 1X), given Number of Multisamples is programmed to
+ * NUMSAMPLES_1."
+ */
+ info.per_sample_dispatch = true;
+
+ info.rt_clear_enable = false;
+ info.rt_resolve_enable = false;
+ info.cv_per_sample_interp = false;
+ info.cv_has_earlyz_op = false;
+ info.sample_count_one = true;
+ info.cv_has_depth_buffer = true;
+
+ ilo_state_ps_init(&kernel->cso.ps, state->info.dev, &info);
+
+ /* remember current parameters */
+ kernel->ps_params = info.params;
+}
+
+static void
+init_sol(struct ilo_shader *kernel,
+ const struct ilo_dev *dev,
+ const struct pipe_stream_output_info *so_info,
+ bool rasterizer_discard)
+{
+ struct ilo_state_sol_decl_info decls[4][PIPE_MAX_SO_OUTPUTS];
+ unsigned buf_offsets[PIPE_MAX_SO_BUFFERS];
+ struct ilo_state_sol_info info;
+ unsigned i;
+
+ if (!so_info->num_outputs) {
+ ilo_state_sol_init_disabled(&kernel->sol, dev, rasterizer_discard);
return;
+ }
+
+ memset(&info, 0, sizeof(info));
+ info.data = kernel->sol_data;
+ info.data_size = sizeof(kernel->sol_data);
+ info.sol_enable = true;
+ info.stats_enable = true;
+ info.tristrip_reorder = GEN7_REORDER_TRAILING;
+ info.render_disable = rasterizer_discard;
+ info.render_stream = 0;
+
+ for (i = 0; i < 4; i++) {
+ info.buffer_strides[i] = so_info->stride[i] * 4;
- sh->so_info = *so_info;
+ info.streams[i].cv_vue_attr_count = kernel->out.count;
+ info.streams[i].decls = decls[i];
+ }
+ memset(decls, 0, sizeof(decls));
+ memset(buf_offsets, 0, sizeof(buf_offsets));
for (i = 0; i < so_info->num_outputs; i++) {
+ const unsigned stream = so_info->output[i].stream;
+ const unsigned buffer = so_info->output[i].output_buffer;
+ struct ilo_state_sol_decl_info *decl;
+ unsigned attr;
+
/* figure out which attribute is sourced */
- for (attr = 0; attr < sh->out.count; attr++) {
- const int reg_idx = sh->out.register_indices[attr];
+ for (attr = 0; attr < kernel->out.count; attr++) {
+ const int reg_idx = kernel->out.register_indices[attr];
if (reg_idx == so_info->output[i].register_index)
break;
}
-
- if (attr < sh->out.count) {
- sh->so_info.output[i].register_index = attr;
- }
- else {
+ if (attr >= kernel->out.count) {
assert(!"stream output an undefined register");
- sh->so_info.output[i].register_index = 0;
+ attr = 0;
}
+ if (info.streams[stream].vue_read_count < attr + 1)
+ info.streams[stream].vue_read_count = attr + 1;
+
+ /* pad with holes first */
+ while (buf_offsets[buffer] < so_info->output[i].dst_offset) {
+ int num_dwords;
+
+ num_dwords = so_info->output[i].dst_offset - buf_offsets[buffer];
+ if (num_dwords > 4)
+ num_dwords = 4;
+
+ assert(info.streams[stream].decl_count < ARRAY_SIZE(decls[stream]));
+ decl = &decls[stream][info.streams[stream].decl_count];
+
+ decl->attr = 0;
+ decl->is_hole = true;
+ decl->component_base = 0;
+ decl->component_count = num_dwords;
+ decl->buffer = buffer;
+
+ info.streams[stream].decl_count++;
+ buf_offsets[buffer] += num_dwords;
+ }
+ assert(buf_offsets[buffer] == so_info->output[i].dst_offset);
+
+ assert(info.streams[stream].decl_count < ARRAY_SIZE(decls[stream]));
+ decl = &decls[stream][info.streams[stream].decl_count];
+
+ decl->attr = attr;
+ decl->is_hole = false;
/* PSIZE is at W channel */
- if (sh->out.semantic_names[attr] == TGSI_SEMANTIC_PSIZE) {
+ if (kernel->out.semantic_names[attr] == TGSI_SEMANTIC_PSIZE) {
assert(so_info->output[i].start_component == 0);
assert(so_info->output[i].num_components == 1);
- sh->so_info.output[i].start_component = 3;
+ decl->component_base = 3;
+ decl->component_count = 1;
+ } else {
+ decl->component_base = so_info->output[i].start_component;
+ decl->component_count = so_info->output[i].num_components;
}
+ decl->buffer = buffer;
+
+ info.streams[stream].decl_count++;
+ buf_offsets[buffer] += so_info->output[i].num_components;
}
+
+ ilo_state_sol_init(&kernel->sol, dev, &info);
}
/**
@@ -599,17 +814,20 @@ static struct ilo_shader *
ilo_shader_state_add_variant(struct ilo_shader_state *state,
const struct ilo_shader_variant *variant)
{
+ bool rasterizer_discard = false;
struct ilo_shader *sh;
switch (state->info.type) {
case PIPE_SHADER_VERTEX:
sh = ilo_shader_compile_vs(state, variant);
+ rasterizer_discard = variant->u.vs.rasterizer_discard;
break;
case PIPE_SHADER_FRAGMENT:
sh = ilo_shader_compile_fs(state, variant);
break;
case PIPE_SHADER_GEOMETRY:
sh = ilo_shader_compile_gs(state, variant);
+ rasterizer_discard = variant->u.gs.rasterizer_discard;
break;
case PIPE_SHADER_COMPUTE:
sh = ilo_shader_compile_cs(state, variant);
@@ -625,7 +843,8 @@ ilo_shader_state_add_variant(struct ilo_shader_state *state,
sh->variant = *variant;
- copy_so_info(sh, &state->info.stream_output);
+ init_sol(sh, state->info.dev, &state->info.stream_output,
+ rasterizer_discard);
ilo_shader_state_add_shader(state, sh);
@@ -665,13 +884,13 @@ ilo_shader_state_use_variant(struct ilo_shader_state *state,
if (construct_cso) {
switch (state->info.type) {
case PIPE_SHADER_VERTEX:
- ilo_gpe_init_vs_cso(state->info.dev, state, &sh->cso);
+ init_vs(sh, state);
break;
case PIPE_SHADER_GEOMETRY:
- ilo_gpe_init_gs_cso(state->info.dev, state, &sh->cso);
+ init_gs(sh, state);
break;
case PIPE_SHADER_FRAGMENT:
- ilo_gpe_init_fs_cso(state->info.dev, state, &sh->cso);
+ init_ps(sh, state);
break;
default:
break;
@@ -789,16 +1008,33 @@ ilo_shader_select_kernel(struct ilo_shader_state *shader,
const struct ilo_state_vector *vec,
uint32_t dirty)
{
- const struct ilo_shader * const cur = shader->shader;
struct ilo_shader_variant variant;
+ bool changed = false;
- if (!(shader->info.non_orthogonal_states & dirty))
- return false;
+ if (shader->info.non_orthogonal_states & dirty) {
+ const struct ilo_shader * const old = shader->shader;
+
+ ilo_shader_variant_init(&variant, &shader->info, vec);
+ ilo_shader_state_use_variant(shader, &variant);
+ changed = (shader->shader != old);
+ }
- ilo_shader_variant_init(&variant, &shader->info, vec);
- ilo_shader_state_use_variant(shader, &variant);
+ if (shader->info.type == PIPE_SHADER_FRAGMENT) {
+ struct ilo_shader *kernel = shader->shader;
- return (shader->shader != cur);
+ if (kernel->ps_params.sample_mask != vec->sample_mask ||
+ kernel->ps_params.alpha_may_kill != vec->blend->alpha_may_kill) {
+ kernel->ps_params.sample_mask = vec->sample_mask;
+ kernel->ps_params.alpha_may_kill = vec->blend->alpha_may_kill;
+
+ ilo_state_ps_set_params(&kernel->cso.ps, shader->info.dev,
+ &kernel->ps_params);
+
+ changed = true;
+ }
+ }
+
+ return changed;
}
static int
@@ -829,82 +1065,104 @@ route_attr(const int *semantics, const int *indices, int len,
* \return true if a different routing is selected
*/
bool
-ilo_shader_select_kernel_routing(struct ilo_shader_state *shader,
- const struct ilo_shader_state *source,
- const struct ilo_rasterizer_state *rasterizer)
+ilo_shader_select_kernel_sbe(struct ilo_shader_state *shader,
+ const struct ilo_shader_state *source,
+ const struct ilo_rasterizer_state *rasterizer)
{
- const uint32_t sprite_coord_enable = rasterizer->state.sprite_coord_enable;
+ const bool is_point = true;
const bool light_twoside = rasterizer->state.light_twoside;
+ const uint32_t sprite_coord_enable = rasterizer->state.sprite_coord_enable;
+ const int sprite_coord_mode = rasterizer->state.sprite_coord_mode;
struct ilo_shader *kernel = shader->shader;
struct ilo_kernel_routing *routing = &kernel->routing;
+ struct ilo_state_sbe_swizzle_info swizzles[ILO_STATE_SBE_MAX_SWIZZLE_COUNT];
+ struct ilo_state_sbe_info info;
const int *src_semantics, *src_indices;
- int src_len, max_src_slot;
+ int src_skip, src_len, src_slot;
int dst_len, dst_slot;
- /* we are constructing 3DSTATE_SBE here */
- ILO_DEV_ASSERT(shader->info.dev, 6, 8);
-
assert(kernel);
if (source) {
assert(source->shader);
+
src_semantics = source->shader->out.semantic_names;
src_indices = source->shader->out.semantic_indices;
src_len = source->shader->out.count;
- }
- else {
+ src_skip = 0;
+
+ assert(src_len >= 2 &&
+ src_semantics[0] == TGSI_SEMANTIC_PSIZE &&
+ src_semantics[1] == TGSI_SEMANTIC_POSITION);
+
+ /*
+ * skip PSIZE and POSITION (how about the optional CLIPDISTs?), unless
+ * they are all the source shader has and FS needs to read some
+ * attributes.
+ */
+ if (src_len > 2 || !kernel->in.count) {
+ src_semantics += 2;
+ src_indices += 2;
+ src_len -= 2;
+ src_skip = 2;
+ }
+ } else {
src_semantics = kernel->in.semantic_names;
src_indices = kernel->in.semantic_indices;
src_len = kernel->in.count;
+ src_skip = 0;
}
/* no change */
- if (kernel->routing_initialized &&
- routing->source_skip + routing->source_len <= src_len &&
- kernel->routing_sprite_coord_enable == sprite_coord_enable &&
- !memcmp(kernel->routing_src_semantics,
- &src_semantics[routing->source_skip],
- sizeof(kernel->routing_src_semantics[0]) * routing->source_len) &&
- !memcmp(kernel->routing_src_indices,
- &src_indices[routing->source_skip],
- sizeof(kernel->routing_src_indices[0]) * routing->source_len))
+ if (routing->initialized &&
+ routing->is_point == is_point &&
+ routing->light_twoside == light_twoside &&
+ routing->sprite_coord_enable == sprite_coord_enable &&
+ routing->sprite_coord_mode == sprite_coord_mode &&
+ routing->src_len <= src_len &&
+ !memcmp(routing->src_semantics, src_semantics,
+ sizeof(src_semantics[0]) * routing->src_len) &&
+ !memcmp(routing->src_indices, src_indices,
+ sizeof(src_indices[0]) * routing->src_len))
return false;
- if (source) {
- /* skip PSIZE and POSITION (how about the optional CLIPDISTs?) */
- assert(src_semantics[0] == TGSI_SEMANTIC_PSIZE);
- assert(src_semantics[1] == TGSI_SEMANTIC_POSITION);
- routing->source_skip = 2;
-
- routing->source_len = src_len - routing->source_skip;
- src_semantics += routing->source_skip;
- src_indices += routing->source_skip;
- }
- else {
- routing->source_skip = 0;
- routing->source_len = src_len;
- }
-
- routing->const_interp_enable = kernel->in.const_interp_enable;
- routing->point_sprite_enable = 0;
- routing->swizzle_enable = false;
-
- assert(kernel->in.count <= Elements(routing->swizzles));
- dst_len = MIN2(kernel->in.count, Elements(routing->swizzles));
- max_src_slot = -1;
+ routing->is_point = is_point;
+ routing->light_twoside = light_twoside;
+ routing->sprite_coord_enable = sprite_coord_enable;
+ routing->sprite_coord_mode = sprite_coord_mode;
+
+ assert(kernel->in.count <= Elements(swizzles));
+ dst_len = MIN2(kernel->in.count, Elements(swizzles));
+
+ memset(&swizzles, 0, sizeof(swizzles));
+ memset(&info, 0, sizeof(info));
+
+ info.attr_count = dst_len;
+ info.cv_vue_attr_count = src_skip + src_len;
+ info.vue_read_base = src_skip;
+ info.vue_read_count = 0;
+ info.has_min_read_count = true;
+ info.swizzle_enable = false;
+ info.swizzle_16_31 = false;
+ info.swizzle_count = 0;
+ info.swizzles = swizzles;
+ info.const_interp_enables = kernel->in.const_interp_enable;
+ info.point_sprite_enables = 0x0;
+ info.point_sprite_origin_lower_left =
+ (sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT);
+ info.cv_is_point = is_point;
for (dst_slot = 0; dst_slot < dst_len; dst_slot++) {
const int semantic = kernel->in.semantic_names[dst_slot];
const int index = kernel->in.semantic_indices[dst_slot];
- int src_slot;
if (semantic == TGSI_SEMANTIC_GENERIC &&
(sprite_coord_enable & (1 << index)))
- routing->point_sprite_enable |= 1 << dst_slot;
+ info.point_sprite_enables |= 1 << dst_slot;
if (source) {
- src_slot = route_attr(src_semantics, src_indices,
- routing->source_len, semantic, index);
+ src_slot = route_attr(src_semantics, src_indices, src_len,
+ semantic, index);
/*
* The source shader stage does not output this attribute. The value
@@ -918,58 +1176,47 @@ ilo_shader_select_kernel_routing(struct ilo_shader_state *shader,
*/
if (src_slot < 0)
src_slot = 0;
- }
- else {
+ } else {
src_slot = dst_slot;
}
- routing->swizzles[dst_slot] = src_slot;
-
/* use the following slot for two-sided lighting */
if (semantic == TGSI_SEMANTIC_COLOR && light_twoside &&
- src_slot + 1 < routing->source_len &&
+ src_slot + 1 < src_len &&
src_semantics[src_slot + 1] == TGSI_SEMANTIC_BCOLOR &&
src_indices[src_slot + 1] == index) {
- routing->swizzles[dst_slot] |= GEN8_SBE_SWIZ_INPUTATTR_FACING;
+ swizzles[dst_slot].attr_select = GEN6_INPUTATTR_FACING;
+ swizzles[dst_slot].attr = src_slot;
+ info.swizzle_enable = true;
src_slot++;
+ } else {
+ swizzles[dst_slot].attr_select = GEN6_INPUTATTR_NORMAL;
+ swizzles[dst_slot].attr = src_slot;
+ if (src_slot != dst_slot)
+ info.swizzle_enable = true;
}
- if (routing->swizzles[dst_slot] != dst_slot)
- routing->swizzle_enable = true;
+ swizzles[dst_slot].force_zeros = false;
- if (max_src_slot < src_slot)
- max_src_slot = src_slot;
+ if (info.vue_read_count < src_slot + 1)
+ info.vue_read_count = src_slot + 1;
}
- memset(&routing->swizzles[dst_slot], 0, sizeof(routing->swizzles) -
- sizeof(routing->swizzles[0]) * dst_slot);
+ if (info.swizzle_enable)
+ info.swizzle_count = dst_len;
- /*
- * From the Sandy Bridge PRM, volume 2 part 1, page 248:
- *
- * "It is UNDEFINED to set this field (Vertex URB Entry Read Length) to
- * 0 indicating no Vertex URB data to be read.
- *
- * This field should be set to the minimum length required to read the
- * maximum source attribute. The maximum source attribute is indicated
- * by the maximum value of the enabled Attribute # Source Attribute if
- * Attribute Swizzle Enable is set, Number of Output Attributes-1 if
- * enable is not set.
- *
- * read_length = ceiling((max_source_attr+1)/2)
- *
- * [errata] Corruption/Hang possible if length programmed larger than
- * recommended"
- */
- routing->source_len = max_src_slot + 1;
+ if (routing->initialized)
+ ilo_state_sbe_set_info(&routing->sbe, shader->info.dev, &info);
+ else
+ ilo_state_sbe_init(&routing->sbe, shader->info.dev, &info);
+
+ routing->src_len = info.vue_read_count;
+ memcpy(routing->src_semantics, src_semantics,
+ sizeof(src_semantics[0]) * routing->src_len);
+ memcpy(routing->src_indices, src_indices,
+ sizeof(src_indices[0]) * routing->src_len);
- /* remember the states of the source */
- kernel->routing_initialized = true;
- kernel->routing_sprite_coord_enable = sprite_coord_enable;
- memcpy(kernel->routing_src_semantics, src_semantics,
- sizeof(kernel->routing_src_semantics[0]) * routing->source_len);
- memcpy(kernel->routing_src_indices, src_indices,
- sizeof(kernel->routing_src_indices[0]) * routing->source_len);
+ routing->initialized = true;
return true;
}
@@ -1147,7 +1394,7 @@ ilo_shader_get_kernel_param(const struct ilo_shader_state *shader,
/**
* Return the CSO of the selected kernel.
*/
-const struct ilo_shader_cso *
+const union ilo_shader_cso *
ilo_shader_get_kernel_cso(const struct ilo_shader_state *shader)
{
const struct ilo_shader *kernel = shader->shader;
@@ -1163,22 +1410,28 @@ ilo_shader_get_kernel_cso(const struct ilo_shader_state *shader)
const struct pipe_stream_output_info *
ilo_shader_get_kernel_so_info(const struct ilo_shader_state *shader)
{
+ return &shader->info.stream_output;
+}
+
+const struct ilo_state_sol *
+ilo_shader_get_kernel_sol(const struct ilo_shader_state *shader)
+{
const struct ilo_shader *kernel = shader->shader;
assert(kernel);
- return &kernel->so_info;
+ return &kernel->sol;
}
/**
* Return the routing info of the selected kernel.
*/
-const struct ilo_kernel_routing *
-ilo_shader_get_kernel_routing(const struct ilo_shader_state *shader)
+const struct ilo_state_sbe *
+ilo_shader_get_kernel_sbe(const struct ilo_shader_state *shader)
{
const struct ilo_shader *kernel = shader->shader;
assert(kernel);
- return &kernel->routing;
+ return &kernel->routing.sbe;
}
diff --git a/src/gallium/drivers/ilo/ilo_shader.h b/src/gallium/drivers/ilo/ilo_shader.h
index 8a359001bb8..d9f02a4746a 100644
--- a/src/gallium/drivers/ilo/ilo_shader.h
+++ b/src/gallium/drivers/ilo/ilo_shader.h
@@ -28,6 +28,8 @@
#ifndef ILO_SHADER_H
#define ILO_SHADER_H
+#include "core/ilo_state_shader.h"
+
#include "ilo_common.h"
enum ilo_kernel_param {
@@ -81,23 +83,28 @@ enum ilo_kernel_param {
ILO_KERNEL_PARAM_COUNT,
};
-struct ilo_kernel_routing {
- uint32_t const_interp_enable;
- uint32_t point_sprite_enable;
- unsigned source_skip, source_len;
-
- bool swizzle_enable;
- uint16_t swizzles[16];
-};
-
struct intel_bo;
struct ilo_builder;
struct ilo_rasterizer_state;
struct ilo_shader_cache;
struct ilo_shader_state;
-struct ilo_shader_cso;
+struct ilo_state_sbe;
+struct ilo_state_sol;
struct ilo_state_vector;
+union ilo_shader_cso {
+ struct ilo_state_vs vs;
+ struct ilo_state_hs hs;
+ struct ilo_state_ds ds;
+ struct ilo_state_gs gs;
+ struct ilo_state_ps ps;
+
+ struct {
+ struct ilo_state_vs vs;
+ struct ilo_state_gs sol;
+ } vs_sol;
+};
+
struct ilo_shader_cache *
ilo_shader_cache_create(void);
@@ -151,9 +158,9 @@ ilo_shader_select_kernel(struct ilo_shader_state *shader,
uint32_t dirty);
bool
-ilo_shader_select_kernel_routing(struct ilo_shader_state *shader,
- const struct ilo_shader_state *source,
- const struct ilo_rasterizer_state *rasterizer);
+ilo_shader_select_kernel_sbe(struct ilo_shader_state *shader,
+ const struct ilo_shader_state *source,
+ const struct ilo_rasterizer_state *rasterizer);
uint32_t
ilo_shader_get_kernel_offset(const struct ilo_shader_state *shader);
@@ -162,13 +169,16 @@ int
ilo_shader_get_kernel_param(const struct ilo_shader_state *shader,
enum ilo_kernel_param param);
-const struct ilo_shader_cso *
+const union ilo_shader_cso *
ilo_shader_get_kernel_cso(const struct ilo_shader_state *shader);
const struct pipe_stream_output_info *
ilo_shader_get_kernel_so_info(const struct ilo_shader_state *shader);
-const struct ilo_kernel_routing *
-ilo_shader_get_kernel_routing(const struct ilo_shader_state *shader);
+const struct ilo_state_sol *
+ilo_shader_get_kernel_sol(const struct ilo_shader_state *shader);
+
+const struct ilo_state_sbe *
+ilo_shader_get_kernel_sbe(const struct ilo_shader_state *shader);
#endif /* ILO_SHADER_H */
diff --git a/src/gallium/drivers/ilo/ilo_state.c b/src/gallium/drivers/ilo/ilo_state.c
index b1bd49a0b6c..63534f33fa7 100644
--- a/src/gallium/drivers/ilo/ilo_state.c
+++ b/src/gallium/drivers/ilo/ilo_state.c
@@ -25,16 +25,288 @@
* Chia-I Wu <[email protected]>
*/
-#include "core/ilo_state_3d.h"
+#include "util/u_dual_blend.h"
#include "util/u_dynarray.h"
+#include "util/u_framebuffer.h"
#include "util/u_helpers.h"
+#include "util/u_resource.h"
#include "util/u_upload_mgr.h"
#include "ilo_context.h"
+#include "ilo_format.h"
#include "ilo_resource.h"
#include "ilo_shader.h"
#include "ilo_state.h"
+/**
+ * Translate a pipe primitive type to the matching hardware primitive type.
+ */
+static enum gen_3dprim_type
+ilo_translate_draw_mode(unsigned mode)
+{
+ static const enum gen_3dprim_type prim_mapping[PIPE_PRIM_MAX] = {
+ [PIPE_PRIM_POINTS] = GEN6_3DPRIM_POINTLIST,
+ [PIPE_PRIM_LINES] = GEN6_3DPRIM_LINELIST,
+ [PIPE_PRIM_LINE_LOOP] = GEN6_3DPRIM_LINELOOP,
+ [PIPE_PRIM_LINE_STRIP] = GEN6_3DPRIM_LINESTRIP,
+ [PIPE_PRIM_TRIANGLES] = GEN6_3DPRIM_TRILIST,
+ [PIPE_PRIM_TRIANGLE_STRIP] = GEN6_3DPRIM_TRISTRIP,
+ [PIPE_PRIM_TRIANGLE_FAN] = GEN6_3DPRIM_TRIFAN,
+ [PIPE_PRIM_QUADS] = GEN6_3DPRIM_QUADLIST,
+ [PIPE_PRIM_QUAD_STRIP] = GEN6_3DPRIM_QUADSTRIP,
+ [PIPE_PRIM_POLYGON] = GEN6_3DPRIM_POLYGON,
+ [PIPE_PRIM_LINES_ADJACENCY] = GEN6_3DPRIM_LINELIST_ADJ,
+ [PIPE_PRIM_LINE_STRIP_ADJACENCY] = GEN6_3DPRIM_LINESTRIP_ADJ,
+ [PIPE_PRIM_TRIANGLES_ADJACENCY] = GEN6_3DPRIM_TRILIST_ADJ,
+ [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = GEN6_3DPRIM_TRISTRIP_ADJ,
+ };
+
+ assert(prim_mapping[mode]);
+
+ return prim_mapping[mode];
+}
+
+static enum gen_index_format
+ilo_translate_index_size(unsigned index_size)
+{
+ switch (index_size) {
+ case 1: return GEN6_INDEX_BYTE;
+ case 2: return GEN6_INDEX_WORD;
+ case 4: return GEN6_INDEX_DWORD;
+ default:
+ assert(!"unknown index size");
+ return GEN6_INDEX_BYTE;
+ }
+}
+
+static enum gen_mip_filter
+ilo_translate_mip_filter(unsigned filter)
+{
+ switch (filter) {
+ case PIPE_TEX_MIPFILTER_NEAREST: return GEN6_MIPFILTER_NEAREST;
+ case PIPE_TEX_MIPFILTER_LINEAR: return GEN6_MIPFILTER_LINEAR;
+ case PIPE_TEX_MIPFILTER_NONE: return GEN6_MIPFILTER_NONE;
+ default:
+ assert(!"unknown mipfilter");
+ return GEN6_MIPFILTER_NONE;
+ }
+}
+
+static int
+ilo_translate_img_filter(unsigned filter)
+{
+ switch (filter) {
+ case PIPE_TEX_FILTER_NEAREST: return GEN6_MAPFILTER_NEAREST;
+ case PIPE_TEX_FILTER_LINEAR: return GEN6_MAPFILTER_LINEAR;
+ default:
+ assert(!"unknown sampler filter");
+ return GEN6_MAPFILTER_NEAREST;
+ }
+}
+
+static enum gen_texcoord_mode
+ilo_translate_address_wrap(unsigned wrap)
+{
+ switch (wrap) {
+ case PIPE_TEX_WRAP_CLAMP: return GEN8_TEXCOORDMODE_HALF_BORDER;
+ case PIPE_TEX_WRAP_REPEAT: return GEN6_TEXCOORDMODE_WRAP;
+ case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return GEN6_TEXCOORDMODE_CLAMP;
+ case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return GEN6_TEXCOORDMODE_CLAMP_BORDER;
+ case PIPE_TEX_WRAP_MIRROR_REPEAT: return GEN6_TEXCOORDMODE_MIRROR;
+ case PIPE_TEX_WRAP_MIRROR_CLAMP:
+ case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+ case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+ default:
+ assert(!"unknown sampler wrap mode");
+ return GEN6_TEXCOORDMODE_WRAP;
+ }
+}
+
+static enum gen_aniso_ratio
+ilo_translate_max_anisotropy(unsigned max_anisotropy)
+{
+ switch (max_anisotropy) {
+ case 0: case 1: case 2: return GEN6_ANISORATIO_2;
+ case 3: case 4: return GEN6_ANISORATIO_4;
+ case 5: case 6: return GEN6_ANISORATIO_6;
+ case 7: case 8: return GEN6_ANISORATIO_8;
+ case 9: case 10: return GEN6_ANISORATIO_10;
+ case 11: case 12: return GEN6_ANISORATIO_12;
+ case 13: case 14: return GEN6_ANISORATIO_14;
+ default: return GEN6_ANISORATIO_16;
+ }
+}
+
+static enum gen_prefilter_op
+ilo_translate_shadow_func(unsigned func)
+{
+ /*
+ * For PIPE_FUNC_x, the reference value is on the left-hand side of the
+ * comparison, and 1.0 is returned when the comparison is true.
+ *
+ * For GEN6_PREFILTEROP_x, the reference value is on the right-hand side of
+ * the comparison, and 0.0 is returned when the comparison is true.
+ */
+ switch (func) {
+ case PIPE_FUNC_NEVER: return GEN6_PREFILTEROP_ALWAYS;
+ case PIPE_FUNC_LESS: return GEN6_PREFILTEROP_LEQUAL;
+ case PIPE_FUNC_EQUAL: return GEN6_PREFILTEROP_NOTEQUAL;
+ case PIPE_FUNC_LEQUAL: return GEN6_PREFILTEROP_LESS;
+ case PIPE_FUNC_GREATER: return GEN6_PREFILTEROP_GEQUAL;
+ case PIPE_FUNC_NOTEQUAL: return GEN6_PREFILTEROP_EQUAL;
+ case PIPE_FUNC_GEQUAL: return GEN6_PREFILTEROP_GREATER;
+ case PIPE_FUNC_ALWAYS: return GEN6_PREFILTEROP_NEVER;
+ default:
+ assert(!"unknown shadow compare function");
+ return GEN6_PREFILTEROP_NEVER;
+ }
+}
+
+static enum gen_front_winding
+ilo_translate_front_ccw(unsigned front_ccw)
+{
+ return (front_ccw) ? GEN6_FRONTWINDING_CCW : GEN6_FRONTWINDING_CW;
+}
+
+static enum gen_cull_mode
+ilo_translate_cull_face(unsigned cull_face)
+{
+ switch (cull_face) {
+ case PIPE_FACE_NONE: return GEN6_CULLMODE_NONE;
+ case PIPE_FACE_FRONT: return GEN6_CULLMODE_FRONT;
+ case PIPE_FACE_BACK: return GEN6_CULLMODE_BACK;
+ case PIPE_FACE_FRONT_AND_BACK: return GEN6_CULLMODE_BOTH;
+ default:
+ assert(!"unknown face culling");
+ return GEN6_CULLMODE_NONE;
+ }
+}
+
+static enum gen_fill_mode
+ilo_translate_poly_mode(unsigned poly_mode)
+{
+ switch (poly_mode) {
+ case PIPE_POLYGON_MODE_FILL: return GEN6_FILLMODE_SOLID;
+ case PIPE_POLYGON_MODE_LINE: return GEN6_FILLMODE_WIREFRAME;
+ case PIPE_POLYGON_MODE_POINT: return GEN6_FILLMODE_POINT;
+ default:
+ assert(!"unknown polygon mode");
+ return GEN6_FILLMODE_SOLID;
+ }
+}
+
+static enum gen_pixel_location
+ilo_translate_half_pixel_center(bool half_pixel_center)
+{
+ return (half_pixel_center) ? GEN6_PIXLOC_CENTER : GEN6_PIXLOC_UL_CORNER;
+}
+
+static enum gen_compare_function
+ilo_translate_compare_func(unsigned func)
+{
+ switch (func) {
+ case PIPE_FUNC_NEVER: return GEN6_COMPAREFUNCTION_NEVER;
+ case PIPE_FUNC_LESS: return GEN6_COMPAREFUNCTION_LESS;
+ case PIPE_FUNC_EQUAL: return GEN6_COMPAREFUNCTION_EQUAL;
+ case PIPE_FUNC_LEQUAL: return GEN6_COMPAREFUNCTION_LEQUAL;
+ case PIPE_FUNC_GREATER: return GEN6_COMPAREFUNCTION_GREATER;
+ case PIPE_FUNC_NOTEQUAL: return GEN6_COMPAREFUNCTION_NOTEQUAL;
+ case PIPE_FUNC_GEQUAL: return GEN6_COMPAREFUNCTION_GEQUAL;
+ case PIPE_FUNC_ALWAYS: return GEN6_COMPAREFUNCTION_ALWAYS;
+ default:
+ assert(!"unknown compare function");
+ return GEN6_COMPAREFUNCTION_NEVER;
+ }
+}
+
+static enum gen_stencil_op
+ilo_translate_stencil_op(unsigned stencil_op)
+{
+ switch (stencil_op) {
+ case PIPE_STENCIL_OP_KEEP: return GEN6_STENCILOP_KEEP;
+ case PIPE_STENCIL_OP_ZERO: return GEN6_STENCILOP_ZERO;
+ case PIPE_STENCIL_OP_REPLACE: return GEN6_STENCILOP_REPLACE;
+ case PIPE_STENCIL_OP_INCR: return GEN6_STENCILOP_INCRSAT;
+ case PIPE_STENCIL_OP_DECR: return GEN6_STENCILOP_DECRSAT;
+ case PIPE_STENCIL_OP_INCR_WRAP: return GEN6_STENCILOP_INCR;
+ case PIPE_STENCIL_OP_DECR_WRAP: return GEN6_STENCILOP_DECR;
+ case PIPE_STENCIL_OP_INVERT: return GEN6_STENCILOP_INVERT;
+ default:
+ assert(!"unknown stencil op");
+ return GEN6_STENCILOP_KEEP;
+ }
+}
+
+static enum gen_logic_op
+ilo_translate_logicop(unsigned logicop)
+{
+ switch (logicop) {
+ case PIPE_LOGICOP_CLEAR: return GEN6_LOGICOP_CLEAR;
+ case PIPE_LOGICOP_NOR: return GEN6_LOGICOP_NOR;
+ case PIPE_LOGICOP_AND_INVERTED: return GEN6_LOGICOP_AND_INVERTED;
+ case PIPE_LOGICOP_COPY_INVERTED: return GEN6_LOGICOP_COPY_INVERTED;
+ case PIPE_LOGICOP_AND_REVERSE: return GEN6_LOGICOP_AND_REVERSE;
+ case PIPE_LOGICOP_INVERT: return GEN6_LOGICOP_INVERT;
+ case PIPE_LOGICOP_XOR: return GEN6_LOGICOP_XOR;
+ case PIPE_LOGICOP_NAND: return GEN6_LOGICOP_NAND;
+ case PIPE_LOGICOP_AND: return GEN6_LOGICOP_AND;
+ case PIPE_LOGICOP_EQUIV: return GEN6_LOGICOP_EQUIV;
+ case PIPE_LOGICOP_NOOP: return GEN6_LOGICOP_NOOP;
+ case PIPE_LOGICOP_OR_INVERTED: return GEN6_LOGICOP_OR_INVERTED;
+ case PIPE_LOGICOP_COPY: return GEN6_LOGICOP_COPY;
+ case PIPE_LOGICOP_OR_REVERSE: return GEN6_LOGICOP_OR_REVERSE;
+ case PIPE_LOGICOP_OR: return GEN6_LOGICOP_OR;
+ case PIPE_LOGICOP_SET: return GEN6_LOGICOP_SET;
+ default:
+ assert(!"unknown logicop function");
+ return GEN6_LOGICOP_CLEAR;
+ }
+}
+
+static int
+ilo_translate_blend_func(unsigned blend)
+{
+ switch (blend) {
+ case PIPE_BLEND_ADD: return GEN6_BLENDFUNCTION_ADD;
+ case PIPE_BLEND_SUBTRACT: return GEN6_BLENDFUNCTION_SUBTRACT;
+ case PIPE_BLEND_REVERSE_SUBTRACT: return GEN6_BLENDFUNCTION_REVERSE_SUBTRACT;
+ case PIPE_BLEND_MIN: return GEN6_BLENDFUNCTION_MIN;
+ case PIPE_BLEND_MAX: return GEN6_BLENDFUNCTION_MAX;
+ default:
+ assert(!"unknown blend function");
+ return GEN6_BLENDFUNCTION_ADD;
+ }
+}
+
+static int
+ilo_translate_blend_factor(unsigned factor)
+{
+ switch (factor) {
+ case PIPE_BLENDFACTOR_ONE: return GEN6_BLENDFACTOR_ONE;
+ case PIPE_BLENDFACTOR_SRC_COLOR: return GEN6_BLENDFACTOR_SRC_COLOR;
+ case PIPE_BLENDFACTOR_SRC_ALPHA: return GEN6_BLENDFACTOR_SRC_ALPHA;
+ case PIPE_BLENDFACTOR_DST_ALPHA: return GEN6_BLENDFACTOR_DST_ALPHA;
+ case PIPE_BLENDFACTOR_DST_COLOR: return GEN6_BLENDFACTOR_DST_COLOR;
+ case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: return GEN6_BLENDFACTOR_SRC_ALPHA_SATURATE;
+ case PIPE_BLENDFACTOR_CONST_COLOR: return GEN6_BLENDFACTOR_CONST_COLOR;
+ case PIPE_BLENDFACTOR_CONST_ALPHA: return GEN6_BLENDFACTOR_CONST_ALPHA;
+ case PIPE_BLENDFACTOR_SRC1_COLOR: return GEN6_BLENDFACTOR_SRC1_COLOR;
+ case PIPE_BLENDFACTOR_SRC1_ALPHA: return GEN6_BLENDFACTOR_SRC1_ALPHA;
+ case PIPE_BLENDFACTOR_ZERO: return GEN6_BLENDFACTOR_ZERO;
+ case PIPE_BLENDFACTOR_INV_SRC_COLOR: return GEN6_BLENDFACTOR_INV_SRC_COLOR;
+ case PIPE_BLENDFACTOR_INV_SRC_ALPHA: return GEN6_BLENDFACTOR_INV_SRC_ALPHA;
+ case PIPE_BLENDFACTOR_INV_DST_ALPHA: return GEN6_BLENDFACTOR_INV_DST_ALPHA;
+ case PIPE_BLENDFACTOR_INV_DST_COLOR: return GEN6_BLENDFACTOR_INV_DST_COLOR;
+ case PIPE_BLENDFACTOR_INV_CONST_COLOR: return GEN6_BLENDFACTOR_INV_CONST_COLOR;
+ case PIPE_BLENDFACTOR_INV_CONST_ALPHA: return GEN6_BLENDFACTOR_INV_CONST_ALPHA;
+ case PIPE_BLENDFACTOR_INV_SRC1_COLOR: return GEN6_BLENDFACTOR_INV_SRC1_COLOR;
+ case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: return GEN6_BLENDFACTOR_INV_SRC1_ALPHA;
+ default:
+ assert(!"unknown blend factor");
+ return GEN6_BLENDFACTOR_ONE;
+ }
+}
+
static void
finalize_shader_states(struct ilo_state_vector *vec)
{
@@ -78,7 +350,7 @@ finalize_shader_states(struct ilo_state_vector *vec)
/* need to setup SBE for FS */
if (type == PIPE_SHADER_FRAGMENT && vec->dirty &
(state | ILO_DIRTY_GS | ILO_DIRTY_VS | ILO_DIRTY_RASTERIZER)) {
- if (ilo_shader_select_kernel_routing(shader,
+ if (ilo_shader_select_kernel_sbe(shader,
(vec->gs) ? vec->gs : vec->vs, vec->rasterizer))
vec->dirty |= state;
}
@@ -97,7 +369,6 @@ finalize_cbuf_state(struct ilo_context *ilo,
~ilo_shader_get_kernel_param(sh, ILO_KERNEL_SKIP_CBUF0_UPLOAD);
while (upload_mask) {
- const enum pipe_format elem_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
unsigned offset, i;
i = u_bit_scan(&upload_mask);
@@ -105,14 +376,16 @@ finalize_cbuf_state(struct ilo_context *ilo,
if (cbuf->cso[i].resource)
continue;
- u_upload_data(ilo->uploader, 0, cbuf->cso[i].user_buffer_size,
+ u_upload_data(ilo->uploader, 0, cbuf->cso[i].info.size,
cbuf->cso[i].user_buffer, &offset, &cbuf->cso[i].resource);
- ilo_gpe_init_view_surface_for_buffer(ilo->dev,
- ilo_buffer(cbuf->cso[i].resource),
- offset, cbuf->cso[i].user_buffer_size,
- util_format_get_blocksize(elem_format), elem_format,
- false, false, &cbuf->cso[i].surface);
+ cbuf->cso[i].info.buf = ilo_buffer(cbuf->cso[i].resource);
+ cbuf->cso[i].info.offset = offset;
+
+ memset(&cbuf->cso[i].surface, 0, sizeof(cbuf->cso[i].surface));
+ ilo_state_surface_init_for_buffer(&cbuf->cso[i].surface,
+ ilo->dev, &cbuf->cso[i].info);
+ cbuf->cso[i].surface.bo = cbuf->cso[i].info.buf->bo;
ilo->state_vector.dirty |= ILO_DIRTY_CBUF;
}
@@ -133,114 +406,380 @@ finalize_constant_buffers(struct ilo_context *ilo)
static void
finalize_index_buffer(struct ilo_context *ilo)
{
+ const struct ilo_dev *dev = ilo->dev;
struct ilo_state_vector *vec = &ilo->state_vector;
const bool need_upload = (vec->draw->indexed &&
- (vec->ib.user_buffer || vec->ib.offset % vec->ib.index_size));
+ (vec->ib.state.user_buffer ||
+ vec->ib.state.offset % vec->ib.state.index_size));
struct pipe_resource *current_hw_res = NULL;
+ struct ilo_state_index_buffer_info info;
+ int64_t vertex_start_bias = 0;
if (!(vec->dirty & ILO_DIRTY_IB) && !need_upload)
return;
+ /* make sure vec->ib.hw_resource changes when reallocated */
pipe_resource_reference(&current_hw_res, vec->ib.hw_resource);
if (need_upload) {
- const unsigned offset = vec->ib.index_size * vec->draw->start;
- const unsigned size = vec->ib.index_size * vec->draw->count;
+ const unsigned offset = vec->ib.state.index_size * vec->draw->start;
+ const unsigned size = vec->ib.state.index_size * vec->draw->count;
unsigned hw_offset;
- if (vec->ib.user_buffer) {
+ if (vec->ib.state.user_buffer) {
u_upload_data(ilo->uploader, 0, size,
- vec->ib.user_buffer + offset, &hw_offset, &vec->ib.hw_resource);
- }
- else {
- u_upload_buffer(ilo->uploader, 0, vec->ib.offset + offset, size,
- vec->ib.buffer, &hw_offset, &vec->ib.hw_resource);
+ vec->ib.state.user_buffer + offset,
+ &hw_offset, &vec->ib.hw_resource);
+ } else {
+ u_upload_buffer(ilo->uploader, 0,
+ vec->ib.state.offset + offset, size, vec->ib.state.buffer,
+ &hw_offset, &vec->ib.hw_resource);
}
/* the HW offset should be aligned */
- assert(hw_offset % vec->ib.index_size == 0);
- vec->ib.draw_start_offset = hw_offset / vec->ib.index_size;
+ assert(hw_offset % vec->ib.state.index_size == 0);
+ vertex_start_bias = hw_offset / vec->ib.state.index_size;
/*
* INDEX[vec->draw->start] in the original buffer is INDEX[0] in the HW
* resource
*/
- vec->ib.draw_start_offset -= vec->draw->start;
- }
- else {
- pipe_resource_reference(&vec->ib.hw_resource, vec->ib.buffer);
+ vertex_start_bias -= vec->draw->start;
+ } else {
+ pipe_resource_reference(&vec->ib.hw_resource, vec->ib.state.buffer);
/* note that index size may be zero when the draw is not indexed */
if (vec->draw->indexed)
- vec->ib.draw_start_offset = vec->ib.offset / vec->ib.index_size;
- else
- vec->ib.draw_start_offset = 0;
+ vertex_start_bias = vec->ib.state.offset / vec->ib.state.index_size;
}
+ vec->draw_info.vertex_start += vertex_start_bias;
+
/* treat the IB as clean if the HW states do not change */
if (vec->ib.hw_resource == current_hw_res &&
- vec->ib.hw_index_size == vec->ib.index_size)
+ vec->ib.hw_index_size == vec->ib.state.index_size)
vec->dirty &= ~ILO_DIRTY_IB;
else
- vec->ib.hw_index_size = vec->ib.index_size;
+ vec->ib.hw_index_size = vec->ib.state.index_size;
pipe_resource_reference(&current_hw_res, NULL);
+
+ memset(&info, 0, sizeof(info));
+ if (vec->ib.hw_resource) {
+ info.buf = ilo_buffer(vec->ib.hw_resource);
+ info.size = info.buf->bo_size;
+ info.format = ilo_translate_index_size(vec->ib.hw_index_size);
+
+ vec->ib.ib.bo = info.buf->bo;
+ }
+
+ ilo_state_index_buffer_set_info(&vec->ib.ib, dev, &info);
}
static void
finalize_vertex_elements(struct ilo_context *ilo)
{
+ const struct ilo_dev *dev = ilo->dev;
+ struct ilo_state_vector *vec = &ilo->state_vector;
+ struct ilo_ve_state *ve = vec->ve;
+ const bool last_element_edge_flag = (vec->vs &&
+ ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_VS_INPUT_EDGEFLAG));
+ const bool prepend_vertexid = (vec->vs &&
+ ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_VS_INPUT_VERTEXID));
+ const bool prepend_instanceid = (vec->vs &&
+ ilo_shader_get_kernel_param(vec->vs,
+ ILO_KERNEL_VS_INPUT_INSTANCEID));
+ const enum gen_index_format index_format = (vec->draw->indexed) ?
+ ilo_translate_index_size(vec->ib.state.index_size) : GEN6_INDEX_DWORD;
+
+ /* check for non-orthogonal states */
+ if (ve->vf_params.cv_topology != vec->draw_info.topology ||
+ ve->vf_params.prepend_vertexid != prepend_vertexid ||
+ ve->vf_params.prepend_instanceid != prepend_instanceid ||
+ ve->vf_params.last_element_edge_flag != last_element_edge_flag ||
+ ve->vf_params.cv_index_format != index_format ||
+ ve->vf_params.cut_index_enable != vec->draw->primitive_restart ||
+ ve->vf_params.cut_index != vec->draw->restart_index) {
+ ve->vf_params.cv_topology = vec->draw_info.topology;
+ ve->vf_params.prepend_vertexid = prepend_vertexid;
+ ve->vf_params.prepend_instanceid = prepend_instanceid;
+ ve->vf_params.last_element_edge_flag = last_element_edge_flag;
+ ve->vf_params.cv_index_format = index_format;
+ ve->vf_params.cut_index_enable = vec->draw->primitive_restart;
+ ve->vf_params.cut_index = vec->draw->restart_index;
+
+ ilo_state_vf_set_params(&ve->vf, dev, &ve->vf_params);
+
+ vec->dirty |= ILO_DIRTY_VE;
+ }
+}
+
+static void
+finalize_vertex_buffers(struct ilo_context *ilo)
+{
+ const struct ilo_dev *dev = ilo->dev;
struct ilo_state_vector *vec = &ilo->state_vector;
+ struct ilo_state_vertex_buffer_info info;
+ unsigned i;
- if (!(vec->dirty & (ILO_DIRTY_VE | ILO_DIRTY_VS)))
+ if (!(vec->dirty & (ILO_DIRTY_VE | ILO_DIRTY_VB)))
return;
- vec->dirty |= ILO_DIRTY_VE;
+ memset(&info, 0, sizeof(info));
+
+ for (i = 0; i < vec->ve->vb_count; i++) {
+ const unsigned pipe_idx = vec->ve->vb_mapping[i];
+ const struct pipe_vertex_buffer *cso = &vec->vb.states[pipe_idx];
+
+ if (cso->buffer) {
+ info.buf = ilo_buffer(cso->buffer);
+ info.offset = cso->buffer_offset;
+ info.size = info.buf->bo_size;
+
+ info.stride = cso->stride;
+
+ vec->vb.vb[i].bo = info.buf->bo;
+ } else {
+ memset(&info, 0, sizeof(info));
+ }
+
+ ilo_state_vertex_buffer_set_info(&vec->vb.vb[i], dev, &info);
+ }
+}
+
+static void
+finalize_urb(struct ilo_context *ilo)
+{
+ const uint16_t attr_size = sizeof(uint32_t) * 4;
+ const struct ilo_dev *dev = ilo->dev;
+ struct ilo_state_vector *vec = &ilo->state_vector;
+ struct ilo_state_urb_info info;
+
+ if (!(vec->dirty & (ILO_DIRTY_VE | ILO_DIRTY_VS |
+ ILO_DIRTY_GS | ILO_DIRTY_FS)))
+ return;
+
+ memset(&info, 0, sizeof(info));
+
+ info.ve_entry_size = attr_size * ilo_state_vf_get_attr_count(&vec->ve->vf);
+
+ if (vec->vs) {
+ info.vs_const_data = (bool)
+ (ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_PCB_CBUF0_SIZE) +
+ ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_VS_PCB_UCP_SIZE));
+ info.vs_entry_size = attr_size *
+ ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_OUTPUT_COUNT);
+ }
+
+ if (vec->gs) {
+ info.gs_const_data = (bool)
+ ilo_shader_get_kernel_param(vec->gs, ILO_KERNEL_PCB_CBUF0_SIZE);
- vec->ve->last_cso_edgeflag = false;
- if (vec->ve->count && vec->vs &&
- ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_VS_INPUT_EDGEFLAG)) {
- vec->ve->edgeflag_cso = vec->ve->cso[vec->ve->count - 1];
- ilo_gpe_set_ve_edgeflag(ilo->dev, &vec->ve->edgeflag_cso);
- vec->ve->last_cso_edgeflag = true;
- }
-
- vec->ve->prepend_nosrc_cso = false;
- if (vec->vs &&
- (ilo_shader_get_kernel_param(vec->vs,
- ILO_KERNEL_VS_INPUT_INSTANCEID) ||
- ilo_shader_get_kernel_param(vec->vs,
- ILO_KERNEL_VS_INPUT_VERTEXID))) {
- ilo_gpe_init_ve_nosrc(ilo->dev,
- GEN6_VFCOMP_STORE_VID,
- GEN6_VFCOMP_STORE_IID,
- GEN6_VFCOMP_NOSTORE,
- GEN6_VFCOMP_NOSTORE,
- &vec->ve->nosrc_cso);
- vec->ve->prepend_nosrc_cso = true;
- } else if (!vec->vs) {
- /* generate VUE header */
- ilo_gpe_init_ve_nosrc(ilo->dev,
- GEN6_VFCOMP_STORE_0, /* Reserved */
- GEN6_VFCOMP_STORE_0, /* Render Target Array Index */
- GEN6_VFCOMP_STORE_0, /* Viewport Index */
- GEN6_VFCOMP_STORE_0, /* Point Width */
- &vec->ve->nosrc_cso);
- vec->ve->prepend_nosrc_cso = true;
- } else if (!vec->ve->count) {
/*
- * From the Sandy Bridge PRM, volume 2 part 1, page 92:
+ * From the Ivy Bridge PRM, volume 2 part 1, page 189:
+ *
+ * "All outputs of a GS thread will be stored in the single GS
+ * thread output URB entry."
*
- * "SW must ensure that at least one vertex element is defined prior
- * to issuing a 3DPRIMTIVE command, or operation is UNDEFINED."
+ * TODO
*/
- ilo_gpe_init_ve_nosrc(ilo->dev,
- GEN6_VFCOMP_STORE_0,
- GEN6_VFCOMP_STORE_0,
- GEN6_VFCOMP_STORE_0,
- GEN6_VFCOMP_STORE_1_FP,
- &vec->ve->nosrc_cso);
- vec->ve->prepend_nosrc_cso = true;
+ info.gs_entry_size = attr_size *
+ ilo_shader_get_kernel_param(vec->gs, ILO_KERNEL_OUTPUT_COUNT);
+ }
+
+ if (vec->fs) {
+ info.ps_const_data = (bool)
+ ilo_shader_get_kernel_param(vec->fs, ILO_KERNEL_PCB_CBUF0_SIZE);
+ }
+
+ ilo_state_urb_set_info(&vec->urb, dev, &info);
+}
+
+static void
+finalize_viewport(struct ilo_context *ilo)
+{
+ const struct ilo_dev *dev = ilo->dev;
+ struct ilo_state_vector *vec = &ilo->state_vector;
+
+ if (vec->dirty & ILO_DIRTY_VIEWPORT) {
+ ilo_state_viewport_set_params(&vec->viewport.vp,
+ dev, &vec->viewport.params, false);
+ } else if (vec->dirty & ILO_DIRTY_SCISSOR) {
+ ilo_state_viewport_set_params(&vec->viewport.vp,
+ dev, &vec->viewport.params, true);
+ vec->dirty |= ILO_DIRTY_VIEWPORT;
+ }
+}
+
+static bool
+can_enable_gb_test(const struct ilo_rasterizer_state *rasterizer,
+ const struct ilo_viewport_state *viewport,
+ const struct ilo_fb_state *fb)
+{
+ unsigned i;
+
+ /*
+ * There are several reasons that guard band test should be disabled
+ *
+ * - GL wide points (to avoid partially visible object)
+ * - GL wide or AA lines (to avoid partially visible object)
+ * - missing 2D clipping
+ */
+ if (rasterizer->state.point_size_per_vertex ||
+ rasterizer->state.point_size > 1.0f ||
+ rasterizer->state.line_width > 1.0f ||
+ rasterizer->state.line_smooth)
+ return false;
+
+ for (i = 0; i < viewport->params.count; i++) {
+ const struct ilo_state_viewport_matrix_info *mat =
+ &viewport->matrices[i];
+ float min_x, max_x, min_y, max_y;
+
+ min_x = -1.0f * fabsf(mat->scale[0]) + mat->translate[0];
+ max_x = 1.0f * fabsf(mat->scale[0]) + mat->translate[0];
+ min_y = -1.0f * fabsf(mat->scale[1]) + mat->translate[1];
+ max_y = 1.0f * fabsf(mat->scale[1]) + mat->translate[1];
+
+ if (min_x > 0.0f || max_x < fb->state.width ||
+ min_y > 0.0f || max_y < fb->state.height)
+ return false;
+ }
+
+ return true;
+}
+
+static void
+finalize_rasterizer(struct ilo_context *ilo)
+{
+ const struct ilo_dev *dev = ilo->dev;
+ struct ilo_state_vector *vec = &ilo->state_vector;
+ struct ilo_rasterizer_state *rasterizer = vec->rasterizer;
+ struct ilo_state_raster_info *info = &vec->rasterizer->info;
+ const bool gb_test_enable =
+ can_enable_gb_test(rasterizer, &vec->viewport, &vec->fb);
+ const bool multisample =
+ (rasterizer->state.multisample && vec->fb.num_samples > 1);
+ const uint8_t barycentric_interps = ilo_shader_get_kernel_param(vec->fs,
+ ILO_KERNEL_FS_BARYCENTRIC_INTERPOLATIONS);
+
+ /* check for non-orthogonal states */
+ if (info->clip.viewport_count != vec->viewport.params.count ||
+ info->clip.gb_test_enable != gb_test_enable ||
+ info->setup.msaa_enable != multisample ||
+ info->setup.line_msaa_enable != multisample ||
+ info->tri.depth_offset_format != vec->fb.depth_offset_format ||
+ info->scan.sample_count != vec->fb.num_samples ||
+ info->scan.sample_mask != vec->sample_mask ||
+ info->scan.barycentric_interps != barycentric_interps ||
+ info->params.any_integer_rt != vec->fb.has_integer_rt ||
+ info->params.hiz_enable != vec->fb.has_hiz) {
+ info->clip.viewport_count = vec->viewport.params.count;
+ info->clip.gb_test_enable = gb_test_enable;
+ info->setup.msaa_enable = multisample;
+ info->setup.line_msaa_enable = multisample;
+ info->tri.depth_offset_format = vec->fb.depth_offset_format;
+ info->scan.sample_count = vec->fb.num_samples;
+ info->scan.sample_mask = vec->sample_mask;
+ info->scan.barycentric_interps = barycentric_interps;
+ info->params.any_integer_rt = vec->fb.has_integer_rt;
+ info->params.hiz_enable = vec->fb.has_hiz;
+
+ ilo_state_raster_set_info(&rasterizer->rs, dev, &rasterizer->info);
+
+ vec->dirty |= ILO_DIRTY_RASTERIZER;
+ }
+}
+
+static bool
+finalize_blend_rt(struct ilo_context *ilo)
+{
+ struct ilo_state_vector *vec = &ilo->state_vector;
+ const struct ilo_fb_state *fb = &vec->fb;
+ struct ilo_blend_state *blend = vec->blend;
+ struct ilo_state_cc_blend_info *info = &vec->blend->info.blend;
+ bool changed = false;
+ unsigned i;
+
+ if (!(vec->dirty & (ILO_DIRTY_FB | ILO_DIRTY_BLEND)))
+ return false;
+
+ /* set up one for dummy RT writes */
+ if (!fb->state.nr_cbufs) {
+ if (info->rt != &blend->dummy_rt) {
+ info->rt = &blend->dummy_rt;
+ info->rt_count = 1;
+ changed = true;
+ }
+
+ return changed;
+ }
+
+ if (info->rt != blend->effective_rt ||
+ info->rt_count != fb->state.nr_cbufs) {
+ info->rt = blend->effective_rt;
+ info->rt_count = fb->state.nr_cbufs;
+ changed = true;
+ }
+
+ for (i = 0; i < fb->state.nr_cbufs; i++) {
+ const struct ilo_fb_blend_caps *caps = &fb->blend_caps[i];
+ struct ilo_state_cc_blend_rt_info *rt = &blend->effective_rt[i];
+ /* ignore logicop when not UNORM */
+ const bool logicop_enable =
+ (blend->rt[i].logicop_enable && caps->is_unorm);
+
+ if (rt->cv_is_unorm != caps->is_unorm ||
+ rt->cv_is_integer != caps->is_integer ||
+ rt->logicop_enable != logicop_enable ||
+ rt->force_dst_alpha_one != caps->force_dst_alpha_one) {
+ rt->cv_is_unorm = caps->is_unorm;
+ rt->cv_is_integer = caps->is_integer;
+ rt->logicop_enable = logicop_enable;
+ rt->force_dst_alpha_one = caps->force_dst_alpha_one;
+
+ changed = true;
+ }
+ }
+
+ return changed;
+}
+
+static void
+finalize_blend(struct ilo_context *ilo)
+{
+ const struct ilo_dev *dev = ilo->dev;
+ struct ilo_state_vector *vec = &ilo->state_vector;
+ struct ilo_blend_state *blend = vec->blend;
+ struct ilo_state_cc_info *info = &blend->info;
+ const bool sample_count_one = (vec->fb.num_samples <= 1);
+ const bool float_source0_alpha =
+ (!vec->fb.state.nr_cbufs || !vec->fb.state.cbufs[0] ||
+ !util_format_is_pure_integer(vec->fb.state.cbufs[0]->format));
+
+ /* check for non-orthogonal states */
+ if (finalize_blend_rt(ilo) ||
+ info->alpha.cv_sample_count_one != sample_count_one ||
+ info->alpha.cv_float_source0_alpha != float_source0_alpha ||
+ info->alpha.test_enable != vec->dsa->alpha_test ||
+ info->alpha.test_func != vec->dsa->alpha_func ||
+ memcmp(&info->stencil, &vec->dsa->stencil, sizeof(info->stencil)) ||
+ memcmp(&info->depth, &vec->dsa->depth, sizeof(info->depth)) ||
+ memcmp(&info->params, &vec->cc_params, sizeof(info->params))) {
+ info->alpha.cv_sample_count_one = sample_count_one;
+ info->alpha.cv_float_source0_alpha = float_source0_alpha;
+ info->alpha.test_enable = vec->dsa->alpha_test;
+ info->alpha.test_func = vec->dsa->alpha_func;
+ info->stencil = vec->dsa->stencil;
+ info->depth = vec->dsa->depth;
+ info->params = vec->cc_params;
+
+ ilo_state_cc_set_info(&blend->cc, dev, info);
+
+ blend->alpha_may_kill = (info->alpha.alpha_to_coverage ||
+ info->alpha.test_enable);
+
+ vec->dirty |= ILO_DIRTY_BLEND;
}
}
@@ -254,10 +793,24 @@ ilo_finalize_3d_states(struct ilo_context *ilo,
{
ilo->state_vector.draw = draw;
+ ilo->state_vector.draw_info.topology = ilo_translate_draw_mode(draw->mode);
+ ilo->state_vector.draw_info.indexed = draw->indexed;
+ ilo->state_vector.draw_info.vertex_count = draw->count;
+ ilo->state_vector.draw_info.vertex_start = draw->start;
+ ilo->state_vector.draw_info.instance_count = draw->instance_count;
+ ilo->state_vector.draw_info.instance_start = draw->start_instance;
+ ilo->state_vector.draw_info.vertex_base = draw->index_bias;
+
+ finalize_blend(ilo);
finalize_shader_states(&ilo->state_vector);
finalize_constant_buffers(ilo);
finalize_index_buffer(ilo);
finalize_vertex_elements(ilo);
+ finalize_vertex_buffers(ilo);
+
+ finalize_urb(ilo);
+ finalize_rasterizer(ilo);
+ finalize_viewport(ilo);
u_upload_unmap(ilo->uploader);
}
@@ -301,12 +854,79 @@ ilo_create_blend_state(struct pipe_context *pipe,
const struct pipe_blend_state *state)
{
const struct ilo_dev *dev = ilo_context(pipe)->dev;
+ struct ilo_state_cc_info *info;
struct ilo_blend_state *blend;
+ int i;
- blend = MALLOC_STRUCT(ilo_blend_state);
+ blend = CALLOC_STRUCT(ilo_blend_state);
assert(blend);
- ilo_gpe_init_blend(dev, state, blend);
+ info = &blend->info;
+
+ info->alpha.cv_float_source0_alpha = true;
+ info->alpha.cv_sample_count_one = true;
+ info->alpha.alpha_to_one = state->alpha_to_one;
+ info->alpha.alpha_to_coverage = state->alpha_to_coverage;
+ info->alpha.test_enable = false;
+ info->alpha.test_func = GEN6_COMPAREFUNCTION_ALWAYS;
+
+ info->stencil.cv_has_buffer = true;
+ info->depth.cv_has_buffer= true;
+
+ info->blend.rt = blend->effective_rt;
+ info->blend.rt_count = 1;
+ info->blend.dither_enable = state->dither;
+
+ for (i = 0; i < ARRAY_SIZE(blend->rt); i++) {
+ const struct pipe_rt_blend_state *rt = &state->rt[i];
+ struct ilo_state_cc_blend_rt_info *rt_info = &blend->rt[i];
+
+ rt_info->cv_has_buffer = true;
+ rt_info->cv_is_unorm = true;
+ rt_info->cv_is_integer = false;
+
+ /* logic op takes precedence over blending */
+ if (state->logicop_enable) {
+ rt_info->logicop_enable = true;
+ rt_info->logicop_func = ilo_translate_logicop(state->logicop_func);
+ } else if (rt->blend_enable) {
+ rt_info->blend_enable = true;
+
+ rt_info->rgb_src = ilo_translate_blend_factor(rt->rgb_src_factor);
+ rt_info->rgb_dst = ilo_translate_blend_factor(rt->rgb_dst_factor);
+ rt_info->rgb_func = ilo_translate_blend_func(rt->rgb_func);
+
+ rt_info->a_src = ilo_translate_blend_factor(rt->alpha_src_factor);
+ rt_info->a_dst = ilo_translate_blend_factor(rt->alpha_dst_factor);
+ rt_info->a_func = ilo_translate_blend_func(rt->alpha_func);
+ }
+
+ if (!(rt->colormask & PIPE_MASK_A))
+ rt_info->argb_write_disables |= (1 << 3);
+ if (!(rt->colormask & PIPE_MASK_R))
+ rt_info->argb_write_disables |= (1 << 2);
+ if (!(rt->colormask & PIPE_MASK_G))
+ rt_info->argb_write_disables |= (1 << 1);
+ if (!(rt->colormask & PIPE_MASK_B))
+ rt_info->argb_write_disables |= (1 << 0);
+
+ if (!state->independent_blend_enable) {
+ for (i = 1; i < ARRAY_SIZE(blend->rt); i++)
+ blend->rt[i] = *rt_info;
+ break;
+ }
+ }
+
+ memcpy(blend->effective_rt, blend->rt, sizeof(blend->rt));
+
+ blend->dummy_rt.argb_write_disables = 0xf;
+
+ if (!ilo_state_cc_init(&blend->cc, dev, &blend->info)) {
+ FREE(blend);
+ return NULL;
+ }
+
+ blend->dual_blend = util_blend_state_is_dual(state, 0);
return blend;
}
@@ -333,11 +953,105 @@ ilo_create_sampler_state(struct pipe_context *pipe,
{
const struct ilo_dev *dev = ilo_context(pipe)->dev;
struct ilo_sampler_cso *sampler;
+ struct ilo_state_sampler_info info;
+ struct ilo_state_sampler_border_info border;
- sampler = MALLOC_STRUCT(ilo_sampler_cso);
+ sampler = CALLOC_STRUCT(ilo_sampler_cso);
assert(sampler);
- ilo_gpe_init_sampler_cso(dev, state, sampler);
+ memset(&info, 0, sizeof(info));
+
+ info.non_normalized = !state->normalized_coords;
+ if (state->normalized_coords) {
+ info.lod_bias = state->lod_bias;
+ info.min_lod = state->min_lod;
+ info.max_lod = state->max_lod;
+
+ info.mip_filter = ilo_translate_mip_filter(state->min_mip_filter);
+ } else {
+ /* work around a bug in util_blitter */
+ info.mip_filter = GEN6_MIPFILTER_NONE;
+ }
+
+ if (state->max_anisotropy) {
+ info.min_filter = GEN6_MAPFILTER_ANISOTROPIC;
+ info.mag_filter = GEN6_MAPFILTER_ANISOTROPIC;
+ } else {
+ info.min_filter = ilo_translate_img_filter(state->min_img_filter);
+ info.mag_filter = ilo_translate_img_filter(state->mag_img_filter);
+ }
+
+ info.max_anisotropy = ilo_translate_max_anisotropy(state->max_anisotropy);
+
+ /* use LOD 0 when no mipmapping (see sampler_set_gen6_SAMPLER_STATE()) */
+ if (info.mip_filter == GEN6_MIPFILTER_NONE && info.min_lod > 0.0f) {
+ info.min_lod = 0.0f;
+ info.mag_filter = info.min_filter;
+ }
+
+ if (state->seamless_cube_map) {
+ if (state->min_img_filter == PIPE_TEX_FILTER_NEAREST ||
+ state->mag_img_filter == PIPE_TEX_FILTER_NEAREST) {
+ info.tcx_ctrl = GEN6_TEXCOORDMODE_CLAMP;
+ info.tcy_ctrl = GEN6_TEXCOORDMODE_CLAMP;
+ info.tcz_ctrl = GEN6_TEXCOORDMODE_CLAMP;
+ } else {
+ info.tcx_ctrl = GEN6_TEXCOORDMODE_CUBE;
+ info.tcy_ctrl = GEN6_TEXCOORDMODE_CUBE;
+ info.tcz_ctrl = GEN6_TEXCOORDMODE_CUBE;
+ }
+ } else {
+ info.tcx_ctrl = ilo_translate_address_wrap(state->wrap_s);
+ info.tcy_ctrl = ilo_translate_address_wrap(state->wrap_t);
+ info.tcz_ctrl = ilo_translate_address_wrap(state->wrap_r);
+
+ if (ilo_dev_gen(dev) < ILO_GEN(8)) {
+ /*
+ * For nearest filtering, PIPE_TEX_WRAP_CLAMP means
+ * PIPE_TEX_WRAP_CLAMP_TO_EDGE; for linear filtering,
+ * PIPE_TEX_WRAP_CLAMP means PIPE_TEX_WRAP_CLAMP_TO_BORDER while
+ * additionally clamping the texture coordinates to [0.0, 1.0].
+ *
+ * PIPE_TEX_WRAP_CLAMP is not supported natively until Gen8. The
+ * clamping has to be taken care of in the shaders. There are two
+ * filters here, but let the minification one have a say.
+ */
+ const bool clamp_is_to_edge =
+ (state->min_img_filter == PIPE_TEX_FILTER_NEAREST);
+
+ if (clamp_is_to_edge) {
+ if (info.tcx_ctrl == GEN8_TEXCOORDMODE_HALF_BORDER)
+ info.tcx_ctrl = GEN6_TEXCOORDMODE_CLAMP;
+ if (info.tcy_ctrl == GEN8_TEXCOORDMODE_HALF_BORDER)
+ info.tcy_ctrl = GEN6_TEXCOORDMODE_CLAMP;
+ if (info.tcz_ctrl == GEN8_TEXCOORDMODE_HALF_BORDER)
+ info.tcz_ctrl = GEN6_TEXCOORDMODE_CLAMP;
+ } else {
+ if (info.tcx_ctrl == GEN8_TEXCOORDMODE_HALF_BORDER) {
+ info.tcx_ctrl = GEN6_TEXCOORDMODE_CLAMP_BORDER;
+ sampler->saturate_s = true;
+ }
+ if (info.tcy_ctrl == GEN8_TEXCOORDMODE_HALF_BORDER) {
+ info.tcy_ctrl = GEN6_TEXCOORDMODE_CLAMP_BORDER;
+ sampler->saturate_t = true;
+ }
+ if (info.tcz_ctrl == GEN8_TEXCOORDMODE_HALF_BORDER) {
+ info.tcz_ctrl = GEN6_TEXCOORDMODE_CLAMP_BORDER;
+ sampler->saturate_r = true;
+ }
+ }
+ }
+ }
+
+ if (state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE)
+ info.shadow_func = ilo_translate_shadow_func(state->compare_func);
+
+ ilo_state_sampler_init(&sampler->sampler, dev, &info);
+
+ memset(&border, 0, sizeof(border));
+ memcpy(border.rgba.f, state->border_color.f, sizeof(border.rgba.f));
+
+ ilo_state_sampler_border_init(&sampler->border, dev, &border);
return sampler;
}
@@ -403,12 +1117,74 @@ ilo_create_rasterizer_state(struct pipe_context *pipe,
{
const struct ilo_dev *dev = ilo_context(pipe)->dev;
struct ilo_rasterizer_state *rast;
+ struct ilo_state_raster_info *info;
- rast = MALLOC_STRUCT(ilo_rasterizer_state);
+ rast = CALLOC_STRUCT(ilo_rasterizer_state);
assert(rast);
rast->state = *state;
- ilo_gpe_init_rasterizer(dev, state, rast);
+
+ info = &rast->info;
+
+ info->clip.clip_enable = true;
+ info->clip.stats_enable = true;
+ info->clip.viewport_count = 1;
+ info->clip.force_rtaindex_zero = true;
+ info->clip.user_clip_enables = state->clip_plane_enable;
+ info->clip.gb_test_enable = true;
+ info->clip.xy_test_enable = true;
+ info->clip.z_far_enable = state->depth_clip;
+ info->clip.z_near_enable = state->depth_clip;
+ info->clip.z_near_zero = state->clip_halfz;
+
+ info->setup.first_vertex_provoking = state->flatshade_first;
+ info->setup.viewport_transform = true;
+ info->setup.scissor_enable = state->scissor;
+ info->setup.msaa_enable = false;
+ info->setup.line_msaa_enable = false;
+ info->point.aa_enable = state->point_smooth;
+ info->point.programmable_width = state->point_size_per_vertex;
+ info->line.aa_enable = state->line_smooth;
+ info->line.stipple_enable = state->line_stipple_enable;
+ info->line.giq_enable = true;
+ info->line.giq_last_pixel = state->line_last_pixel;
+ info->tri.front_winding = ilo_translate_front_ccw(state->front_ccw);
+ info->tri.cull_mode = ilo_translate_cull_face(state->cull_face);
+ info->tri.fill_mode_front = ilo_translate_poly_mode(state->fill_front);
+ info->tri.fill_mode_back = ilo_translate_poly_mode(state->fill_back);
+ info->tri.depth_offset_format = GEN6_ZFORMAT_D24_UNORM_X8_UINT;
+ info->tri.depth_offset_solid = state->offset_tri;
+ info->tri.depth_offset_wireframe = state->offset_line;
+ info->tri.depth_offset_point = state->offset_point;
+ info->tri.poly_stipple_enable = state->poly_stipple_enable;
+
+ info->scan.stats_enable = true;
+ info->scan.sample_count = 1;
+ info->scan.pixloc =
+ ilo_translate_half_pixel_center(state->half_pixel_center);
+ info->scan.sample_mask = ~0u;
+ info->scan.zw_interp = GEN6_ZW_INTERP_PIXEL;
+ info->scan.barycentric_interps = GEN6_INTERP_PERSPECTIVE_PIXEL;
+ info->scan.earlyz_control = GEN7_EDSC_NORMAL;
+ info->scan.earlyz_op = ILO_STATE_RASTER_EARLYZ_NORMAL;
+ info->scan.earlyz_stencil_clear = false;
+
+ info->params.any_integer_rt = false;
+ info->params.hiz_enable = true;
+ info->params.point_width =
+ (state->point_size == 0.0f) ? 1.0f : state->point_size;
+ info->params.line_width =
+ (state->line_width == 0.0f) ? 1.0f : state->line_width;
+
+ info->params.depth_offset_scale = state->offset_scale;
+ /*
+ * Scale the constant term. The minimum representable value used by the HW
+ * is not large enough to be the minimum resolvable difference.
+ */
+ info->params.depth_offset_const = state->offset_units * 2.0f;
+ info->params.depth_offset_clamp = state->offset_clamp;
+
+ ilo_state_raster_init(&rast->rs, dev, info);
return rast;
}
@@ -416,10 +1192,20 @@ ilo_create_rasterizer_state(struct pipe_context *pipe,
static void
ilo_bind_rasterizer_state(struct pipe_context *pipe, void *state)
{
+ const struct ilo_dev *dev = ilo_context(pipe)->dev;
struct ilo_state_vector *vec = &ilo_context(pipe)->state_vector;
vec->rasterizer = state;
+ if (vec->rasterizer) {
+ struct ilo_state_line_stipple_info info;
+
+ info.pattern = vec->rasterizer->state.line_stipple_pattern;
+ info.repeat_count = vec->rasterizer->state.line_stipple_factor + 1;
+
+ ilo_state_line_stipple_set_info(&vec->line_stipple, dev, &info);
+ }
+
vec->dirty |= ILO_DIRTY_RASTERIZER;
}
@@ -433,13 +1219,48 @@ static void *
ilo_create_depth_stencil_alpha_state(struct pipe_context *pipe,
const struct pipe_depth_stencil_alpha_state *state)
{
- const struct ilo_dev *dev = ilo_context(pipe)->dev;
struct ilo_dsa_state *dsa;
+ int i;
- dsa = MALLOC_STRUCT(ilo_dsa_state);
+ dsa = CALLOC_STRUCT(ilo_dsa_state);
assert(dsa);
- ilo_gpe_init_dsa(dev, state, dsa);
+ dsa->depth.cv_has_buffer = true;
+ dsa->depth.test_enable = state->depth.enabled;
+ dsa->depth.write_enable = state->depth.writemask;
+ dsa->depth.test_func = ilo_translate_compare_func(state->depth.func);
+
+ dsa->stencil.cv_has_buffer = true;
+ for (i = 0; i < ARRAY_SIZE(state->stencil); i++) {
+ const struct pipe_stencil_state *stencil = &state->stencil[i];
+ struct ilo_state_cc_stencil_op_info *op;
+
+ if (!stencil->enabled)
+ break;
+
+ if (i == 0) {
+ dsa->stencil.test_enable = true;
+ dsa->stencil_front.test_mask = stencil->valuemask;
+ dsa->stencil_front.write_mask = stencil->writemask;
+
+ op = &dsa->stencil.front;
+ } else {
+ dsa->stencil.twosided_enable = true;
+ dsa->stencil_back.test_mask = stencil->valuemask;
+ dsa->stencil_back.write_mask = stencil->writemask;
+
+ op = &dsa->stencil.back;
+ }
+
+ op->test_func = ilo_translate_compare_func(stencil->func);
+ op->fail_op = ilo_translate_stencil_op(stencil->fail_op);
+ op->zfail_op = ilo_translate_stencil_op(stencil->zfail_op);
+ op->zpass_op = ilo_translate_stencil_op(stencil->zpass_op);
+ }
+
+ dsa->alpha_test = state->alpha.enabled;
+ dsa->alpha_ref = state->alpha.ref_value;
+ dsa->alpha_func = ilo_translate_compare_func(state->alpha.func);
return dsa;
}
@@ -450,6 +1271,17 @@ ilo_bind_depth_stencil_alpha_state(struct pipe_context *pipe, void *state)
struct ilo_state_vector *vec = &ilo_context(pipe)->state_vector;
vec->dsa = state;
+ if (vec->dsa) {
+ vec->cc_params.alpha_ref = vec->dsa->alpha_ref;
+ vec->cc_params.stencil_front.test_mask =
+ vec->dsa->stencil_front.test_mask;
+ vec->cc_params.stencil_front.write_mask =
+ vec->dsa->stencil_front.write_mask;
+ vec->cc_params.stencil_back.test_mask =
+ vec->dsa->stencil_back.test_mask;
+ vec->cc_params.stencil_back.write_mask =
+ vec->dsa->stencil_back.write_mask;
+ }
vec->dirty |= ILO_DIRTY_DSA;
}
@@ -575,12 +1407,60 @@ ilo_create_vertex_elements_state(struct pipe_context *pipe,
const struct pipe_vertex_element *elements)
{
const struct ilo_dev *dev = ilo_context(pipe)->dev;
+ struct ilo_state_vf_element_info vf_elements[PIPE_MAX_ATTRIBS];
+ unsigned instance_divisors[PIPE_MAX_ATTRIBS];
+ struct ilo_state_vf_info vf_info;
struct ilo_ve_state *ve;
+ unsigned i;
- ve = MALLOC_STRUCT(ilo_ve_state);
+ ve = CALLOC_STRUCT(ilo_ve_state);
assert(ve);
- ilo_gpe_init_ve(dev, num_elements, elements, ve);
+ for (i = 0; i < num_elements; i++) {
+ const struct pipe_vertex_element *elem = &elements[i];
+ struct ilo_state_vf_element_info *attr = &vf_elements[i];
+ unsigned hw_idx;
+
+ /*
+ * map the pipe vb to the hardware vb, which has a fixed instance
+ * divisor
+ */
+ for (hw_idx = 0; hw_idx < ve->vb_count; hw_idx++) {
+ if (ve->vb_mapping[hw_idx] == elem->vertex_buffer_index &&
+ instance_divisors[hw_idx] == elem->instance_divisor)
+ break;
+ }
+
+ /* create one if there is no matching hardware vb */
+ if (hw_idx >= ve->vb_count) {
+ hw_idx = ve->vb_count++;
+
+ ve->vb_mapping[hw_idx] = elem->vertex_buffer_index;
+ instance_divisors[hw_idx] = elem->instance_divisor;
+ }
+
+ attr->buffer = hw_idx;
+ attr->vertex_offset = elem->src_offset;
+ attr->format = ilo_format_translate_vertex(dev, elem->src_format);
+ attr->format_size = util_format_get_blocksize(elem->src_format);
+ attr->component_count = util_format_get_nr_components(elem->src_format);
+ attr->is_integer = util_format_is_pure_integer(elem->src_format);
+
+ attr->instancing_enable = (elem->instance_divisor != 0);
+ attr->instancing_step_rate = elem->instance_divisor;
+ }
+
+ memset(&vf_info, 0, sizeof(vf_info));
+ vf_info.data = ve->vf_data;
+ vf_info.data_size = sizeof(ve->vf_data);
+ vf_info.elements = vf_elements;
+ vf_info.element_count = num_elements;
+ /* vf_info.params and ve->vf_params are both zeroed */
+
+ if (!ilo_state_vf_init(&ve->vf, dev, &vf_info)) {
+ FREE(ve);
+ return NULL;
+ }
return ve;
}
@@ -609,7 +1489,7 @@ ilo_set_blend_color(struct pipe_context *pipe,
{
struct ilo_state_vector *vec = &ilo_context(pipe)->state_vector;
- vec->blend_color = *state;
+ memcpy(vec->cc_params.blend_rgba, state->color, sizeof(state->color));
vec->dirty |= ILO_DIRTY_BLEND_COLOR;
}
@@ -626,6 +1506,9 @@ ilo_set_stencil_ref(struct pipe_context *pipe,
vec->stencil_ref = *state;
+ vec->cc_params.stencil_front.test_ref = state->ref_value[0];
+ vec->cc_params.stencil_back.test_ref = state->ref_value[1];
+
vec->dirty |= ILO_DIRTY_STENCIL_REF;
}
@@ -675,47 +1558,47 @@ ilo_set_constant_buffer(struct pipe_context *pipe,
pipe_resource_reference(&cso->resource, buf[i].buffer);
+ cso->info.access = ILO_STATE_SURFACE_ACCESS_DP_DATA;
+ cso->info.format = GEN6_FORMAT_R32G32B32A32_FLOAT;
+ cso->info.format_size = 16;
+ cso->info.struct_size = 16;
+ cso->info.readonly = true;
+ cso->info.size = buf[i].buffer_size;
+
if (buf[i].buffer) {
- const enum pipe_format elem_format =
- PIPE_FORMAT_R32G32B32A32_FLOAT;
+ cso->info.buf = ilo_buffer(buf[i].buffer);
+ cso->info.offset = buf[i].buffer_offset;
- ilo_gpe_init_view_surface_for_buffer(dev,
- ilo_buffer(buf[i].buffer),
- buf[i].buffer_offset, buf[i].buffer_size,
- util_format_get_blocksize(elem_format), elem_format,
- false, false, &cso->surface);
+ memset(&cso->surface, 0, sizeof(cso->surface));
+ ilo_state_surface_init_for_buffer(&cso->surface, dev, &cso->info);
+ cso->surface.bo = cso->info.buf->bo;
cso->user_buffer = NULL;
- cso->user_buffer_size = 0;
cbuf->enabled_mask |= 1 << (index + i);
- }
- else if (buf[i].user_buffer) {
- cso->surface.bo = NULL;
-
+ } else if (buf[i].user_buffer) {
+ cso->info.buf = NULL;
/* buffer_offset does not apply for user buffer */
cso->user_buffer = buf[i].user_buffer;
- cso->user_buffer_size = buf[i].buffer_size;
cbuf->enabled_mask |= 1 << (index + i);
- }
- else {
- cso->surface.bo = NULL;
+ } else {
+ cso->info.buf = NULL;
+ cso->info.size = 0;
cso->user_buffer = NULL;
- cso->user_buffer_size = 0;
cbuf->enabled_mask &= ~(1 << (index + i));
}
}
- }
- else {
+ } else {
for (i = 0; i < count; i++) {
struct ilo_cbuf_cso *cso = &cbuf->cso[index + i];
pipe_resource_reference(&cso->resource, NULL);
- cso->surface.bo = NULL;
+
+ cso->info.buf = NULL;
+ cso->info.size = 0;
cso->user_buffer = NULL;
- cso->user_buffer_size = 0;
cbuf->enabled_mask &= ~(1 << (index + i));
}
@@ -725,13 +1608,116 @@ ilo_set_constant_buffer(struct pipe_context *pipe,
}
static void
+fb_set_blend_caps(const struct ilo_dev *dev,
+ enum pipe_format format,
+ struct ilo_fb_blend_caps *caps)
+{
+ const struct util_format_description *desc =
+ util_format_description(format);
+ const int ch = util_format_get_first_non_void_channel(format);
+
+ memset(caps, 0, sizeof(*caps));
+
+ if (format == PIPE_FORMAT_NONE || desc->is_mixed)
+ return;
+
+ caps->is_unorm = (ch >= 0 && desc->channel[ch].normalized &&
+ desc->channel[ch].type == UTIL_FORMAT_TYPE_UNSIGNED &&
+ desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB);
+ caps->is_integer = util_format_is_pure_integer(format);
+
+ /*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 365:
+ *
+ * "Logic Ops are only supported on *_UNORM surfaces (excluding _SRGB
+ * variants), otherwise Logic Ops must be DISABLED."
+ *
+ * According to the classic driver, this is lifted on Gen8+.
+ */
+ caps->can_logicop = (ilo_dev_gen(dev) >= ILO_GEN(8) || caps->is_unorm);
+
+ /* no blending for pure integer formats */
+ caps->can_blend = !caps->is_integer;
+
+ /*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 382:
+ *
+ * "Alpha Test can only be enabled if Pixel Shader outputs a float
+ * alpha value."
+ */
+ caps->can_alpha_test = !caps->is_integer;
+
+ caps->force_dst_alpha_one =
+ (ilo_format_translate_render(dev, format) !=
+ ilo_format_translate_color(dev, format));
+
+ /* sanity check */
+ if (caps->force_dst_alpha_one) {
+ enum pipe_format render_format;
+
+ switch (format) {
+ case PIPE_FORMAT_B8G8R8X8_UNORM:
+ render_format = PIPE_FORMAT_B8G8R8A8_UNORM;
+ break;
+ default:
+ render_format = PIPE_FORMAT_NONE;
+ break;
+ }
+
+ assert(ilo_format_translate_render(dev, format) ==
+ ilo_format_translate_color(dev, render_format));
+ }
+}
+
+static void
ilo_set_framebuffer_state(struct pipe_context *pipe,
const struct pipe_framebuffer_state *state)
{
const struct ilo_dev *dev = ilo_context(pipe)->dev;
struct ilo_state_vector *vec = &ilo_context(pipe)->state_vector;
+ struct ilo_fb_state *fb = &vec->fb;
+ const struct pipe_surface *first_surf = NULL;
+ int i;
+
+ util_copy_framebuffer_state(&fb->state, state);
+
+ fb->has_integer_rt = false;
+ for (i = 0; i < state->nr_cbufs; i++) {
+ if (state->cbufs[i]) {
+ fb_set_blend_caps(dev, state->cbufs[i]->format, &fb->blend_caps[i]);
- ilo_gpe_set_fb(dev, state, &vec->fb);
+ fb->has_integer_rt |= fb->blend_caps[i].is_integer;
+
+ if (!first_surf)
+ first_surf = state->cbufs[i];
+ } else {
+ fb_set_blend_caps(dev, PIPE_FORMAT_NONE, &fb->blend_caps[i]);
+ }
+ }
+
+ if (!first_surf && state->zsbuf)
+ first_surf = state->zsbuf;
+
+ fb->num_samples = (first_surf) ? first_surf->texture->nr_samples : 1;
+ if (!fb->num_samples)
+ fb->num_samples = 1;
+
+ if (state->zsbuf) {
+ const struct ilo_surface_cso *cso =
+ (const struct ilo_surface_cso *) state->zsbuf;
+
+ fb->has_hiz = cso->u.zs.hiz_bo;
+ fb->depth_offset_format =
+ ilo_state_zs_get_depth_format(&cso->u.zs, dev);
+ } else {
+ fb->has_hiz = false;
+ fb->depth_offset_format = GEN6_ZFORMAT_D32_FLOAT;
+ }
+
+ /*
+ * The PRMs list several restrictions when the framebuffer has more than
+ * one surface. It seems they are actually lifted on GEN6+.
+ */
vec->dirty |= ILO_DIRTY_FB;
}
@@ -740,9 +1726,15 @@ static void
ilo_set_polygon_stipple(struct pipe_context *pipe,
const struct pipe_poly_stipple *state)
{
+ const struct ilo_dev *dev = ilo_context(pipe)->dev;
struct ilo_state_vector *vec = &ilo_context(pipe)->state_vector;
+ struct ilo_state_poly_stipple_info info;
+ int i;
+
+ for (i = 0; i < 32; i++)
+ info.pattern[i] = state->stipple[i];
- vec->poly_stipple = *state;
+ ilo_state_poly_stipple_set_info(&vec->poly_stipple, dev, &info);
vec->dirty |= ILO_DIRTY_POLY_STIPPLE;
}
@@ -753,11 +1745,26 @@ ilo_set_scissor_states(struct pipe_context *pipe,
unsigned num_scissors,
const struct pipe_scissor_state *scissors)
{
- const struct ilo_dev *dev = ilo_context(pipe)->dev;
struct ilo_state_vector *vec = &ilo_context(pipe)->state_vector;
+ unsigned i;
+
+ for (i = 0; i < num_scissors; i++) {
+ struct ilo_state_viewport_scissor_info *info =
+ &vec->viewport.scissors[start_slot + i];
- ilo_gpe_set_scissor(dev, start_slot, num_scissors,
- scissors, &vec->scissor);
+ if (scissors[i].minx < scissors[i].maxx &&
+ scissors[i].miny < scissors[i].maxy) {
+ info->min_x = scissors[i].minx;
+ info->min_y = scissors[i].miny;
+ info->max_x = scissors[i].maxx - 1;
+ info->max_y = scissors[i].maxy - 1;
+ } else {
+ info->min_x = 1;
+ info->min_y = 1;
+ info->max_x = 0;
+ info->max_y = 0;
+ }
+ }
vec->dirty |= ILO_DIRTY_SCISSOR;
}
@@ -768,28 +1775,31 @@ ilo_set_viewport_states(struct pipe_context *pipe,
unsigned num_viewports,
const struct pipe_viewport_state *viewports)
{
- const struct ilo_dev *dev = ilo_context(pipe)->dev;
struct ilo_state_vector *vec = &ilo_context(pipe)->state_vector;
if (viewports) {
unsigned i;
for (i = 0; i < num_viewports; i++) {
- ilo_gpe_set_viewport_cso(dev, &viewports[i],
- &vec->viewport.cso[start_slot + i]);
+ struct ilo_state_viewport_matrix_info *info =
+ &vec->viewport.matrices[start_slot + i];
+
+ memcpy(info->scale, viewports[i].scale, sizeof(info->scale));
+ memcpy(info->translate, viewports[i].translate,
+ sizeof(info->translate));
}
- if (vec->viewport.count < start_slot + num_viewports)
- vec->viewport.count = start_slot + num_viewports;
+ if (vec->viewport.params.count < start_slot + num_viewports)
+ vec->viewport.params.count = start_slot + num_viewports;
/* need to save viewport 0 for util_blitter */
if (!start_slot && num_viewports)
vec->viewport.viewport0 = viewports[0];
}
else {
- if (vec->viewport.count <= start_slot + num_viewports &&
- vec->viewport.count > start_slot)
- vec->viewport.count = start_slot;
+ if (vec->viewport.params.count <= start_slot + num_viewports &&
+ vec->viewport.params.count > start_slot)
+ vec->viewport.params.count = start_slot;
}
vec->dirty |= ILO_DIRTY_VIEWPORT;
@@ -905,16 +1915,11 @@ ilo_set_index_buffer(struct pipe_context *pipe,
struct ilo_state_vector *vec = &ilo_context(pipe)->state_vector;
if (state) {
- pipe_resource_reference(&vec->ib.buffer, state->buffer);
- vec->ib.user_buffer = state->user_buffer;
- vec->ib.offset = state->offset;
- vec->ib.index_size = state->index_size;
- }
- else {
- pipe_resource_reference(&vec->ib.buffer, NULL);
- vec->ib.user_buffer = NULL;
- vec->ib.offset = 0;
- vec->ib.index_size = 0;
+ pipe_resource_reference(&vec->ib.state.buffer, state->buffer);
+ vec->ib.state = *state;
+ } else {
+ pipe_resource_reference(&vec->ib.state.buffer, NULL);
+ memset(&vec->ib.state, 0, sizeof(vec->ib.state));
}
vec->dirty |= ILO_DIRTY_IB;
@@ -926,19 +1931,28 @@ ilo_create_stream_output_target(struct pipe_context *pipe,
unsigned buffer_offset,
unsigned buffer_size)
{
- struct pipe_stream_output_target *target;
+ const struct ilo_dev *dev = ilo_context(pipe)->dev;
+ struct ilo_stream_output_target *target;
+ struct ilo_state_sol_buffer_info info;
- target = MALLOC_STRUCT(pipe_stream_output_target);
+ target = CALLOC_STRUCT(ilo_stream_output_target);
assert(target);
- pipe_reference_init(&target->reference, 1);
- target->buffer = NULL;
- pipe_resource_reference(&target->buffer, res);
- target->context = pipe;
- target->buffer_offset = buffer_offset;
- target->buffer_size = buffer_size;
+ pipe_reference_init(&target->base.reference, 1);
+ pipe_resource_reference(&target->base.buffer, res);
+ target->base.context = pipe;
+ target->base.buffer_offset = buffer_offset;
+ target->base.buffer_size = buffer_size;
+
+ memset(&info, 0, sizeof(info));
+ info.buf = ilo_buffer(res);
+ info.offset = buffer_offset;
+ info.size = buffer_size;
- return target;
+ ilo_state_sol_buffer_init(&target->sb, dev, &info);
+ target->sb.bo = info.buf->bo;
+
+ return &target->base;
}
static void
@@ -991,7 +2005,7 @@ ilo_create_sampler_view(struct pipe_context *pipe,
const struct ilo_dev *dev = ilo_context(pipe)->dev;
struct ilo_view_cso *view;
- view = MALLOC_STRUCT(ilo_view_cso);
+ view = CALLOC_STRUCT(ilo_view_cso);
assert(view);
view->base = *templ;
@@ -1001,16 +2015,24 @@ ilo_create_sampler_view(struct pipe_context *pipe,
view->base.context = pipe;
if (res->target == PIPE_BUFFER) {
- const unsigned elem_size = util_format_get_blocksize(templ->format);
- const unsigned first_elem = templ->u.buf.first_element;
- const unsigned num_elems = templ->u.buf.last_element - first_elem + 1;
-
- ilo_gpe_init_view_surface_for_buffer(dev, ilo_buffer(res),
- first_elem * elem_size, num_elems * elem_size,
- elem_size, templ->format, false, false, &view->surface);
- }
- else {
+ struct ilo_state_surface_buffer_info info;
+
+ memset(&info, 0, sizeof(info));
+ info.buf = ilo_buffer(res);
+ info.access = ILO_STATE_SURFACE_ACCESS_SAMPLER;
+ info.format = ilo_format_translate_color(dev, templ->format);
+ info.format_size = util_format_get_blocksize(templ->format);
+ info.struct_size = info.format_size;
+ info.readonly = true;
+ info.offset = templ->u.buf.first_element * info.struct_size;
+ info.size = (templ->u.buf.last_element -
+ templ->u.buf.first_element + 1) * info.struct_size;
+
+ ilo_state_surface_init_for_buffer(&view->surface, dev, &info);
+ view->surface.bo = info.buf->bo;
+ } else {
struct ilo_texture *tex = ilo_texture(res);
+ struct ilo_state_surface_image_info info;
/* warn about degraded performance because of a missing binding flag */
if (tex->image.tiling == GEN6_TILING_NONE &&
@@ -1019,13 +2041,33 @@ ilo_create_sampler_view(struct pipe_context *pipe,
"not created for sampling\n");
}
- ilo_gpe_init_view_surface_for_image(dev, &tex->image,
- tex->base.target, templ->format,
- templ->u.tex.first_level,
- templ->u.tex.last_level - templ->u.tex.first_level + 1,
- templ->u.tex.first_layer,
- templ->u.tex.last_layer - templ->u.tex.first_layer + 1,
- false, &view->surface);
+ memset(&info, 0, sizeof(info));
+ info.img = &tex->image;
+
+ info.access = ILO_STATE_SURFACE_ACCESS_SAMPLER;
+
+ if (templ->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT &&
+ tex->image.separate_stencil) {
+ info.format = ilo_format_translate_texture(dev,
+ PIPE_FORMAT_Z32_FLOAT);
+ } else {
+ info.format = ilo_format_translate_texture(dev, templ->format);
+ }
+
+ info.is_cube_map = (tex->image.target == PIPE_TEXTURE_CUBE ||
+ tex->image.target == PIPE_TEXTURE_CUBE_ARRAY);
+ info.is_array = util_resource_is_array_texture(&tex->base);
+ info.readonly = true;
+
+ info.level_base = templ->u.tex.first_level;
+ info.level_count = templ->u.tex.last_level -
+ templ->u.tex.first_level + 1;
+ info.slice_base = templ->u.tex.first_layer;
+ info.slice_count = templ->u.tex.last_layer -
+ templ->u.tex.first_layer + 1;
+
+ ilo_state_surface_init_for_image(&view->surface, dev, &info);
+ view->surface.bo = info.img->bo;
}
return &view->base;
@@ -1048,7 +2090,7 @@ ilo_create_surface(struct pipe_context *pipe,
struct ilo_texture *tex = ilo_texture(res);
struct ilo_surface_cso *surf;
- surf = MALLOC_STRUCT(ilo_surface_cso);
+ surf = CALLOC_STRUCT(ilo_surface_cso);
assert(surf);
surf->base = *templ;
@@ -1063,28 +2105,56 @@ ilo_create_surface(struct pipe_context *pipe,
surf->is_rt = !util_format_is_depth_or_stencil(templ->format);
if (surf->is_rt) {
+ struct ilo_state_surface_image_info info;
+
/* relax this? */
assert(tex->base.target != PIPE_BUFFER);
- /*
- * classic i965 sets render_cache_rw for constant buffers and sol
- * surfaces but not render buffers. Why?
- */
- ilo_gpe_init_view_surface_for_image(dev,
- &tex->image, tex->base.target,
- templ->format, templ->u.tex.level, 1,
- templ->u.tex.first_layer,
- templ->u.tex.last_layer - templ->u.tex.first_layer + 1,
- true, &surf->u.rt);
+ memset(&info, 0, sizeof(info));
+ info.img = &tex->image;
+ info.access = ILO_STATE_SURFACE_ACCESS_DP_RENDER;
+ info.format = ilo_format_translate_render(dev, templ->format);
+ info.is_array = util_resource_is_array_texture(&tex->base);
+ info.level_base = templ->u.tex.level;
+ info.level_count = 1;
+ info.slice_base = templ->u.tex.first_layer;
+ info.slice_count = templ->u.tex.last_layer -
+ templ->u.tex.first_layer + 1;
+
+ ilo_state_surface_init_for_image(&surf->u.rt, dev, &info);
+ surf->u.rt.bo = info.img->bo;
} else {
+ struct ilo_state_zs_info info;
+
assert(res->target != PIPE_BUFFER);
- ilo_gpe_init_zs_surface(dev, &tex->image,
- (tex->separate_s8) ? &tex->separate_s8->image : NULL,
- tex->base.target, templ->format,
- templ->u.tex.level, templ->u.tex.first_layer,
- templ->u.tex.last_layer - templ->u.tex.first_layer + 1,
- &surf->u.zs);
+ memset(&info, 0, sizeof(info));
+
+ if (templ->format == PIPE_FORMAT_S8_UINT) {
+ info.s_img = &tex->image;
+ } else {
+ info.z_img = &tex->image;
+ info.s_img = (tex->separate_s8) ? &tex->separate_s8->image : NULL;
+
+ info.hiz_enable =
+ ilo_image_can_enable_aux(&tex->image, templ->u.tex.level);
+ }
+
+ info.level = templ->u.tex.level;
+ info.slice_base = templ->u.tex.first_layer;
+ info.slice_count = templ->u.tex.last_layer -
+ templ->u.tex.first_layer + 1;
+
+ ilo_state_zs_init(&surf->u.zs, dev, &info);
+
+ if (info.z_img) {
+ surf->u.zs.depth_bo = info.z_img->bo;
+ if (info.hiz_enable)
+ surf->u.zs.hiz_bo = info.z_img->aux.bo;
+ }
+
+ if (info.s_img)
+ surf->u.zs.stencil_bo = info.s_img->bo;
}
return &surf->base;
@@ -1294,10 +2364,30 @@ void
ilo_state_vector_init(const struct ilo_dev *dev,
struct ilo_state_vector *vec)
{
- ilo_gpe_set_scissor_null(dev, &vec->scissor);
+ struct ilo_state_urb_info urb_info;
- ilo_gpe_init_zs_surface(dev, NULL, NULL, PIPE_TEXTURE_2D,
- PIPE_FORMAT_NONE, 0, 0, 1, &vec->fb.null_zs);
+ vec->sample_mask = ~0u;
+
+ ilo_state_viewport_init_data_only(&vec->viewport.vp, dev,
+ vec->viewport.vp_data, sizeof(vec->viewport.vp_data));
+ assert(vec->viewport.vp.array_size >= ILO_MAX_VIEWPORTS);
+
+ vec->viewport.params.matrices = vec->viewport.matrices;
+ vec->viewport.params.scissors = vec->viewport.scissors;
+
+ ilo_state_hs_init_disabled(&vec->disabled_hs, dev);
+ ilo_state_ds_init_disabled(&vec->disabled_ds, dev);
+ ilo_state_gs_init_disabled(&vec->disabled_gs, dev);
+
+ ilo_state_sol_buffer_init_disabled(&vec->so.dummy_sb, dev);
+
+ ilo_state_surface_init_for_null(&vec->fb.null_rt, dev);
+ ilo_state_zs_init_for_null(&vec->fb.null_zs, dev);
+
+ ilo_state_sampler_init_disabled(&vec->disabled_sampler, dev);
+
+ memset(&urb_info, 0, sizeof(urb_info));
+ ilo_state_urb_init(&vec->urb, dev, &urb_info);
util_dynarray_init(&vec->global_binding.bindings);
@@ -1314,7 +2404,7 @@ ilo_state_vector_cleanup(struct ilo_state_vector *vec)
pipe_resource_reference(&vec->vb.states[i].buffer, NULL);
}
- pipe_resource_reference(&vec->ib.buffer, NULL);
+ pipe_resource_reference(&vec->ib.state.buffer, NULL);
pipe_resource_reference(&vec->ib.hw_resource, NULL);
for (i = 0; i < vec->so.count; i++)
@@ -1377,7 +2467,7 @@ ilo_state_vector_resource_renamed(struct ilo_state_vector *vec,
}
}
- if (vec->ib.buffer == res) {
+ if (vec->ib.state.buffer == res) {
states |= ILO_DIRTY_IB;
/*
@@ -1392,6 +2482,10 @@ ilo_state_vector_resource_renamed(struct ilo_state_vector *vec,
for (i = 0; i < vec->so.count; i++) {
if (vec->so.states[i]->buffer == res) {
+ struct ilo_stream_output_target *target =
+ (struct ilo_stream_output_target *) vec->so.states[i];
+
+ target->sb.bo = ilo_buffer(res)->bo;
states |= ILO_DIRTY_SO;
break;
}
@@ -1456,7 +2550,8 @@ ilo_state_vector_resource_renamed(struct ilo_state_vector *vec,
struct ilo_surface_cso *cso =
(struct ilo_surface_cso *) vec->fb.state.zsbuf;
- cso->u.rt.bo = bo;
+ cso->u.zs.depth_bo = bo;
+
states |= ILO_DIRTY_FB;
}
}
diff --git a/src/gallium/drivers/ilo/ilo_state.h b/src/gallium/drivers/ilo/ilo_state.h
index fd0a3156ebc..3e6fd8a2554 100644
--- a/src/gallium/drivers/ilo/ilo_state.h
+++ b/src/gallium/drivers/ilo/ilo_state.h
@@ -28,13 +28,38 @@
#ifndef ILO_STATE_H
#define ILO_STATE_H
-#include "core/ilo_state_3d.h"
+#include "core/ilo_builder_3d.h" /* for gen6_3dprimitive_info */
+#include "core/ilo_state_cc.h"
+#include "core/ilo_state_compute.h"
+#include "core/ilo_state_raster.h"
+#include "core/ilo_state_sampler.h"
+#include "core/ilo_state_sbe.h"
+#include "core/ilo_state_shader.h"
+#include "core/ilo_state_sol.h"
+#include "core/ilo_state_surface.h"
+#include "core/ilo_state_urb.h"
+#include "core/ilo_state_vf.h"
+#include "core/ilo_state_viewport.h"
+#include "core/ilo_state_zs.h"
#include "pipe/p_state.h"
#include "util/u_dynarray.h"
#include "ilo_common.h"
/**
+ * \see brw_context.h
+ */
+#define ILO_MAX_DRAW_BUFFERS 8
+#define ILO_MAX_CONST_BUFFERS (1 + 12)
+#define ILO_MAX_SAMPLER_VIEWS 16
+#define ILO_MAX_SAMPLERS 16
+#define ILO_MAX_SO_BINDINGS 64
+#define ILO_MAX_SO_BUFFERS 4
+#define ILO_MAX_VIEWPORTS 1
+
+#define ILO_MAX_SURFACES 256
+
+/**
* States that we track.
*
* XXX Do we want to count each sampler or vertex buffer as a state? If that
@@ -120,6 +145,172 @@ enum ilo_dirty_flags {
};
struct ilo_context;
+struct ilo_shader_state;
+
+struct ilo_ve_state {
+ unsigned vb_mapping[PIPE_MAX_ATTRIBS];
+ unsigned vb_count;
+
+ /* these are not valid until the state is finalized */
+ uint32_t vf_data[PIPE_MAX_ATTRIBS][4];
+ struct ilo_state_vf_params_info vf_params;
+ struct ilo_state_vf vf;
+};
+
+struct ilo_vb_state {
+ struct pipe_vertex_buffer states[PIPE_MAX_ATTRIBS];
+ struct ilo_state_vertex_buffer vb[PIPE_MAX_ATTRIBS];
+ uint32_t enabled_mask;
+};
+
+struct ilo_ib_state {
+ struct pipe_index_buffer state;
+
+ /* these are not valid until the state is finalized */
+ struct pipe_resource *hw_resource;
+ unsigned hw_index_size;
+ struct ilo_state_index_buffer ib;
+};
+
+struct ilo_cbuf_cso {
+ struct pipe_resource *resource;
+ struct ilo_state_surface_buffer_info info;
+ struct ilo_state_surface surface;
+
+ /*
+ * this CSO is not so constant because user buffer needs to be uploaded in
+ * finalize_constant_buffers()
+ */
+ const void *user_buffer;
+};
+
+struct ilo_sampler_cso {
+ struct ilo_state_sampler sampler;
+ struct ilo_state_sampler_border border;
+ bool saturate_s;
+ bool saturate_t;
+ bool saturate_r;
+};
+
+struct ilo_sampler_state {
+ const struct ilo_sampler_cso *cso[ILO_MAX_SAMPLERS];
+};
+
+struct ilo_cbuf_state {
+ struct ilo_cbuf_cso cso[ILO_MAX_CONST_BUFFERS];
+ uint32_t enabled_mask;
+};
+
+struct ilo_resource_state {
+ struct pipe_surface *states[PIPE_MAX_SHADER_RESOURCES];
+ unsigned count;
+};
+
+struct ilo_view_cso {
+ struct pipe_sampler_view base;
+
+ struct ilo_state_surface surface;
+};
+
+struct ilo_view_state {
+ struct pipe_sampler_view *states[ILO_MAX_SAMPLER_VIEWS];
+ unsigned count;
+};
+
+struct ilo_stream_output_target {
+ struct pipe_stream_output_target base;
+
+ struct ilo_state_sol_buffer sb;
+};
+
+struct ilo_so_state {
+ struct pipe_stream_output_target *states[ILO_MAX_SO_BUFFERS];
+ unsigned count;
+ unsigned append_bitmask;
+
+ struct ilo_state_sol_buffer dummy_sb;
+
+ bool enabled;
+};
+
+struct ilo_rasterizer_state {
+ struct pipe_rasterizer_state state;
+
+ /* these are invalid until finalize_rasterizer() */
+ struct ilo_state_raster_info info;
+ struct ilo_state_raster rs;
+};
+
+struct ilo_viewport_state {
+ struct ilo_state_viewport_matrix_info matrices[ILO_MAX_VIEWPORTS];
+ struct ilo_state_viewport_scissor_info scissors[ILO_MAX_VIEWPORTS];
+ struct ilo_state_viewport_params_info params;
+
+ struct pipe_viewport_state viewport0;
+ struct pipe_scissor_state scissor0;
+
+ struct ilo_state_viewport vp;
+ uint32_t vp_data[20 * ILO_MAX_VIEWPORTS];
+};
+
+struct ilo_surface_cso {
+ struct pipe_surface base;
+
+ bool is_rt;
+ union {
+ struct ilo_state_surface rt;
+ struct ilo_state_zs zs;
+ } u;
+};
+
+struct ilo_fb_state {
+ struct pipe_framebuffer_state state;
+
+ struct ilo_state_surface null_rt;
+ struct ilo_state_zs null_zs;
+
+ struct ilo_fb_blend_caps {
+ bool is_unorm;
+ bool is_integer;
+ bool force_dst_alpha_one;
+
+ bool can_logicop;
+ bool can_blend;
+ bool can_alpha_test;
+ } blend_caps[PIPE_MAX_COLOR_BUFS];
+
+ unsigned num_samples;
+
+ bool has_integer_rt;
+ bool has_hiz;
+ enum gen_depth_format depth_offset_format;
+};
+
+struct ilo_dsa_state {
+ struct ilo_state_cc_depth_info depth;
+
+ struct ilo_state_cc_stencil_info stencil;
+ struct {
+ uint8_t test_mask;
+ uint8_t write_mask;
+ } stencil_front, stencil_back;
+
+ bool alpha_test;
+ float alpha_ref;
+ enum gen_compare_function alpha_func;
+};
+
+struct ilo_blend_state {
+ struct ilo_state_cc_blend_rt_info rt[PIPE_MAX_COLOR_BUFS];
+ struct ilo_state_cc_blend_rt_info dummy_rt;
+ bool dual_blend;
+
+ /* these are invalid until finalize_blend() */
+ struct ilo_state_cc_blend_rt_info effective_rt[PIPE_MAX_COLOR_BUFS];
+ struct ilo_state_cc_info info;
+ struct ilo_state_cc cc;
+ bool alpha_may_kill;
+};
struct ilo_global_binding_cso {
struct pipe_resource *resource;
@@ -147,6 +338,7 @@ struct ilo_global_binding {
struct ilo_state_vector {
const struct pipe_draw_info *draw;
+ struct gen6_3dprimitive_info draw_info;
uint32_t dirty;
@@ -157,30 +349,41 @@ struct ilo_state_vector {
struct ilo_shader_state *vs;
struct ilo_shader_state *gs;
+ struct ilo_state_hs disabled_hs;
+ struct ilo_state_ds disabled_ds;
+ struct ilo_state_gs disabled_gs;
+
struct ilo_so_state so;
struct pipe_clip_state clip;
+
struct ilo_viewport_state viewport;
- struct ilo_scissor_state scissor;
- const struct ilo_rasterizer_state *rasterizer;
- struct pipe_poly_stipple poly_stipple;
+ struct ilo_rasterizer_state *rasterizer;
+
+ struct ilo_state_line_stipple line_stipple;
+ struct ilo_state_poly_stipple poly_stipple;
unsigned sample_mask;
struct ilo_shader_state *fs;
- const struct ilo_dsa_state *dsa;
+ struct ilo_state_cc_params_info cc_params;
struct pipe_stencil_ref stencil_ref;
- const struct ilo_blend_state *blend;
- struct pipe_blend_color blend_color;
+ const struct ilo_dsa_state *dsa;
+ struct ilo_blend_state *blend;
+
struct ilo_fb_state fb;
+ struct ilo_state_urb urb;
+
/* shader resources */
struct ilo_sampler_state sampler[PIPE_SHADER_TYPES];
struct ilo_view_state view[PIPE_SHADER_TYPES];
struct ilo_cbuf_state cbuf[PIPE_SHADER_TYPES];
struct ilo_resource_state resource;
+ struct ilo_state_sampler disabled_sampler;
+
/* GPGPU */
struct ilo_shader_state *cs;
struct ilo_resource_state cs_resource;
diff --git a/src/gallium/drivers/ilo/shader/ilo_shader_internal.h b/src/gallium/drivers/ilo/shader/ilo_shader_internal.h
index d2dc2f5b5b4..01c86675202 100644
--- a/src/gallium/drivers/ilo/shader/ilo_shader_internal.h
+++ b/src/gallium/drivers/ilo/shader/ilo_shader_internal.h
@@ -28,6 +28,9 @@
#ifndef ILO_SHADER_INTERNAL_H
#define ILO_SHADER_INTERNAL_H
+#include "core/ilo_state_sbe.h"
+#include "core/ilo_state_sol.h"
+
#include "ilo_common.h"
#include "ilo_state.h"
#include "ilo_shader.h"
@@ -72,13 +75,27 @@ struct ilo_shader_variant {
uint32_t saturate_tex_coords[3];
};
+struct ilo_kernel_routing {
+ bool initialized;
+
+ bool is_point;
+ bool light_twoside;
+ uint32_t sprite_coord_enable;
+ int sprite_coord_mode;
+ int src_len;
+ int src_semantics[PIPE_MAX_SHADER_OUTPUTS];
+ int src_indices[PIPE_MAX_SHADER_OUTPUTS];
+
+ struct ilo_state_sbe sbe;
+};
+
/**
* A compiled shader.
*/
struct ilo_shader {
struct ilo_shader_variant variant;
- struct ilo_shader_cso cso;
+ union ilo_shader_cso cso;
struct {
int semantic_names[PIPE_MAX_SHADER_INPUTS];
@@ -111,7 +128,9 @@ struct ilo_shader {
bool stream_output;
int svbi_post_inc;
- struct pipe_stream_output_info so_info;
+
+ uint32_t sol_data[PIPE_MAX_SO_OUTPUTS][2];
+ struct ilo_state_sol sol;
/* for VS stream output / rasterizer discard */
int gs_offsets[3];
@@ -121,11 +140,8 @@ struct ilo_shader {
void *kernel;
int kernel_size;
- bool routing_initialized;
- int routing_src_semantics[PIPE_MAX_SHADER_OUTPUTS];
- int routing_src_indices[PIPE_MAX_SHADER_OUTPUTS];
- uint32_t routing_sprite_coord_enable;
struct ilo_kernel_routing routing;
+ struct ilo_state_ps_params_info ps_params;
/* what does the push constant buffer consist of? */
struct {
diff --git a/src/gallium/drivers/ilo/shader/toy_tgsi.c b/src/gallium/drivers/ilo/shader/toy_tgsi.c
index 65e47bf3a4a..d38585f1475 100644
--- a/src/gallium/drivers/ilo/shader/toy_tgsi.c
+++ b/src/gallium/drivers/ilo/shader/toy_tgsi.c
@@ -2036,9 +2036,6 @@ parse_instruction(struct toy_tgsi *tgsi,
if (!dst_is_scratch[i])
continue;
- if (tgsi_inst->Instruction.Saturate == TGSI_SAT_MINUS_PLUS_ONE)
- tc_fail(tgsi->tc, "TGSI_SAT_MINUS_PLUS_ONE unhandled");
-
tgsi->tc->templ.saturate = tgsi_inst->Instruction.Saturate;
/* emit indirect store */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
index b6c32ffb979..b25e0413750 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -975,10 +975,6 @@ lp_build_depth_stencil_test(struct gallivm_state *gallivm,
s_bld.int_vec_type, "");
}
- /* convert scalar stencil refs into vectors */
- stencil_refs[0] = lp_build_broadcast_scalar(&s_bld, stencil_refs[0]);
- stencil_refs[1] = lp_build_broadcast_scalar(&s_bld, stencil_refs[1]);
-
s_pass_mask = lp_build_stencil_test(&s_bld, stencil,
stencil_refs, stencil_vals,
front_facing);
diff --git a/src/gallium/drivers/llvmpipe/lp_public.h b/src/gallium/drivers/llvmpipe/lp_public.h
index ec6b660b48e..27ab1baefbb 100644
--- a/src/gallium/drivers/llvmpipe/lp_public.h
+++ b/src/gallium/drivers/llvmpipe/lp_public.h
@@ -1,10 +1,18 @@
#ifndef LP_PUBLIC_H
#define LP_PUBLIC_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
struct pipe_screen;
struct sw_winsys;
struct pipe_screen *
llvmpipe_create_screen(struct sw_winsys *winsys);
+#ifdef __cplusplus
+}
+#endif
+
#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_query.c b/src/gallium/drivers/llvmpipe/lp_query.c
index 4f8bab62e7b..fc593670671 100644
--- a/src/gallium/drivers/llvmpipe/lp_query.c
+++ b/src/gallium/drivers/llvmpipe/lp_query.c
@@ -315,7 +315,7 @@ llvmpipe_check_render_cond(struct llvmpipe_context *lp)
b = pipe->get_query_result(pipe, lp->render_cond_query, wait, (void*)&result);
if (b)
- return (!result == lp->render_cond_cond);
+ return ((!result) == lp->render_cond_cond);
else
return TRUE;
}
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index f4ba596f358..47f1897c732 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -165,7 +165,7 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_DEPTH_CLIP_DISABLE:
return 1;
case PIPE_CAP_SHADER_STENCIL_EXPORT:
- return 0;
+ return 1;
case PIPE_CAP_TGSI_INSTANCEID:
case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR:
case PIPE_CAP_START_INSTANCE:
@@ -258,8 +258,9 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
return 1;
case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
- case PIPE_CAP_SAMPLER_VIEW_TARGET:
return 0;
+ case PIPE_CAP_SAMPLER_VIEW_TARGET:
+ return 1;
case PIPE_CAP_FAKE_SW_MSAA:
return 1;
case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
@@ -290,6 +291,7 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
return 1;
case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
+ case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
return 0;
}
/* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
index 96cc77c250c..4c8167a9e7d 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -854,9 +854,10 @@ lp_setup_set_fragment_sampler_views(struct lp_setup_context *setup,
jit_tex->img_stride[j] = lp_tex->img_stride[j];
}
- if (res->target == PIPE_TEXTURE_1D_ARRAY ||
- res->target == PIPE_TEXTURE_2D_ARRAY ||
- res->target == PIPE_TEXTURE_CUBE_ARRAY) {
+ if (view->target == PIPE_TEXTURE_1D_ARRAY ||
+ view->target == PIPE_TEXTURE_2D_ARRAY ||
+ view->target == PIPE_TEXTURE_CUBE ||
+ view->target == PIPE_TEXTURE_CUBE_ARRAY) {
/*
* For array textures, we don't have first_layer, instead
* adjust last_layer (stored as depth) plus the mip level offsets
@@ -868,7 +869,8 @@ lp_setup_set_fragment_sampler_views(struct lp_setup_context *setup,
jit_tex->mip_offsets[j] += view->u.tex.first_layer *
lp_tex->img_stride[j];
}
- if (res->target == PIPE_TEXTURE_CUBE_ARRAY) {
+ if (view->target == PIPE_TEXTURE_CUBE ||
+ view->target == PIPE_TEXTURE_CUBE_ARRAY) {
assert(jit_tex->depth % 6 == 0);
}
assert(view->u.tex.first_layer <= view->u.tex.last_layer);
@@ -1067,10 +1069,13 @@ try_update_scene_state( struct lp_setup_context *setup )
if (setup->dirty & LP_SETUP_NEW_CONSTANTS) {
for (i = 0; i < Elements(setup->constants); ++i) {
struct pipe_resource *buffer = setup->constants[i].current.buffer;
- const unsigned current_size = setup->constants[i].current.buffer_size;
+ const unsigned current_size = MIN2(setup->constants[i].current.buffer_size,
+ LP_MAX_TGSI_CONST_BUFFER_SIZE);
const ubyte *current_data = NULL;
int num_constants;
+ STATIC_ASSERT(DATA_BLOCK_SIZE >= LP_MAX_TGSI_CONST_BUFFER_SIZE);
+
if (buffer) {
/* resource buffer */
current_data = (ubyte *) llvmpipe_resource_data(buffer);
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index 35fe7b20181..b5ce8683f1a 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -260,7 +260,8 @@ generate_fs_loop(struct gallivm_state *gallivm,
{
const struct util_format_description *zs_format_desc = NULL;
const struct tgsi_token *tokens = shader->base.tokens;
- LLVMTypeRef vec_type;
+ struct lp_type int_type = lp_int_type(type);
+ LLVMTypeRef vec_type, int_vec_type;
LLVMValueRef mask_ptr, mask_val;
LLVMValueRef consts_ptr, num_consts_ptr;
LLVMValueRef z;
@@ -295,7 +296,7 @@ generate_fs_loop(struct gallivm_state *gallivm,
zs_format_desc = util_format_description(key->zsbuf_format);
assert(zs_format_desc);
- if (!shader->info.base.writes_z) {
+ if (!shader->info.base.writes_z && !shader->info.base.writes_stencil) {
if (key->alpha.enabled ||
key->blend.alpha_to_coverage ||
shader->info.base.uses_kill) {
@@ -329,11 +330,14 @@ generate_fs_loop(struct gallivm_state *gallivm,
depth_mode = 0;
}
+ vec_type = lp_build_vec_type(gallivm, type);
+ int_vec_type = lp_build_vec_type(gallivm, int_type);
stencil_refs[0] = lp_jit_context_stencil_ref_front_value(gallivm, context_ptr);
stencil_refs[1] = lp_jit_context_stencil_ref_back_value(gallivm, context_ptr);
-
- vec_type = lp_build_vec_type(gallivm, type);
+ /* convert scalar stencil refs into vectors */
+ stencil_refs[0] = lp_build_broadcast(gallivm, int_vec_type, stencil_refs[0]);
+ stencil_refs[1] = lp_build_broadcast(gallivm, int_vec_type, stencil_refs[1]);
consts_ptr = lp_jit_context_constants(gallivm, context_ptr);
num_consts_ptr = lp_jit_context_num_constants(gallivm, context_ptr);
@@ -462,7 +466,9 @@ generate_fs_loop(struct gallivm_state *gallivm,
int pos0 = find_output_by_semantic(&shader->info.base,
TGSI_SEMANTIC_POSITION,
0);
-
+ int s_out = find_output_by_semantic(&shader->info.base,
+ TGSI_SEMANTIC_STENCIL,
+ 0);
if (pos0 != -1 && outputs[pos0][2]) {
z = LLVMBuildLoad(builder, outputs[pos0][2], "output.z");
@@ -512,6 +518,15 @@ generate_fs_loop(struct gallivm_state *gallivm,
}
}
+ if (s_out != -1 && outputs[s_out][1]) {
+ /* there's only one value, and spec says to discard additional bits */
+ LLVMValueRef s_max_mask = lp_build_const_int_vec(gallivm, int_type, 255);
+ stencil_refs[0] = LLVMBuildLoad(builder, outputs[s_out][1], "output.s");
+ stencil_refs[0] = LLVMBuildBitCast(builder, stencil_refs[0], int_vec_type, "");
+ stencil_refs[0] = LLVMBuildAnd(builder, stencil_refs[0], s_max_mask, "");
+ stencil_refs[1] = stencil_refs[0];
+ }
+
lp_build_depth_stencil_load_swizzled(gallivm, type,
zs_format_desc, key->resource_1d,
depth_ptr, depth_stride,
diff --git a/src/gallium/drivers/llvmpipe/lp_state_sampler.c b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
index 21da6290574..b205f02fdba 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_sampler.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
@@ -170,6 +170,36 @@ llvmpipe_create_sampler_view(struct pipe_context *pipe,
view->texture = NULL;
pipe_resource_reference(&view->texture, texture);
view->context = pipe;
+
+#ifdef DEBUG
+ /*
+ * This is possibly too lenient, but the primary reason is just
+ * to catch state trackers which forget to initialize this, so
+ * it only catches clearly impossible view targets.
+ */
+ if (view->target != texture->target) {
+ if (view->target == PIPE_TEXTURE_1D)
+ assert(texture->target == PIPE_TEXTURE_1D_ARRAY);
+ else if (view->target == PIPE_TEXTURE_1D_ARRAY)
+ assert(texture->target == PIPE_TEXTURE_1D);
+ else if (view->target == PIPE_TEXTURE_2D)
+ assert(texture->target == PIPE_TEXTURE_2D_ARRAY ||
+ texture->target == PIPE_TEXTURE_CUBE ||
+ texture->target == PIPE_TEXTURE_CUBE_ARRAY);
+ else if (view->target == PIPE_TEXTURE_2D_ARRAY)
+ assert(texture->target == PIPE_TEXTURE_2D ||
+ texture->target == PIPE_TEXTURE_CUBE ||
+ texture->target == PIPE_TEXTURE_CUBE_ARRAY);
+ else if (view->target == PIPE_TEXTURE_CUBE)
+ assert(texture->target == PIPE_TEXTURE_CUBE_ARRAY ||
+ texture->target == PIPE_TEXTURE_2D_ARRAY);
+ else if (view->target == PIPE_TEXTURE_CUBE_ARRAY)
+ assert(texture->target == PIPE_TEXTURE_CUBE ||
+ texture->target == PIPE_TEXTURE_2D_ARRAY);
+ else
+ assert(0);
+ }
+#endif
}
return view;
@@ -245,15 +275,17 @@ prepare_shader_sampling(
row_stride[j] = lp_tex->row_stride[j];
img_stride[j] = lp_tex->img_stride[j];
}
- if (res->target == PIPE_TEXTURE_1D_ARRAY ||
- res->target == PIPE_TEXTURE_2D_ARRAY ||
- res->target == PIPE_TEXTURE_CUBE_ARRAY) {
+ if (view->target == PIPE_TEXTURE_1D_ARRAY ||
+ view->target == PIPE_TEXTURE_2D_ARRAY ||
+ view->target == PIPE_TEXTURE_CUBE ||
+ view->target == PIPE_TEXTURE_CUBE_ARRAY) {
num_layers = view->u.tex.last_layer - view->u.tex.first_layer + 1;
for (j = first_level; j <= last_level; j++) {
mip_offsets[j] += view->u.tex.first_layer *
lp_tex->img_stride[j];
}
- if (res->target == PIPE_TEXTURE_CUBE_ARRAY) {
+ if (view->target == PIPE_TEXTURE_CUBE ||
+ view->target == PIPE_TEXTURE_CUBE_ARRAY) {
assert(num_layers % 6 == 0);
}
assert(view->u.tex.first_layer <= view->u.tex.last_layer);
diff --git a/src/gallium/drivers/llvmpipe/lp_surface.c b/src/gallium/drivers/llvmpipe/lp_surface.c
index 08f968f7f0a..96f8ed82cd8 100644
--- a/src/gallium/drivers/llvmpipe/lp_surface.c
+++ b/src/gallium/drivers/llvmpipe/lp_surface.c
@@ -42,13 +42,6 @@ lp_resource_copy(struct pipe_context *pipe,
struct pipe_resource *src, unsigned src_level,
const struct pipe_box *src_box)
{
- struct llvmpipe_resource *src_tex = llvmpipe_resource(src);
- struct llvmpipe_resource *dst_tex = llvmpipe_resource(dst);
- const enum pipe_format format = src_tex->base.format;
- unsigned width = src_box->width;
- unsigned height = src_box->height;
- unsigned depth = src_box->depth;
-
llvmpipe_flush_resource(pipe,
dst, dst_level,
FALSE, /* read_only */
@@ -63,58 +56,8 @@ lp_resource_copy(struct pipe_context *pipe,
FALSE, /* do_not_block */
"blit src");
- /* Fallback for buffers. */
- if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
- util_resource_copy_region(pipe, dst, dst_level, dstx, dsty, dstz,
- src, src_level, src_box);
- return;
- }
-
- /*
- printf("surface copy from %u lvl %u to %u lvl %u: %u,%u,%u to %u,%u,%u %u x %u x %u\n",
- src_tex->id, src_level, dst_tex->id, dst_level,
- src_box->x, src_box->y, src_box->z, dstx, dsty, dstz,
- src_box->width, src_box->height, src_box->depth);
- */
-
- /* make sure display target resources (which cannot have levels/layers) are mapped */
- if (src_tex->dt)
- (void) llvmpipe_resource_map(src, src_level, 0, LP_TEX_USAGE_READ);
- if (dst_tex->dt)
- /*
- * Could set this to WRITE_ALL if complete dst is covered but it gets
- * ignored anyway.
- */
- (void) llvmpipe_resource_map(dst, dst_level, 0, LP_TEX_USAGE_READ_WRITE);
-
-
- /* copy */
- {
- const ubyte *src_linear_ptr
- = llvmpipe_get_texture_image_address(src_tex, src_box->z,
- src_level);
- ubyte *dst_linear_ptr
- = llvmpipe_get_texture_image_address(dst_tex, dstz,
- dst_level);
-
- if (dst_linear_ptr && src_linear_ptr) {
- util_copy_box(dst_linear_ptr, format,
- llvmpipe_resource_stride(&dst_tex->base, dst_level),
- dst_tex->img_stride[dst_level],
- dstx, dsty, 0,
- width, height, depth,
- src_linear_ptr,
- llvmpipe_resource_stride(&src_tex->base, src_level),
- src_tex->img_stride[src_level],
- src_box->x, src_box->y, 0);
- }
- }
-
- if (src_tex->dt)
- llvmpipe_resource_unmap(src, 0, 0);
- if (dst_tex->dt)
- llvmpipe_resource_unmap(dst, 0, 0);
-
+ util_resource_copy_region(pipe, dst, dst_level, dstx, dsty, dstz,
+ src, src_level, src_box);
}
@@ -139,11 +82,6 @@ static void lp_blit(struct pipe_context *pipe,
return; /* done */
}
- if (info.mask & PIPE_MASK_S) {
- debug_printf("llvmpipe: cannot blit stencil, skipping\n");
- info.mask &= ~PIPE_MASK_S;
- }
-
if (!util_blitter_is_blit_supported(lp->blitter, &info)) {
debug_printf("llvmpipe: blit unsupported %s -> %s\n",
util_format_short_name(info.src.resource->format),
diff --git a/src/gallium/drivers/nouveau/Android.mk b/src/gallium/drivers/nouveau/Android.mk
index 420c8e5734c..daf3abd1bb3 100644
--- a/src/gallium/drivers/nouveau/Android.mk
+++ b/src/gallium/drivers/nouveau/Android.mk
@@ -39,6 +39,10 @@ LOCAL_SRC_FILES := \
LOCAL_SHARED_LIBRARIES := libdrm libdrm_nouveau
LOCAL_MODULE := libmesa_pipe_nouveau
+ifeq ($(MESA_LOLLIPOP_BUILD),true)
+LOCAL_C_INCLUDES := external/libcxx/include
+else
include external/stlport/libstlport.mk
+endif
include $(GALLIUM_COMMON_MK)
include $(BUILD_STATIC_LIBRARY)
diff --git a/src/gallium/drivers/nouveau/Makefile.am b/src/gallium/drivers/nouveau/Makefile.am
index 0aefc031210..d05f0a17ab4 100644
--- a/src/gallium/drivers/nouveau/Makefile.am
+++ b/src/gallium/drivers/nouveau/Makefile.am
@@ -48,7 +48,7 @@ nouveau_compiler_SOURCES = \
nouveau_compiler_LDADD = \
libnouveau.la \
- ../../auxiliary/libgallium.la \
+ $(top_builddir)/src/gallium/auxiliary/libgallium.la \
$(top_builddir)/src/util/libmesautil.la \
$(GALLIUM_COMMON_LIB_DEPS)
diff --git a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm
index be17871edd4..b9c05a04b9a 100644
--- a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm
+++ b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm
@@ -11,7 +11,7 @@
// SIZE: 22 / 14 * 8 bytes
//
gk110_div_u32:
- sched 0x28282804280428
+ sched 0x28 0x04 0x28 0x04 0x28 0x28 0x28
bfind u32 $r2 $r1
xor b32 $r2 $r2 0x1f
mov b32 $r3 0x1
@@ -19,7 +19,7 @@ gk110_div_u32:
cvt u32 $r1 neg u32 $r1
mul $r3 u32 $r1 u32 $r2
add $r2 (mul high u32 $r2 u32 $r3) $r2
- sched 0x28282828282828
+ sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28
mul $r3 u32 $r1 u32 $r2
add $r2 (mul high u32 $r2 u32 $r3) $r2
mul $r3 u32 $r1 u32 $r2
@@ -27,7 +27,7 @@ gk110_div_u32:
mul $r3 u32 $r1 u32 $r2
add $r2 (mul high u32 $r2 u32 $r3) $r2
mul $r3 u32 $r1 u32 $r2
- sched 0x042c2828042804
+ sched 0x04 0x28 0x04 0x28 0x28 0x2c 0x04
add $r2 (mul high u32 $r2 u32 $r3) $r2
mov b32 $r3 $r0
mul high $r0 u32 $r0 u32 $r2
@@ -35,7 +35,7 @@ gk110_div_u32:
add $r1 (mul u32 $r1 u32 $r0) $r3
set $p0 0x1 ge u32 $r1 $r2
$p0 sub b32 $r1 $r1 $r2
- sched 0x20282e20042c28
+ sched 0x28 0x2c 0x04 0x20 0x2e 0x28 0x20
$p0 add b32 $r0 $r0 0x1
$p0 set $p0 0x1 ge u32 $r1 $r2
$p0 sub b32 $r1 $r1 $r2
@@ -51,7 +51,7 @@ gk110_div_u32:
gk110_div_s32:
set $p2 0x1 lt s32 $r0 0x0
set $p3 0x1 lt s32 $r1 0x0 xor $p2
- sched 0x28042804282820
+ sched 0x20 0x28 0x28 0x04 0x28 0x04 0x28
cvt s32 $r0 abs s32 $r0
cvt s32 $r1 abs s32 $r1
bfind u32 $r2 $r1
@@ -59,7 +59,7 @@ gk110_div_s32:
mov b32 $r3 0x1
shl b32 $r2 $r3 clamp $r2
cvt u32 $r1 neg u32 $r1
- sched 0x28282828282828
+ sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28
mul $r3 u32 $r1 u32 $r2
add $r2 (mul high u32 $r2 u32 $r3) $r2
mul $r3 u32 $r1 u32 $r2
@@ -67,7 +67,7 @@ gk110_div_s32:
mul $r3 u32 $r1 u32 $r2
add $r2 (mul high u32 $r2 u32 $r3) $r2
mul $r3 u32 $r1 u32 $r2
- sched 0x28280428042828
+ sched 0x28 0x28 0x04 0x28 0x04 0x28 0x28
add $r2 (mul high u32 $r2 u32 $r3) $r2
mul $r3 u32 $r1 u32 $r2
add $r2 (mul high u32 $r2 u32 $r3) $r2
@@ -75,7 +75,7 @@ gk110_div_s32:
mul high $r0 u32 $r0 u32 $r2
cvt u32 $r2 neg u32 $r1
add $r1 (mul u32 $r1 u32 $r0) $r3
- sched 0x2028042c28042c
+ sched 0x2c 0x04 0x28 0x2c 0x04 0x28 0x20
set $p0 0x1 ge u32 $r1 $r2
$p0 sub b32 $r1 $r1 $r2
$p0 add b32 $r0 $r0 0x1
@@ -83,7 +83,7 @@ gk110_div_s32:
$p0 sub b32 $r1 $r1 $r2
$p0 add b32 $r0 $r0 0x1
$p3 cvt s32 $r0 neg s32 $r0
- sched 0x2c200428042e04
+ sched 0x04 0x2e 0x04 0x28 0x04 0x20 0x2c
$p2 cvt s32 $r1 neg s32 $r1
ret
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
index 6bb9620d5f7..ab8bf2e5504 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
@@ -967,8 +967,8 @@ CodeEmitterGK110::emitSET(const CmpInstruction *i)
code[0] = (code[0] & ~0xfc) | ((code[0] << 3) & 0xe0);
if (i->defExists(1))
defId(i->def(1), 2);
- else
- code[0] |= 0x1c;
+ else
+ code[0] |= 0x1c;
} else {
switch (i->sType) {
case TYPE_F32: op2 = 0x000; op1 = 0x800; break;
@@ -990,8 +990,12 @@ CodeEmitterGK110::emitSET(const CmpInstruction *i)
}
FTZ_(3a);
- if (i->dType == TYPE_F32)
- code[1] |= 1 << 23;
+ if (i->dType == TYPE_F32) {
+ if (isFloatType(i->sType))
+ code[1] |= 1 << 23;
+ else
+ code[1] |= 1 << 15;
+ }
}
if (i->sType == TYPE_S32)
code[1] |= 1 << 19;
@@ -1316,6 +1320,8 @@ CodeEmitterGK110::emitFlow(const Instruction *i)
} else
if (mask & 2) {
int32_t pcRel = f->target.bb->binPos - (codeSize + 8);
+ if (writeIssueDelays && !(f->target.bb->binPos & 0x3f))
+ pcRel += 8;
// currently we don't want absolute branches
assert(!f->absolute);
code[0] |= (pcRel & 0x1ff) << 23;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index 22db368b371..399a6f1db13 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -509,10 +509,13 @@ CodeEmitterGM107::emitBRA()
emitCond5(0x00, CC_TR);
if (!insn->srcExists(0) || insn->src(0).getFile() != FILE_MEMORY_CONST) {
+ int32_t pos = insn->target.bb->binPos;
+ if (writeIssueDelays && !(pos & 0x1f))
+ pos += 8;
if (!insn->absolute)
- emitField(0x14, 24, insn->target.bb->binPos - (codeSize + 8));
+ emitField(0x14, 24, pos - (codeSize + 8));
else
- emitField(0x14, 32, insn->target.bb->binPos);
+ emitField(0x14, 32, pos);
} else {
emitCBUF (0x24, gpr, 20, 16, 0, insn->src(0));
emitField(0x05, 1, 1);
@@ -1827,6 +1830,7 @@ CodeEmitterGM107::emitISET()
emitCond3(0x31, insn->setCond);
emitField(0x30, 1, isSignedType(insn->sType));
emitCC (0x2f);
+ emitField(0x2c, 1, insn->dType == TYPE_F32);
emitX (0x2b);
emitGPR (0x08, insn->src(0));
emitGPR (0x00, insn->def(0));
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
index d9aed34a0ce..472e3a84119 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
@@ -1078,8 +1078,14 @@ CodeEmitterNVC0::emitSET(const CmpInstruction *i)
if (!isFloatType(i->sType))
lo = 0x3;
- if (isFloatType(i->dType) || isSignedIntType(i->sType))
+ if (isSignedIntType(i->sType))
lo |= 0x20;
+ if (isFloatType(i->dType)) {
+ if (isFloatType(i->sType))
+ lo |= 0x20;
+ else
+ lo |= 0x80;
+ }
switch (i->op) {
case OP_SET_AND: hi = 0x10000000; break;
@@ -1406,6 +1412,8 @@ CodeEmitterNVC0::emitFlow(const Instruction *i)
} else
if (mask & 2) {
int32_t pcRel = f->target.bb->binPos - (codeSize + 8);
+ if (writeIssueDelays && !(f->target.bb->binPos & 0x3f))
+ pcRel += 8;
// currently we don't want absolute branches
assert(!f->absolute);
code[0] |= (pcRel & 0x3f) << 26;
@@ -2712,7 +2720,6 @@ private:
RegScores *score; // for current BB
std::vector<RegScores> scoreBoards;
- int cycle;
int prevData;
operation prevOp;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 254629f907a..ecd115f9807 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -1316,7 +1316,7 @@ private:
};
private:
- const struct tgsi::Source *code;
+ const tgsi::Source *code;
const struct nv50_ir_prog_info *info;
struct {
@@ -1356,18 +1356,20 @@ Converter::srcToSym(tgsi::Instruction::SrcRegister src, int c)
{
const int swz = src.getSwizzle(c);
+ /* TODO: Use Array ID when it's available for the index */
return makeSym(src.getFile(),
src.is2D() ? src.getIndex(1) : 0,
- src.isIndirect(0) ? -1 : src.getIndex(0), swz,
+ src.getIndex(0), swz,
src.getIndex(0) * 16 + swz * 4);
}
Symbol *
Converter::dstToSym(tgsi::Instruction::DstRegister dst, int c)
{
+ /* TODO: Use Array ID when it's available for the index */
return makeSym(dst.getFile(),
dst.is2D() ? dst.getIndex(1) : 0,
- dst.isIndirect(0) ? -1 : dst.getIndex(0), c,
+ dst.getIndex(0), c,
dst.getIndex(0) * 16 + c * 4);
}
@@ -1604,19 +1606,8 @@ Converter::storeDst(int d, int c, Value *val)
{
const tgsi::Instruction::DstRegister dst = tgsi.getDst(d);
- switch (tgsi.getSaturate()) {
- case TGSI_SAT_NONE:
- break;
- case TGSI_SAT_ZERO_ONE:
+ if (tgsi.getSaturate()) {
mkOp1(OP_SAT, dstTy, val, val);
- break;
- case TGSI_SAT_MINUS_PLUS_ONE:
- mkOp2(OP_MAX, dstTy, val, val, mkImm(-1.0f));
- mkOp2(OP_MIN, dstTy, val, val, mkImm(+1.0f));
- break;
- default:
- assert(!"invalid saturation mode");
- break;
}
Value *ptr = NULL;
@@ -1955,13 +1946,13 @@ isResourceSpecial(const int r)
}
static inline bool
-isResourceRaw(const struct tgsi::Source *code, const int r)
+isResourceRaw(const tgsi::Source *code, const int r)
{
return isResourceSpecial(r) || code->resources[r].raw;
}
static inline nv50_ir::TexTarget
-getResourceTarget(const struct tgsi::Source *code, int r)
+getResourceTarget(const tgsi::Source *code, int r)
{
if (isResourceSpecial(r))
return nv50_ir::TEX_TARGET_BUFFER;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
index 64989ac8846..596ac95d489 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
@@ -240,6 +240,7 @@ GM107LoweringPass::visit(Instruction *i)
Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
i->getIndirect(0, 0), bld.mkImm(4));
i->setIndirect(0, 0, ptr);
+ i->op = OP_VFETCH;
} else {
i->op = OP_VFETCH;
assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
index 1ad086094dc..2c7f7e326b2 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
@@ -887,7 +887,7 @@ NV50LoweringPreSSA::handleTXL(TexInstruction *i)
}
}
bld.setPosition(joinBB, false);
- bld.mkOp(OP_JOIN, TYPE_NONE, NULL);
+ bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
return true;
}
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index b61f3c49bb9..7a5d1ce0299 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -100,8 +100,7 @@ void
NVC0LegalizeSSA::handleFTZ(Instruction *i)
{
// Only want to flush float inputs
- if (i->sType != TYPE_F32)
- return;
+ assert(i->sType == TYPE_F32);
// If we're already flushing denorms (and NaN's) to zero, no need for this.
if (i->dnz)
@@ -129,7 +128,7 @@ NVC0LegalizeSSA::visit(BasicBlock *bb)
Instruction *next;
for (Instruction *i = bb->getEntry(); i; i = next) {
next = i->next;
- if (i->dType == TYPE_F32) {
+ if (i->sType == TYPE_F32) {
if (prog->getType() != Program::TYPE_COMPUTE)
handleFTZ(i);
continue;
@@ -169,7 +168,7 @@ NVC0LegalizePostRA::insnDominatedBy(const Instruction *later,
void
NVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses,
- Instruction *usei, const Instruction *insn)
+ Instruction *usei, const Instruction *texi)
{
bool add = true;
for (std::list<TexUse>::iterator it = uses.begin();
@@ -184,7 +183,7 @@ NVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses,
++it;
}
if (add)
- uses.push_back(TexUse(usei, insn));
+ uses.push_back(TexUse(usei, texi));
}
void
@@ -196,7 +195,8 @@ NVC0LegalizePostRA::findOverwritingDefs(const Instruction *texi,
while (insn->op == OP_MOV && insn->getDef(0)->equals(insn->getSrc(0)))
insn = insn->getSrc(0)->getUniqueInsn();
- if (!insn->bb->reachableBy(texi->bb, term))
+ // NOTE: the tex itself is, of course, not an overwriting definition
+ if (insn == texi || !insn->bb->reachableBy(texi->bb, term))
return;
switch (insn->op) {
@@ -244,7 +244,12 @@ NVC0LegalizePostRA::findFirstUses(
visited.insert(usei);
if (usei->op == OP_PHI || usei->op == OP_UNION) {
- // need a barrier before WAW cases
+ // need a barrier before WAW cases, like:
+ // %r0 = tex
+ // if ...
+ // texbar <- is required or tex might replace x again
+ // %r1 = x <- overwriting def
+ // %r2 = phi %r0, %r1
for (int s = 0; usei->srcExists(s); ++s) {
Instruction *defi = usei->getSrc(s)->getUniqueInsn();
if (defi && &usei->src(s) != *u)
@@ -263,7 +268,7 @@ NVC0LegalizePostRA::findFirstUses(
usei->subOp != NV50_IR_SUBOP_MOV_FINAL) {
findFirstUses(texi, usei, uses, visited);
} else {
- addTexUse(uses, usei, insn);
+ addTexUse(uses, usei, texi);
}
}
}
@@ -1751,6 +1756,7 @@ NVC0LoweringPass::visit(Instruction *i)
Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
i->getIndirect(0, 0), bld.mkImm(4));
i->setIndirect(0, 0, ptr);
+ i->op = OP_VFETCH;
} else {
i->op = OP_VFETCH;
assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index 14446b6b53f..ae739eeda83 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -236,6 +236,9 @@ LoadPropagation::visit(BasicBlock *bb)
if (i->op == OP_CALL) // calls have args as sources, they must be in regs
continue;
+ if (i->op == OP_PFETCH) // pfetch expects arg1 to be a reg
+ continue;
+
if (i->srcExists(1))
checkSwapSrc01(i);
@@ -278,7 +281,6 @@ private:
void tryCollapseChainedMULs(Instruction *, const int s, ImmediateValue&);
- // TGSI 'true' is converted to -1 by F2I(NEG(SET)), track back to SET
CmpInstruction *findOriginForTestWithZero(Value *);
unsigned int foldCount;
@@ -337,25 +339,33 @@ ConstantFolding::findOriginForTestWithZero(Value *value)
return NULL;
Instruction *insn = value->getInsn();
- while (insn && insn->op != OP_SET) {
- Instruction *next = NULL;
- switch (insn->op) {
- case OP_NEG:
- case OP_ABS:
- case OP_CVT:
- next = insn->getSrc(0)->getInsn();
- if (insn->sType != next->dType)
+ if (insn->asCmp() && insn->op != OP_SLCT)
+ return insn->asCmp();
+
+ /* Sometimes mov's will sneak in as a result of other folding. This gets
+ * cleaned up later.
+ */
+ if (insn->op == OP_MOV)
+ return findOriginForTestWithZero(insn->getSrc(0));
+
+ /* Deal with AND 1.0 here since nv50 can't fold into boolean float */
+ if (insn->op == OP_AND) {
+ int s = 0;
+ ImmediateValue imm;
+ if (!insn->src(s).getImmediate(imm)) {
+ s = 1;
+ if (!insn->src(s).getImmediate(imm))
return NULL;
- break;
- case OP_MOV:
- next = insn->getSrc(0)->getInsn();
- break;
- default:
- return NULL;
}
- insn = next;
+ if (imm.reg.data.f32 != 1.0f)
+ return NULL;
+ /* TODO: Come up with a way to handle the condition being inverted */
+ if (insn->src(!s).mod != Modifier(0))
+ return NULL;
+ return findOriginForTestWithZero(insn->getSrc(!s));
}
- return insn ? insn->asCmp() : NULL;
+
+ return NULL;
}
void
@@ -574,6 +584,11 @@ ConstantFolding::expr(Instruction *i,
case OP_POPCNT:
res.data.u32 = util_bitcount(a->data.u32 & b->data.u32);
break;
+ case OP_PFETCH:
+ // The two arguments to pfetch are logically added together. Normally
+ // the second argument will not be constant, but that can happen.
+ res.data.u32 = a->data.u32 + b->data.u32;
+ break;
default:
return;
}
@@ -588,7 +603,9 @@ ConstantFolding::expr(Instruction *i,
i->getSrc(0)->reg.data = res.data;
- if (i->op == OP_MAD || i->op == OP_FMA) {
+ switch (i->op) {
+ case OP_MAD:
+ case OP_FMA: {
i->op = OP_ADD;
i->setSrc(1, i->getSrc(0));
@@ -603,8 +620,14 @@ ConstantFolding::expr(Instruction *i,
bld.setPosition(i, false);
i->setSrc(1, bld.loadImm(NULL, res.data.u32));
}
- } else {
+ break;
+ }
+ case OP_PFETCH:
+ // Leave PFETCH alone... we just folded its 2 args into 1.
+ break;
+ default:
i->op = i->saturate ? OP_SAT : OP_MOV; /* SAT handled by unary() */
+ break;
}
i->subOp = 0;
}
@@ -946,33 +969,82 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
case OP_SET: // TODO: SET_AND,OR,XOR
{
+ /* This optimizes the case where the output of a set is being compared
+ * to zero. Since the set can only produce 0/-1 (int) or 0/1 (float), we
+ * can be a lot cleverer in our comparison.
+ */
CmpInstruction *si = findOriginForTestWithZero(i->getSrc(t));
CondCode cc, ccZ;
- if (i->src(t).mod != Modifier(0))
- return;
- if (imm0.reg.data.u32 != 0 || !si || si->op != OP_SET)
+ if (imm0.reg.data.u32 != 0 || !si)
return;
cc = si->setCond;
ccZ = (CondCode)((unsigned int)i->asCmp()->setCond & ~CC_U);
+ // We do everything assuming var (cmp) 0, reverse the condition if 0 is
+ // first.
if (s == 0)
ccZ = reverseCondCode(ccZ);
+ // If there is a negative modifier, we need to undo that, by flipping
+ // the comparison to zero.
+ if (i->src(t).mod.neg())
+ ccZ = reverseCondCode(ccZ);
+ // If this is a signed comparison, we expect the input to be a regular
+ // boolean, i.e. 0/-1. However the rest of the logic assumes that true
+ // is positive, so just flip the sign.
+ if (i->sType == TYPE_S32) {
+ assert(!isFloatType(si->dType));
+ ccZ = reverseCondCode(ccZ);
+ }
switch (ccZ) {
- case CC_LT: cc = CC_FL; break;
- case CC_GE: cc = CC_TR; break;
- case CC_EQ: cc = inverseCondCode(cc); break;
- case CC_LE: cc = inverseCondCode(cc); break;
- case CC_GT: break;
- case CC_NE: break;
+ case CC_LT: cc = CC_FL; break; // bool < 0 -- this is never true
+ case CC_GE: cc = CC_TR; break; // bool >= 0 -- this is always true
+ case CC_EQ: cc = inverseCondCode(cc); break; // bool == 0 -- !bool
+ case CC_LE: cc = inverseCondCode(cc); break; // bool <= 0 -- !bool
+ case CC_GT: break; // bool > 0 -- bool
+ case CC_NE: break; // bool != 0 -- bool
default:
return;
}
+
+ // Update the condition of this SET to be identical to the origin set,
+ // but with the updated condition code. The original SET should get
+ // DCE'd, ideally.
+ i->op = si->op;
i->asCmp()->setCond = cc;
i->setSrc(0, si->src(0));
i->setSrc(1, si->src(1));
+ if (si->srcExists(2))
+ i->setSrc(2, si->src(2));
i->sType = si->sType;
}
break;
+ case OP_AND:
+ {
+ CmpInstruction *cmp = i->getSrc(t)->getInsn()->asCmp();
+ if (!cmp || cmp->op == OP_SLCT || cmp->getDef(0)->refCount() > 1)
+ return;
+ if (!prog->getTarget()->isOpSupported(cmp->op, TYPE_F32))
+ return;
+ if (imm0.reg.data.f32 != 1.0)
+ return;
+ if (i->getSrc(t)->getInsn()->dType != TYPE_U32)
+ return;
+
+ i->getSrc(t)->getInsn()->dType = TYPE_F32;
+ if (i->src(t).mod != Modifier(0)) {
+ assert(i->src(t).mod == Modifier(NV50_IR_MOD_NOT));
+ i->src(t).mod = Modifier(0);
+ cmp->setCond = inverseCondCode(cmp->setCond);
+ }
+ i->op = OP_MOV;
+ i->setSrc(s, NULL);
+ if (t) {
+ i->setSrc(0, i->getSrc(t));
+ i->setSrc(t, NULL);
+ }
+ }
+ break;
+
case OP_SHL:
{
if (s != 1 || i->src(0).mod != Modifier(0))
@@ -2216,7 +2288,7 @@ FlatteningPass::visit(BasicBlock *bb)
insn->op != OP_LINTERP && // probably just nve4
insn->op != OP_PINTERP && // probably just nve4
((insn->op != OP_LOAD && insn->op != OP_STORE) ||
- typeSizeof(insn->dType) <= 4) &&
+ (typeSizeof(insn->dType) <= 4 && !insn->src(0).isIndirect(0))) &&
!insn->isNop()) {
insn->join = 1;
bb->remove(bb->getExit());
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
index 178a1671c3f..ca545a6024a 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
@@ -84,7 +84,7 @@ static const struct opProperties _initProps[] =
// neg abs not sat c[] s[], a[], imm
{ OP_ADD, 0x3, 0x0, 0x0, 0x8, 0x2, 0x1, 0x1, 0x2 },
{ OP_SUB, 0x3, 0x0, 0x0, 0x8, 0x2, 0x1, 0x1, 0x2 },
- { OP_MUL, 0x3, 0x0, 0x0, 0x8, 0x2, 0x1, 0x1, 0x2 },
+ { OP_MUL, 0x3, 0x0, 0x0, 0x0, 0x2, 0x1, 0x1, 0x2 },
{ OP_MAX, 0x3, 0x3, 0x0, 0x0, 0x2, 0x1, 0x1, 0x0 },
{ OP_MIN, 0x3, 0x3, 0x0, 0x0, 0x2, 0x1, 0x1, 0x0 },
{ OP_MAD, 0x7, 0x0, 0x0, 0x8, 0x6, 0x1, 0x1, 0x0 }, // special constraint
@@ -188,6 +188,9 @@ void TargetNV50::initOpInfo()
if (prop->mSat & 8)
opInfo[prop->op].dstMods = NV50_IR_MOD_SAT;
}
+
+ if (chipset >= 0xa0)
+ opInfo[OP_MUL].dstMods = NV50_IR_MOD_SAT;
}
unsigned int
@@ -413,6 +416,8 @@ TargetNV50::isOpSupported(operation op, DataType ty) const
return false;
case OP_SAD:
return ty == TYPE_S32;
+ case OP_SET:
+ return !isFloatType(ty);
default:
return true;
}
diff --git a/src/gallium/drivers/nouveau/nouveau_buffer.c b/src/gallium/drivers/nouveau/nouveau_buffer.c
index 32fa65c8a51..09cdbb53ecb 100644
--- a/src/gallium/drivers/nouveau/nouveau_buffer.c
+++ b/src/gallium/drivers/nouveau/nouveau_buffer.c
@@ -658,13 +658,13 @@ nouveau_buffer_create(struct pipe_screen *pscreen,
switch (buffer->base.usage) {
case PIPE_USAGE_DEFAULT:
case PIPE_USAGE_IMMUTABLE:
- buffer->domain = NOUVEAU_BO_VRAM;
+ buffer->domain = NV_VRAM_DOMAIN(screen);
break;
case PIPE_USAGE_DYNAMIC:
/* For most apps, we'd have to do staging transfers to avoid sync
* with this usage, and GART -> GART copies would be suboptimal.
*/
- buffer->domain = NOUVEAU_BO_VRAM;
+ buffer->domain = NV_VRAM_DOMAIN(screen);
break;
case PIPE_USAGE_STAGING:
case PIPE_USAGE_STREAM:
@@ -676,7 +676,7 @@ nouveau_buffer_create(struct pipe_screen *pscreen,
}
} else {
if (buffer->base.bind & screen->vidmem_bindings)
- buffer->domain = NOUVEAU_BO_VRAM;
+ buffer->domain = NV_VRAM_DOMAIN(screen);
else
if (buffer->base.bind & screen->sysmem_bindings)
buffer->domain = NOUVEAU_BO_GART;
diff --git a/src/gallium/drivers/nouveau/nouveau_heap.h b/src/gallium/drivers/nouveau/nouveau_heap.h
index d0b22844ad0..a3d64a65623 100644
--- a/src/gallium/drivers/nouveau/nouveau_heap.h
+++ b/src/gallium/drivers/nouveau/nouveau_heap.h
@@ -23,6 +23,26 @@
#ifndef __NOUVEAU_HEAP_H__
#define __NOUVEAU_HEAP_H__
+/* This datastructure represents a memory allocation heap. Fundamentally, this
+ * is a doubly-linked list with a few properties, and a usage convention.
+ *
+ * On initial allocation, there is a single node with the full size that's
+ * marked as not in-use. As allocations are made, blocks are taken off the end
+ * of that first node, and inserted right after it. If the first node doesn't
+ * have enough free space, we look for free space down in the rest of the
+ * list. This can happen if an allocation is made and then freed.
+ *
+ * The first node will remain with in_use == 0 even if the whole heap is
+ * exhausted. Another invariant is that there will never be two sequential
+ * in_use == 0 nodes. If a node is freed and it has one (or both) adjacent
+ * free nodes, they are merged into one, and the relevant heap entries are
+ * freed.
+ *
+ * The pattern to free the whole heap is to start with the first node and then
+ * just free the "next" node, until there is no next node. This should assure
+ * that at the end the first (and only) node is not in use and contains the
+ * full size of the heap.
+ */
struct nouveau_heap {
struct nouveau_heap *prev;
struct nouveau_heap *next;
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.c b/src/gallium/drivers/nouveau/nouveau_screen.c
index b4f1413fd8b..c6e5074db19 100644
--- a/src/gallium/drivers/nouveau/nouveau_screen.c
+++ b/src/gallium/drivers/nouveau/nouveau_screen.c
@@ -164,6 +164,16 @@ nouveau_screen_init(struct nouveau_screen *screen, struct nouveau_device *dev)
size = sizeof(nvc0_data);
}
+ /*
+ * Set default VRAM domain if not overridden
+ */
+ if (!screen->vram_domain) {
+ if (dev->vram_size > 0)
+ screen->vram_domain = NOUVEAU_BO_VRAM;
+ else
+ screen->vram_domain = NOUVEAU_BO_GART;
+ }
+
ret = nouveau_object_new(&dev->object, 0, NOUVEAU_FIFO_CHANNEL_CLASS,
data, size, &screen->channel);
if (ret)
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.h b/src/gallium/drivers/nouveau/nouveau_screen.h
index cf06f7e88aa..30041b271c9 100644
--- a/src/gallium/drivers/nouveau/nouveau_screen.h
+++ b/src/gallium/drivers/nouveau/nouveau_screen.h
@@ -51,6 +51,8 @@ struct nouveau_screen {
boolean hint_buf_keep_sysmem_copy;
+ unsigned vram_domain;
+
struct {
unsigned profiles_checked;
unsigned profiles_present;
@@ -94,6 +96,8 @@ struct nouveau_screen {
#endif
};
+#define NV_VRAM_DOMAIN(screen) ((screen)->vram_domain)
+
#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
# define NOUVEAU_DRV_STAT(s, n, v) do { \
(s)->stats.named.n += (v); \
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_clear.c b/src/gallium/drivers/nouveau/nv30/nv30_clear.c
index 1ab8929cc38..83fd1fa38dd 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_clear.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_clear.c
@@ -58,7 +58,7 @@ nv30_clear(struct pipe_context *pipe, unsigned buffers,
struct pipe_framebuffer_state *fb = &nv30->framebuffer;
uint32_t colr = 0, zeta = 0, mode = 0;
- if (!nv30_state_validate(nv30, TRUE))
+ if (!nv30_state_validate(nv30, NV30_NEW_FRAMEBUFFER | NV30_NEW_SCISSOR, TRUE))
return;
if (buffers & PIPE_CLEAR_COLOR && fb->nr_cbufs) {
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_context.h b/src/gallium/drivers/nouveau/nv30/nv30_context.h
index 7b32aaee936..592cdbe24f9 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_context.h
+++ b/src/gallium/drivers/nouveau/nv30/nv30_context.h
@@ -204,7 +204,7 @@ void
nv30_render_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info);
boolean
-nv30_state_validate(struct nv30_context *nv30, boolean hwtnl);
+nv30_state_validate(struct nv30_context *nv30, uint32_t mask, boolean hwtnl);
void
nv30_state_release(struct nv30_context *nv30);
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_draw.c b/src/gallium/drivers/nouveau/nv30/nv30_draw.c
index 3575c3d29fa..c1665b7ad2f 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_draw.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_draw.c
@@ -71,12 +71,12 @@ nv30_render_allocate_vertices(struct vbuf_render *render,
struct nv30_render *r = nv30_render(render);
struct nv30_context *nv30 = r->nv30;
- r->length = vertex_size * nr_vertices;
+ r->length = (uint32_t)vertex_size * (uint32_t)nr_vertices;
if (r->offset + r->length >= render->max_vertex_buffer_bytes) {
pipe_resource_reference(&r->buffer, NULL);
r->buffer = pipe_buffer_create(&nv30->screen->base.base,
- PIPE_BIND_VERTEX_BUFFER, 0,
+ PIPE_BIND_VERTEX_BUFFER, PIPE_USAGE_STREAM,
render->max_vertex_buffer_bytes);
if (!r->buffer)
return FALSE;
@@ -91,10 +91,14 @@ static void *
nv30_render_map_vertices(struct vbuf_render *render)
{
struct nv30_render *r = nv30_render(render);
- char *map = pipe_buffer_map(&r->nv30->base.pipe, r->buffer,
- PIPE_TRANSFER_WRITE |
- PIPE_TRANSFER_UNSYNCHRONIZED, &r->transfer);
- return map + r->offset;
+ char *map = pipe_buffer_map_range(
+ &r->nv30->base.pipe, r->buffer,
+ r->offset, r->length,
+ PIPE_TRANSFER_WRITE |
+ PIPE_TRANSFER_DISCARD_RANGE,
+ &r->transfer);
+ assert(map);
+ return map;
}
static void
@@ -103,6 +107,7 @@ nv30_render_unmap_vertices(struct vbuf_render *render,
{
struct nv30_render *r = nv30_render(render);
pipe_buffer_unmap(&r->nv30->base.pipe, r->transfer);
+ r->transfer = NULL;
}
static void
@@ -126,10 +131,10 @@ nv30_render_draw_elements(struct vbuf_render *render,
for (i = 0; i < r->vertex_info.num_attribs; i++) {
PUSH_RESRC(push, NV30_3D(VTXBUF(i)), BUFCTX_VTXTMP,
nv04_resource(r->buffer), r->offset + r->vtxptr[i],
- NOUVEAU_BO_LOW | NOUVEAU_BO_RD, 0, 0);
+ NOUVEAU_BO_LOW | NOUVEAU_BO_RD, 0, NV30_3D_VTXBUF_DMA1);
}
- if (!nv30_state_validate(nv30, FALSE))
+ if (!nv30_state_validate(nv30, ~0, FALSE))
return;
BEGIN_NV04(push, NV30_3D(VERTEX_BEGIN_END), 1);
@@ -171,10 +176,10 @@ nv30_render_draw_arrays(struct vbuf_render *render, unsigned start, uint nr)
for (i = 0; i < r->vertex_info.num_attribs; i++) {
PUSH_RESRC(push, NV30_3D(VTXBUF(i)), BUFCTX_VTXTMP,
nv04_resource(r->buffer), r->offset + r->vtxptr[i],
- NOUVEAU_BO_LOW | NOUVEAU_BO_RD, 0, 0);
+ NOUVEAU_BO_LOW | NOUVEAU_BO_RD, 0, NV30_3D_VTXBUF_DMA1);
}
- if (!nv30_state_validate(nv30, FALSE))
+ if (!nv30_state_validate(nv30, ~0, FALSE))
return;
BEGIN_NV04(push, NV30_3D(VERTEX_BEGIN_END), 1);
@@ -213,22 +218,24 @@ static const struct {
[TGSI_SEMANTIC_BCOLOR ] = { EMIT_4F, INTERP_LINEAR , 1, 3, 0x00000004 },
[TGSI_SEMANTIC_FOG ] = { EMIT_4F, INTERP_PERSPECTIVE, 5, 5, 0x00000010 },
[TGSI_SEMANTIC_PSIZE ] = { EMIT_1F_PSIZE, INTERP_POS , 6, 6, 0x00000020 },
- [TGSI_SEMANTIC_GENERIC ] = { EMIT_4F, INTERP_PERSPECTIVE, 8, 7, 0x00004000 }
+ [TGSI_SEMANTIC_TEXCOORD] = { EMIT_4F, INTERP_PERSPECTIVE, 8, 7, 0x00004000 },
};
static boolean
vroute_add(struct nv30_render *r, uint attrib, uint sem, uint *idx)
{
- struct pipe_screen *pscreen = &r->nv30->screen->base.base;
+ struct nv30_screen *screen = r->nv30->screen;
struct nv30_fragprog *fp = r->nv30->fragprog.program;
struct vertex_info *vinfo = &r->vertex_info;
enum pipe_format format;
uint emit = EMIT_OMIT;
uint result = *idx;
- if (sem == TGSI_SEMANTIC_GENERIC && result >= 8) {
- for (result = 0; result < 8; result++) {
- if (fp->texcoord[result] == *idx) {
+ if (sem == TGSI_SEMANTIC_GENERIC) {
+ uint num_texcoords = (screen->eng3d->oclass < NV40_3D_CLASS) ? 8 : 10;
+ for (result = 0; result < num_texcoords; result++) {
+ if (fp->texcoord[result] == *idx + 8) {
+ sem = TGSI_SEMANTIC_TEXCOORD;
emit = vroute[sem].emit;
break;
}
@@ -243,11 +250,11 @@ vroute_add(struct nv30_render *r, uint attrib, uint sem, uint *idx)
draw_emit_vertex_attr(vinfo, emit, vroute[sem].interp, attrib);
format = draw_translate_vinfo_format(emit);
- r->vtxfmt[attrib] = nv30_vtxfmt(pscreen, format)->hw;
- r->vtxptr[attrib] = vinfo->size | NV30_3D_VTXBUF_DMA1;
+ r->vtxfmt[attrib] = nv30_vtxfmt(&screen->base.base, format)->hw;
+ r->vtxptr[attrib] = vinfo->size;
vinfo->size += draw_translate_vinfo_size(emit);
- if (nv30_screen(pscreen)->eng3d->oclass < NV40_3D_CLASS) {
+ if (screen->eng3d->oclass < NV40_3D_CLASS) {
r->vtxprog[attrib][0] = 0x001f38d8;
r->vtxprog[attrib][1] = 0x0080001b | (attrib << 9);
r->vtxprog[attrib][2] = 0x0836106c;
@@ -259,7 +266,12 @@ vroute_add(struct nv30_render *r, uint attrib, uint sem, uint *idx)
r->vtxprog[attrib][3] = 0x6041ff80 | (result + vroute[sem].vp40) << 2;
}
- *idx = vroute[sem].ow40 << result;
+ if (result < 8)
+ *idx = vroute[sem].ow40 << result;
+ else {
+ assert(sem == TGSI_SEMANTIC_TEXCOORD);
+ *idx = 0x00001000 << (result - 8);
+ }
return TRUE;
}
@@ -313,7 +325,7 @@ nv30_render_validate(struct nv30_context *nv30)
while (pntc && attrib < 16) {
uint index = ffs(pntc) - 1; pntc &= ~(1 << index);
- if (vroute_add(r, attrib, TGSI_SEMANTIC_GENERIC, &index)) {
+ if (vroute_add(r, attrib, TGSI_SEMANTIC_TEXCOORD, &index)) {
vp_attribs |= (1 << attrib++);
vp_results |= index;
}
@@ -398,17 +410,17 @@ nv30_render_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
if (nv30->vertprog.constbuf) {
void *map = nv04_resource(nv30->vertprog.constbuf)->data;
draw_set_mapped_constant_buffer(draw, PIPE_SHADER_VERTEX, 0,
- map, nv30->vertprog.constbuf_nr);
+ map, nv30->vertprog.constbuf_nr * 16);
+ } else {
+ draw_set_mapped_constant_buffer(draw, PIPE_SHADER_VERTEX, 0, NULL, 0);
}
}
for (i = 0; i < nv30->num_vtxbufs; i++) {
const void *map = nv30->vtxbuf[i].user_buffer;
if (!map) {
- if (!nv30->vtxbuf[i].buffer) {
- continue;
- }
- map = pipe_buffer_map(pipe, nv30->vtxbuf[i].buffer,
+ if (nv30->vtxbuf[i].buffer)
+ map = pipe_buffer_map(pipe, nv30->vtxbuf[i].buffer,
PIPE_TRANSFER_UNSYNCHRONIZED |
PIPE_TRANSFER_READ, &transfer[i]);
}
@@ -418,9 +430,9 @@ nv30_render_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
if (info->indexed) {
const void *map = nv30->idxbuf.user_buffer;
if (!map)
- pipe_buffer_map(pipe, nv30->idxbuf.buffer,
- PIPE_TRANSFER_UNSYNCHRONIZED |
- PIPE_TRANSFER_READ, &transferi);
+ map = pipe_buffer_map(pipe, nv30->idxbuf.buffer,
+ PIPE_TRANSFER_UNSYNCHRONIZED |
+ PIPE_TRANSFER_READ, &transferi);
draw_set_indexes(draw,
(ubyte *) map + nv30->idxbuf.offset,
nv30->idxbuf.index_size, ~0);
@@ -444,6 +456,12 @@ nv30_render_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
static void
nv30_render_destroy(struct vbuf_render *render)
{
+ struct nv30_render *r = nv30_render(render);
+
+ if (r->transfer)
+ pipe_buffer_unmap(&r->nv30->base.pipe, r->transfer);
+ pipe_resource_reference(&r->buffer, NULL);
+ nouveau_heap_free(&r->vertprog);
FREE(render);
}
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c b/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c
index a05bfe10ee9..7f227868f73 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c
@@ -23,6 +23,7 @@
*
*/
+#include "draw/draw_context.h"
#include "tgsi/tgsi_parse.h"
#include "nv_object.xml.h"
@@ -147,8 +148,12 @@ nv30_fp_state_delete(struct pipe_context *pipe, void *hwcso)
pipe_resource_reference(&fp->buffer, NULL);
+ if (fp->draw)
+ draw_delete_fragment_shader(nv30_context(pipe)->draw, fp->draw);
+
FREE((void *)fp->pipe.tokens);
FREE(fp->insn);
+ FREE(fp->consts);
FREE(fp);
}
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index eeb714864e2..2e38a1978ae 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -161,6 +161,7 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_POLYGON_OFFSET_CLAMP:
case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
+ case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
return 0;
case PIPE_CAP_VENDOR_ID:
@@ -251,6 +252,7 @@ nv30_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+ case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
return 0;
default:
debug_printf("unknown vertex shader param %d\n", param);
@@ -291,6 +293,7 @@ nv30_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+ case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
return 0;
default:
debug_printf("unknown fragment shader param %d\n", param);
@@ -523,7 +526,7 @@ nv30_screen_create(struct nouveau_device *dev)
ret = nouveau_bo_wrap(screen->base.device, fifo->notify, &screen->notify);
if (ret == 0)
- nouveau_bo_map(screen->notify, 0, screen->base.client);
+ ret = nouveau_bo_map(screen->notify, 0, screen->base.client);
if (ret)
FAIL_SCREEN_INIT("error mapping notifier memory: %d\n", ret);
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_state_validate.c b/src/gallium/drivers/nouveau/nv30/nv30_state_validate.c
index 0f9d19dd68e..a954dcce562 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_state_validate.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_state_validate.c
@@ -272,15 +272,13 @@ nv30_validate_clip(struct nv30_context *nv30)
uint32_t clpd_enable = 0;
for (i = 0; i < 6; i++) {
- if (nv30->rast->pipe.clip_plane_enable & (1 << i)) {
- if (nv30->dirty & NV30_NEW_CLIP) {
- BEGIN_NV04(push, NV30_3D(VP_UPLOAD_CONST_ID), 5);
- PUSH_DATA (push, i);
- PUSH_DATAp(push, nv30->clip.ucp[i], 4);
- }
-
- clpd_enable |= 1 << (1 + 4*i);
+ if (nv30->dirty & NV30_NEW_CLIP) {
+ BEGIN_NV04(push, NV30_3D(VP_UPLOAD_CONST_ID), 5);
+ PUSH_DATA (push, i);
+ PUSH_DATAp(push, nv30->clip.ucp[i], 4);
}
+ if (nv30->rast->pipe.clip_plane_enable & (1 << i))
+ clpd_enable |= 2 << (4*i);
}
BEGIN_NV04(push, NV30_3D(VP_CLIP_PLANES_ENABLE), 1);
@@ -389,7 +387,7 @@ static struct state_validate hwtnl_validate_list[] = {
{ nv30_validate_stipple, NV30_NEW_STIPPLE },
{ nv30_validate_scissor, NV30_NEW_SCISSOR | NV30_NEW_RASTERIZER },
{ nv30_validate_viewport, NV30_NEW_VIEWPORT },
- { nv30_validate_clip, NV30_NEW_CLIP },
+ { nv30_validate_clip, NV30_NEW_CLIP | NV30_NEW_RASTERIZER },
{ nv30_fragprog_validate, NV30_NEW_FRAGPROG | NV30_NEW_FRAGCONST },
{ nv30_vertprog_validate, NV30_NEW_VERTPROG | NV30_NEW_VERTCONST |
NV30_NEW_FRAGPROG | NV30_NEW_RASTERIZER },
@@ -456,7 +454,7 @@ nv30_state_context_switch(struct nv30_context *nv30)
}
boolean
-nv30_state_validate(struct nv30_context *nv30, boolean hwtnl)
+nv30_state_validate(struct nv30_context *nv30, uint32_t mask, boolean hwtnl)
{
struct nouveau_screen *screen = &nv30->screen->base;
struct nouveau_pushbuf *push = nv30->base.pushbuf;
@@ -481,14 +479,16 @@ nv30_state_validate(struct nv30_context *nv30, boolean hwtnl)
else
validate = swtnl_validate_list;
- if (nv30->dirty) {
+ mask &= nv30->dirty;
+
+ if (mask) {
while (validate->func) {
- if (nv30->dirty & validate->mask)
+ if (mask & validate->mask)
validate->func(nv30);
validate++;
}
- nv30->dirty = 0;
+ nv30->dirty &= ~mask;
}
nouveau_pushbuf_bufctx(push, bctx);
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_vbo.c b/src/gallium/drivers/nouveau/nv30/nv30_vbo.c
index 67ab8295218..d4e384b21d2 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_vbo.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_vbo.c
@@ -564,7 +564,7 @@ nv30_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
if (nv30->vbo_user && !(nv30->dirty & (NV30_NEW_VERTEX | NV30_NEW_ARRAYS)))
nv30_update_user_vbufs(nv30);
- nv30_state_validate(nv30, TRUE);
+ nv30_state_validate(nv30, ~0, TRUE);
if (nv30->draw_flags) {
nv30_render_vbo(pipe, info);
return;
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_vertprog.c b/src/gallium/drivers/nouveau/nv30/nv30_vertprog.c
index 3c1b7e714ea..4d4145d10b5 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_vertprog.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_vertprog.c
@@ -23,6 +23,7 @@
*
*/
+#include "draw/draw_context.h"
#include "util/u_dynarray.h"
#include "tgsi/tgsi_parse.h"
@@ -237,6 +238,10 @@ nv30_vp_state_delete(struct pipe_context *pipe, void *hwcso)
if (vp->translated)
nv30_vertprog_destroy(vp);
+
+ if (vp->draw)
+ draw_delete_vertex_shader(nv30_context(pipe)->draw, vp->draw);
+
FREE((void *)vp->pipe.tokens);
FREE(vp);
}
diff --git a/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c b/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c
index bbdca8102f0..9ef16965f39 100644
--- a/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c
+++ b/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c
@@ -327,6 +327,8 @@ nv40_fp_rep(struct nvfx_fpc *fpc, unsigned count, unsigned target)
//util_dynarray_append(&fpc->loop_stack, unsigned, target);
}
+#if 0
+/* documentation only */
/* warning: this only works forward, and probably only if not inside any IF */
static void
nv40_fp_bra(struct nvfx_fpc *fpc, unsigned target)
@@ -352,6 +354,7 @@ nv40_fp_bra(struct nvfx_fpc *fpc, unsigned target)
reloc.location = fpc->inst_offset + 3;
util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc);
}
+#endif
static void
nv40_fp_brk(struct nvfx_fpc *fpc)
@@ -528,7 +531,7 @@ nvfx_fragprog_parse_instruction(struct nvfx_fpc *fpc,
dst = tgsi_dst(fpc, &finst->Dst[0]);
mask = tgsi_mask(finst->Dst[0].Register.WriteMask);
- sat = (finst->Instruction.Saturate == TGSI_SAT_ZERO_ONE);
+ sat = finst->Instruction.Saturate;
switch (finst->Instruction.Opcode) {
case TGSI_OPCODE_ABS:
@@ -1201,17 +1204,3 @@ out_err:
tgsi_dump(fp->pipe.tokens, 0);
goto out;
}
-
-static inline void
-nvfx_fp_memcpy(void* dst, const void* src, size_t len)
-{
-#ifndef PIPE_ARCH_BIG_ENDIAN
- memcpy(dst, src, len);
-#else
- size_t i;
- for(i = 0; i < len; i += 4) {
- uint32_t v = *(uint32_t*)((char*)src + i);
- *(uint32_t*)((char*)dst + i) = (v >> 16) | (v << 16);
- }
-#endif
-}
diff --git a/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c b/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c
index 29d506b6e9b..1ce0589be71 100644
--- a/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c
+++ b/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c
@@ -539,7 +539,7 @@ nvfx_vertprog_parse_instruction(struct nvfx_vpc *vpc,
final_dst = dst = tgsi_dst(vpc, &finst->Dst[0]);
mask = tgsi_mask(finst->Dst[0].Register.WriteMask);
- if(finst->Instruction.Saturate == TGSI_SAT_ZERO_ONE) {
+ if(finst->Instruction.Saturate) {
assert(finst->Instruction.Opcode != TGSI_OPCODE_ARL);
if (vpc->is_nv4x)
sat = TRUE;
@@ -796,7 +796,7 @@ nvfx_vertprog_parse_instruction(struct nvfx_vpc *vpc,
return FALSE;
}
- if(finst->Instruction.Saturate == TGSI_SAT_ZERO_ONE && !vpc->is_nv4x) {
+ if(finst->Instruction.Saturate && !vpc->is_nv4x) {
if (!vpc->r_0_1.type)
vpc->r_0_1 = constant(vpc, -1, 0, 1, 0, 0);
nvfx_vp_emit(vpc, arith(0, VEC, MAX, dst, mask, nvfx_src(dst), swz(nvfx_src(vpc->r_0_1), X, X, X, X), none));
@@ -872,9 +872,8 @@ nvfx_vertprog_parse_decl_output(struct nvfx_vpc *vpc,
}
break;
case TGSI_SEMANTIC_EDGEFLAG:
- /* not really an error just a fallback */
- NOUVEAU_ERR("cannot handle edgeflag output\n");
- return FALSE;
+ vpc->r_result[idx] = nvfx_reg(NVFXSR_NONE, 0);
+ return TRUE;
default:
NOUVEAU_ERR("bad output semantic\n");
return FALSE;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.c b/src/gallium/drivers/nouveau/nv50/nv50_context.c
index 2cfd5db5ea0..5b5d3912c20 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.c
@@ -138,8 +138,11 @@ nv50_destroy(struct pipe_context *pipe)
{
struct nv50_context *nv50 = nv50_context(pipe);
- if (nv50_context_screen(nv50)->cur_ctx == nv50)
- nv50_context_screen(nv50)->cur_ctx = NULL;
+ if (nv50->screen->cur_ctx == nv50) {
+ nv50->screen->cur_ctx = NULL;
+ /* Save off the state in case another context gets created */
+ nv50->screen->save_state = nv50->state;
+ }
nouveau_pushbuf_bufctx(nv50->base.pushbuf, NULL);
nouveau_pushbuf_kick(nv50->base.pushbuf, nv50->base.pushbuf->channel);
@@ -290,6 +293,10 @@ nv50_create(struct pipe_screen *pscreen, void *priv)
pipe->get_sample_position = nv50_context_get_sample_position;
if (!screen->cur_ctx) {
+ /* Restore the last context's state here, normally handled during
+ * context switch
+ */
+ nv50->state = screen->save_state;
screen->cur_ctx = nv50;
nouveau_pushbuf_bufctx(screen->base.pushbuf, nv50->bufctx);
}
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.h b/src/gallium/drivers/nouveau/nv50/nv50_context.h
index 45eb554eb4f..1f123ef7e92 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.h
@@ -104,28 +104,7 @@ struct nv50_context {
uint32_t dirty;
boolean cb_dirty;
- struct {
- uint32_t instance_elts; /* bitmask of per-instance elements */
- uint32_t instance_base;
- uint32_t interpolant_ctrl;
- uint32_t semantic_color;
- uint32_t semantic_psize;
- int32_t index_bias;
- boolean uniform_buffer_bound[3];
- boolean prim_restart;
- boolean point_sprite;
- boolean rt_serialize;
- boolean flushed;
- boolean rasterizer_discard;
- uint8_t tls_required;
- boolean new_tls_space;
- uint8_t num_vtxbufs;
- uint8_t num_vtxelts;
- uint8_t num_textures[3];
- uint8_t num_samplers[3];
- uint8_t prim_size;
- uint16_t scissor;
- } state;
+ struct nv50_graph_state state;
struct nv50_blend_stateobj *blend;
struct nv50_rasterizer_stateobj *rast;
@@ -191,12 +170,6 @@ nv50_context(struct pipe_context *pipe)
return (struct nv50_context *)pipe;
}
-static INLINE struct nv50_screen *
-nv50_context_screen(struct nv50_context *nv50)
-{
- return nv50_screen(&nv50->base.screen->base);
-}
-
/* return index used in nv50_context arrays for a specific shader type */
static INLINE unsigned
nv50_context_shader_stage(unsigned pipe)
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_miptree.c b/src/gallium/drivers/nouveau/nv50/nv50_miptree.c
index 744a3a5bf8b..f15d8f3ecb6 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_miptree.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_miptree.c
@@ -377,7 +377,7 @@ nv50_miptree_create(struct pipe_screen *pscreen,
if (!bo_config.nv50.memtype && (pt->bind & PIPE_BIND_SHARED))
mt->base.domain = NOUVEAU_BO_GART;
else
- mt->base.domain = NOUVEAU_BO_VRAM;
+ mt->base.domain = NV_VRAM_DOMAIN(nouveau_screen(pscreen));
bo_flags = mt->base.domain | NOUVEAU_BO_NOSNOOP;
if (mt->base.base.bind & (PIPE_BIND_CURSOR | PIPE_BIND_DISPLAY_TARGET))
@@ -419,7 +419,7 @@ nv50_miptree_from_handle(struct pipe_screen *pscreen,
FREE(mt);
return NULL;
}
- mt->base.domain = NOUVEAU_BO_VRAM;
+ mt->base.domain = mt->base.bo->flags & NOUVEAU_BO_APER;
mt->base.address = mt->base.bo->offset;
mt->base.base = *templ;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query.c b/src/gallium/drivers/nouveau/nv50/nv50_query.c
index 6690aa282eb..81f7474e36b 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_query.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query.c
@@ -27,6 +27,11 @@
#include "nv50/nv50_context.h"
#include "nv_object.xml.h"
+#define NV50_QUERY_STATE_READY 0
+#define NV50_QUERY_STATE_ACTIVE 1
+#define NV50_QUERY_STATE_ENDED 2
+#define NV50_QUERY_STATE_FLUSHED 3
+
/* XXX: Nested queries, and simultaneous queries on multiple gallium contexts
* (since we use only a single GPU channel per screen) will not work properly.
*
@@ -42,10 +47,10 @@ struct nv50_query {
struct nouveau_bo *bo;
uint32_t base;
uint32_t offset; /* base + i * 32 */
- boolean ready;
- boolean flushed;
+ uint8_t state;
boolean is64bit;
struct nouveau_mm_allocation *mm;
+ struct nouveau_fence *fence;
};
#define NV50_QUERY_ALLOC_SPACE 256
@@ -65,7 +70,7 @@ nv50_query_allocate(struct nv50_context *nv50, struct nv50_query *q, int size)
if (q->bo) {
nouveau_bo_ref(NULL, &q->bo);
if (q->mm) {
- if (q->ready)
+ if (q->state == NV50_QUERY_STATE_READY)
nouveau_mm_free(q->mm);
else
nouveau_fence_work(screen->base.fence.current, nouveau_mm_free_work,
@@ -92,6 +97,7 @@ static void
nv50_query_destroy(struct pipe_context *pipe, struct pipe_query *pq)
{
nv50_query_allocate(nv50_context(pipe), nv50_query(pq), 0);
+ nouveau_fence_ref(NULL, &nv50_query(pq)->fence);
FREE(nv50_query(pq));
}
@@ -112,7 +118,8 @@ nv50_query_create(struct pipe_context *pipe, unsigned type, unsigned index)
q->is64bit = (type == PIPE_QUERY_PRIMITIVES_GENERATED ||
type == PIPE_QUERY_PRIMITIVES_EMITTED ||
- type == PIPE_QUERY_SO_STATISTICS);
+ type == PIPE_QUERY_SO_STATISTICS ||
+ type == PIPE_QUERY_PIPELINE_STATISTICS);
q->type = type;
if (q->type == PIPE_QUERY_OCCLUSION_COUNTER) {
@@ -200,7 +207,7 @@ nv50_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
default:
break;
}
- q->ready = FALSE;
+ q->state = NV50_QUERY_STATE_ACTIVE;
return true;
}
@@ -211,6 +218,8 @@ nv50_query_end(struct pipe_context *pipe, struct pipe_query *pq)
struct nouveau_pushbuf *push = nv50->base.pushbuf;
struct nv50_query *q = nv50_query(pq);
+ q->state = NV50_QUERY_STATE_ENDED;
+
switch (q->type) {
case PIPE_QUERY_OCCLUSION_COUNTER:
nv50_query_get(push, q, 0, 0x0100f002);
@@ -253,19 +262,27 @@ nv50_query_end(struct pipe_context *pipe, struct pipe_query *pq)
break;
case PIPE_QUERY_TIMESTAMP_DISJOINT:
/* This query is not issued on GPU because disjoint is forced to FALSE */
- q->ready = TRUE;
+ q->state = NV50_QUERY_STATE_READY;
break;
default:
assert(0);
break;
}
- q->ready = q->flushed = FALSE;
+
+ if (q->is64bit)
+ nouveau_fence_ref(nv50->screen->base.fence.current, &q->fence);
}
-static INLINE boolean
-nv50_query_ready(struct nv50_query *q)
+static INLINE void
+nv50_query_update(struct nv50_query *q)
{
- return q->ready || (!q->is64bit && (q->data[0] == q->sequence));
+ if (q->is64bit) {
+ if (nouveau_fence_signalled(q->fence))
+ q->state = NV50_QUERY_STATE_READY;
+ } else {
+ if (q->data[0] == q->sequence)
+ q->state = NV50_QUERY_STATE_READY;
+ }
}
static boolean
@@ -280,13 +297,14 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
uint64_t *data64 = (uint64_t *)q->data;
int i;
- if (!q->ready) /* update ? */
- q->ready = nv50_query_ready(q);
- if (!q->ready) {
+ if (q->state != NV50_QUERY_STATE_READY)
+ nv50_query_update(q);
+
+ if (q->state != NV50_QUERY_STATE_READY) {
if (!wait) {
/* for broken apps that spin on GL_QUERY_RESULT_AVAILABLE */
- if (!q->flushed) {
- q->flushed = TRUE;
+ if (q->state != NV50_QUERY_STATE_FLUSHED) {
+ q->state = NV50_QUERY_STATE_FLUSHED;
PUSH_KICK(nv50->base.pushbuf);
}
return FALSE;
@@ -294,7 +312,7 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nv50->screen->base.client))
return FALSE;
}
- q->ready = TRUE;
+ q->state = NV50_QUERY_STATE_READY;
switch (q->type) {
case PIPE_QUERY_GPU_FINISHED:
@@ -434,6 +452,7 @@ nv50_query_pushbuf_submit(struct nouveau_pushbuf *push,
/* XXX: does this exist ? */
#define NV50_IB_ENTRY_1_NO_PREFETCH (0 << (31 - 8))
+ PUSH_REFN(push, q->bo, NOUVEAU_BO_RD | NOUVEAU_BO_GART);
nouveau_pushbuf_space(push, 0, 0, 1);
nouveau_pushbuf_data(push, q->bo, q->offset + result_offset, 4 |
NV50_IB_ENTRY_1_NO_PREFETCH);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index 829dfbc13fa..6583a353578 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -209,6 +209,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_VERTEXID_NOBASE:
case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: /* potentially supported on some hw */
case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
+ case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
return 0;
case PIPE_CAP_VENDOR_ID:
@@ -290,6 +291,7 @@ nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+ case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
return 0;
default:
NOUVEAU_ERR("unknown PIPE_SHADER_CAP %d\n", param);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.h b/src/gallium/drivers/nouveau/nv50/nv50_screen.h
index f8ce365135a..881051b1862 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.h
@@ -25,10 +25,34 @@ struct nv50_context;
struct nv50_blitter;
+struct nv50_graph_state {
+ uint32_t instance_elts; /* bitmask of per-instance elements */
+ uint32_t instance_base;
+ uint32_t interpolant_ctrl;
+ uint32_t semantic_color;
+ uint32_t semantic_psize;
+ int32_t index_bias;
+ boolean uniform_buffer_bound[3];
+ boolean prim_restart;
+ boolean point_sprite;
+ boolean rt_serialize;
+ boolean flushed;
+ boolean rasterizer_discard;
+ uint8_t tls_required;
+ boolean new_tls_space;
+ uint8_t num_vtxbufs;
+ uint8_t num_vtxelts;
+ uint8_t num_textures[3];
+ uint8_t num_samplers[3];
+ uint8_t prim_size;
+ uint16_t scissor;
+};
+
struct nv50_screen {
struct nouveau_screen base;
struct nv50_context *cur_ctx;
+ struct nv50_graph_state save_state;
struct nouveau_bo *code;
struct nouveau_bo *uniforms;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state.c b/src/gallium/drivers/nouveau/nv50/nv50_state.c
index 290750459cf..d4d41af3c61 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c
@@ -811,12 +811,12 @@ nv50_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
nv50->constbuf[s][i].user = (cb && cb->user_buffer) ? TRUE : FALSE;
if (nv50->constbuf[s][i].user) {
nv50->constbuf[s][i].u.data = cb->user_buffer;
- nv50->constbuf[s][i].size = cb->buffer_size;
+ nv50->constbuf[s][i].size = MIN2(cb->buffer_size, 0x10000);
nv50->constbuf_valid[s] |= 1 << i;
} else
if (res) {
nv50->constbuf[s][i].offset = cb->buffer_offset;
- nv50->constbuf[s][i].size = align(cb->buffer_size, 0x100);
+ nv50->constbuf[s][i].size = MIN2(align(cb->buffer_size, 0x100), 0x10000);
nv50->constbuf_valid[s] |= 1 << i;
} else {
nv50->constbuf_valid[s] &= ~(1 << i);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
index 85e19b4c623..116bf4bba7c 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
@@ -394,6 +394,8 @@ nv50_switch_pipe_context(struct nv50_context *ctx_to)
if (ctx_from)
ctx_to->state = ctx_from->state;
+ else
+ ctx_to->state = ctx_to->screen->save_state;
ctx_to->dirty = ~0;
ctx_to->viewports_dirty = ~0;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
index c1590eefe9f..1fd33b8aa59 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
@@ -628,6 +628,7 @@ nv50_draw_elements(struct nv50_context *nv50, boolean shorten,
BEGIN_NV04(push, NV50_3D(VERTEX_BEGIN_GL), 1);
PUSH_DATA (push, prim);
+ PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain);
nouveau_pushbuf_space(push, 8, 0, 1);
switch (index_size) {
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
index ad287a2af6b..56fc83d3679 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
@@ -57,7 +57,7 @@ nvc0_screen_compute_setup(struct nvc0_screen *screen,
return ret;
}
- ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, 1 << 12, NULL,
+ ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 0, 1 << 12, NULL,
&screen->parm);
if (ret)
return ret;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
index 7662fb50f61..a35c3f66142 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
@@ -139,8 +139,12 @@ nvc0_destroy(struct pipe_context *pipe)
{
struct nvc0_context *nvc0 = nvc0_context(pipe);
- if (nvc0->screen->cur_ctx == nvc0)
+ if (nvc0->screen->cur_ctx == nvc0) {
nvc0->screen->cur_ctx = NULL;
+ nvc0->screen->save_state = nvc0->state;
+ nvc0->screen->save_state.tfb = NULL;
+ }
+
/* Unset bufctx, we don't want to revalidate any resources after the flush.
* Other contexts will always set their bufctx again on action calls.
*/
@@ -303,6 +307,7 @@ nvc0_create(struct pipe_screen *pscreen, void *priv)
pipe->get_sample_position = nvc0_context_get_sample_position;
if (!screen->cur_ctx) {
+ nvc0->state = screen->save_state;
screen->cur_ctx = nvc0;
nouveau_pushbuf_bufctx(screen->base.pushbuf, nvc0->bufctx);
}
@@ -324,7 +329,7 @@ nvc0_create(struct pipe_screen *pscreen, void *priv)
/* add permanently resident buffers to bufctxts */
- flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RD;
+ flags = NV_VRAM_DOMAIN(&screen->base) | NOUVEAU_BO_RD;
BCTX_REFN_bo(nvc0->bufctx_3d, SCREEN, flags, screen->text);
BCTX_REFN_bo(nvc0->bufctx_3d, SCREEN, flags, screen->uniform_bo);
@@ -335,7 +340,7 @@ nvc0_create(struct pipe_screen *pscreen, void *priv)
BCTX_REFN_bo(nvc0->bufctx_cp, CP_SCREEN, flags, screen->parm);
}
- flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RDWR;
+ flags = NV_VRAM_DOMAIN(&screen->base) | NOUVEAU_BO_RDWR;
if (screen->poly_cache)
BCTX_REFN_bo(nvc0->bufctx_3d, SCREEN, flags, screen->poly_cache);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
index ef251f35a1b..a8d7593b398 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
@@ -113,29 +113,7 @@ struct nvc0_context {
uint32_t dirty;
uint32_t dirty_cp; /* dirty flags for compute state */
- struct {
- boolean flushed;
- boolean rasterizer_discard;
- boolean early_z_forced;
- boolean prim_restart;
- uint32_t instance_elts; /* bitmask of per-instance elements */
- uint32_t instance_base;
- uint32_t constant_vbos;
- uint32_t constant_elts;
- int32_t index_bias;
- uint16_t scissor;
- uint8_t vbo_mode; /* 0 = normal, 1 = translate, 3 = translate, forced */
- uint8_t num_vtxbufs;
- uint8_t num_vtxelts;
- uint8_t num_textures[6];
- uint8_t num_samplers[6];
- uint8_t tls_required; /* bitmask of shader types using l[] */
- uint8_t c14_bound; /* whether immediate array constbuf is bound */
- uint8_t clip_enable;
- uint32_t clip_mode;
- uint32_t uniform_buffer_bound[5];
- struct nvc0_transform_feedback_state *tfb;
- } state;
+ struct nvc0_graph_state state;
struct nvc0_blend_stateobj *blend;
struct nvc0_rasterizer_stateobj *rast;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c b/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c
index fc75fc6a4a1..3875bbf4ca4 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c
@@ -302,7 +302,7 @@ nvc0_miptree_create(struct pipe_screen *pscreen,
if (!bo_config.nvc0.memtype && (pt->usage == PIPE_USAGE_STAGING || pt->bind & PIPE_BIND_SHARED))
mt->base.domain = NOUVEAU_BO_GART;
else
- mt->base.domain = NOUVEAU_BO_VRAM;
+ mt->base.domain = NV_VRAM_DOMAIN(nouveau_screen(pscreen));
bo_flags = mt->base.domain | NOUVEAU_BO_NOSNOOP;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index c156e918dc5..e1f5a8c4416 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -392,7 +392,7 @@ nvc0_gp_gen_header(struct nvc0_program *gp, struct nv50_ir_prog_info *info)
break;
}
- gp->hdr[4] = info->prop.gp.maxVertices & 0x1ff;
+ gp->hdr[4] = MIN2(info->prop.gp.maxVertices, 1024);
return nvc0_vtgp_gen_header(gp, info);
}
@@ -683,11 +683,12 @@ nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog)
ret = nouveau_heap_alloc(screen->text_heap, size, prog, &prog->mem);
if (ret) {
struct nouveau_heap *heap = screen->text_heap;
- struct nouveau_heap *iter;
- for (iter = heap; iter && iter->next != heap; iter = iter->next) {
- struct nvc0_program *evict = iter->priv;
- if (evict)
- nouveau_heap_free(&evict->mem);
+ /* Note that the code library, which is allocated before anything else,
+ * does not have a priv pointer. We can stop once we hit it.
+ */
+ while (heap->next && heap->next->priv) {
+ struct nvc0_program *evict = heap->next->priv;
+ nouveau_heap_free(&evict->mem);
}
debug_printf("WARNING: out of code space, evicting all shaders.\n");
ret = nouveau_heap_alloc(heap, size, prog, &prog->mem);
@@ -734,12 +735,12 @@ nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog)
if (!is_cp)
nvc0->base.push_data(&nvc0->base, screen->text, prog->code_base,
- NOUVEAU_BO_VRAM, NVC0_SHADER_HEADER_SIZE, prog->hdr);
+ NV_VRAM_DOMAIN(&screen->base), NVC0_SHADER_HEADER_SIZE, prog->hdr);
nvc0->base.push_data(&nvc0->base, screen->text, code_pos,
- NOUVEAU_BO_VRAM, prog->code_size, prog->code);
+ NV_VRAM_DOMAIN(&screen->base), prog->code_size, prog->code);
if (prog->immd_size)
nvc0->base.push_data(&nvc0->base,
- screen->text, prog->immd_base, NOUVEAU_BO_VRAM,
+ screen->text, prog->immd_base, NV_VRAM_DOMAIN(&screen->base),
prog->immd_size, prog->immd_data);
BEGIN_NVC0(nvc0->base.pushbuf, NVC0_3D(MEM_BARRIER), 1);
@@ -770,7 +771,7 @@ nvc0_program_library_upload(struct nvc0_context *nvc0)
return;
nvc0->base.push_data(&nvc0->base,
- screen->text, screen->lib_code->start, NOUVEAU_BO_VRAM,
+ screen->text, screen->lib_code->start, NV_VRAM_DOMAIN(&screen->base),
size, code);
/* no need for a memory barrier, will be emitted with first program */
}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
index 52032eb6f83..aea6cbda02d 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
@@ -617,6 +617,7 @@ nvc0_query_pushbuf_submit(struct nouveau_pushbuf *push,
#define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8))
+ PUSH_REFN(push, q->bo, NOUVEAU_BO_RD | NOUVEAU_BO_GART);
nouveau_pushbuf_space(push, 0, 0, 1);
nouveau_pushbuf_data(push, q->bo, q->offset + result_offset, 4 |
NVC0_IB_ENTRY_1_NO_PREFETCH);
@@ -1407,11 +1408,14 @@ nvc0_screen_get_driver_query_info(struct pipe_screen *pscreen,
count += NVC0_QUERY_DRV_STAT_COUNT;
if (screen->base.device->drm_version >= 0x01000101) {
- if (screen->base.class_3d >= NVE4_3D_CLASS) {
- count += NVE4_PM_QUERY_COUNT;
- } else
if (screen->compute) {
- count += NVC0_PM_QUERY_COUNT; /* NVC0_COMPUTE is not always enabled */
+ if (screen->base.class_3d == NVE4_3D_CLASS) {
+ count += NVE4_PM_QUERY_COUNT;
+ } else
+ if (screen->base.class_3d < NVE4_3D_CLASS) {
+ /* NVC0_COMPUTE is not always enabled */
+ count += NVC0_PM_QUERY_COUNT;
+ }
}
}
@@ -1437,19 +1441,21 @@ nvc0_screen_get_driver_query_info(struct pipe_screen *pscreen,
} else
#endif
if (id < count) {
- if (screen->base.class_3d >= NVE4_3D_CLASS) {
- info->name = nve4_pm_query_names[id - NVC0_QUERY_DRV_STAT_COUNT];
- info->query_type = NVE4_PM_QUERY(id - NVC0_QUERY_DRV_STAT_COUNT);
- info->max_value.u64 =
- (id < NVE4_PM_QUERY_METRIC_MP_OCCUPANCY) ? 0 : 100;
- info->group_id = NVC0_QUERY_MP_COUNTER_GROUP;
- return 1;
- } else
if (screen->compute) {
- info->name = nvc0_pm_query_names[id - NVC0_QUERY_DRV_STAT_COUNT];
- info->query_type = NVC0_PM_QUERY(id - NVC0_QUERY_DRV_STAT_COUNT);
- info->group_id = NVC0_QUERY_MP_COUNTER_GROUP;
- return 1;
+ if (screen->base.class_3d == NVE4_3D_CLASS) {
+ info->name = nve4_pm_query_names[id - NVC0_QUERY_DRV_STAT_COUNT];
+ info->query_type = NVE4_PM_QUERY(id - NVC0_QUERY_DRV_STAT_COUNT);
+ info->max_value.u64 =
+ (id < NVE4_PM_QUERY_METRIC_MP_OCCUPANCY) ? 0 : 100;
+ info->group_id = NVC0_QUERY_MP_COUNTER_GROUP;
+ return 1;
+ } else
+ if (screen->base.class_3d < NVE4_3D_CLASS) {
+ info->name = nvc0_pm_query_names[id - NVC0_QUERY_DRV_STAT_COUNT];
+ info->query_type = NVC0_PM_QUERY(id - NVC0_QUERY_DRV_STAT_COUNT);
+ info->group_id = NVC0_QUERY_MP_COUNTER_GROUP;
+ return 1;
+ }
}
}
/* user asked for info about non-existing query */
@@ -1469,10 +1475,13 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen,
#endif
if (screen->base.device->drm_version >= 0x01000101) {
- if (screen->base.class_3d >= NVE4_3D_CLASS) {
- count++;
- } else if (screen->compute) {
- count++; /* NVC0_COMPUTE is not always enabled */
+ if (screen->compute) {
+ if (screen->base.class_3d == NVE4_3D_CLASS) {
+ count++;
+ } else
+ if (screen->base.class_3d < NVE4_3D_CLASS) {
+ count++; /* NVC0_COMPUTE is not always enabled */
+ }
}
}
@@ -1480,25 +1489,28 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen,
return count;
if (id == NVC0_QUERY_MP_COUNTER_GROUP) {
- info->name = "MP counters";
- info->type = PIPE_DRIVER_QUERY_GROUP_TYPE_GPU;
-
- if (screen->base.class_3d >= NVE4_3D_CLASS) {
- info->num_queries = NVE4_PM_QUERY_COUNT;
-
- /* On NVE4+, each multiprocessor have 8 hardware counters separated
- * in two distinct domains, but we allow only one active query
- * simultaneously because some of them use more than one hardware
- * counter and this will result in an undefined behaviour. */
- info->max_active_queries = 1; /* TODO: handle multiple hw counters */
- return 1;
- } else if (screen->compute) {
- info->num_queries = NVC0_PM_QUERY_COUNT;
-
- /* On NVC0:NVE4, each multiprocessor have 8 hardware counters
- * in a single domain. */
- info->max_active_queries = 8;
- return 1;
+ if (screen->compute) {
+ info->name = "MP counters";
+ info->type = PIPE_DRIVER_QUERY_GROUP_TYPE_GPU;
+
+ if (screen->base.class_3d == NVE4_3D_CLASS) {
+ info->num_queries = NVE4_PM_QUERY_COUNT;
+
+ /* On NVE4+, each multiprocessor have 8 hardware counters separated
+ * in two distinct domains, but we allow only one active query
+ * simultaneously because some of them use more than one hardware
+ * counter and this will result in an undefined behaviour. */
+ info->max_active_queries = 1; /* TODO: handle multiple hw counters */
+ return 1;
+ } else
+ if (screen->base.class_3d < NVE4_3D_CLASS) {
+ info->num_queries = NVC0_PM_QUERY_COUNT;
+
+ /* On NVC0:NVE4, each multiprocessor have 8 hardware counters
+ * in a single domain. */
+ info->max_active_queries = 8;
+ return 1;
+ }
}
}
#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 748c9e7c8b9..56c230e42fc 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -193,6 +193,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
case PIPE_CAP_VERTEXID_NOBASE:
case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
+ case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
return 0;
case PIPE_CAP_VENDOR_ID:
@@ -296,6 +297,7 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
return 1;
case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+ case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
return 0;
case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
return 16; /* would be 32 in linked (OpenGL-style) mode */
@@ -581,7 +583,7 @@ nvc0_screen_resize_tls_area(struct nvc0_screen *screen,
size = align(size, 1 << 17);
- ret = nouveau_bo_new(screen->base.device, NOUVEAU_BO_VRAM, 1 << 17, size,
+ ret = nouveau_bo_new(screen->base.device, NV_VRAM_DOMAIN(&screen->base), 1 << 17, size,
NULL, &bo);
if (ret) {
NOUVEAU_ERR("failed to allocate TLS area, size: 0x%"PRIx64"\n", size);
@@ -644,6 +646,11 @@ nvc0_screen_create(struct nouveau_device *dev)
screen->base.sysmem_bindings |=
PIPE_BIND_VERTEX_BUFFER | PIPE_BIND_INDEX_BUFFER;
+ if (screen->base.vram_domain & NOUVEAU_BO_GART) {
+ screen->base.sysmem_bindings |= screen->base.vidmem_bindings;
+ screen->base.vidmem_bindings = 0;
+ }
+
pscreen->destroy = nvc0_screen_destroy;
pscreen->context_create = nvc0_create;
pscreen->is_format_supported = nvc0_screen_is_format_supported;
@@ -822,7 +829,7 @@ nvc0_screen_create(struct nouveau_device *dev)
nvc0_magic_3d_init(push, screen->eng3d->oclass);
- ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 17, 1 << 20, NULL,
+ ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 1 << 17, 1 << 20, NULL,
&screen->text);
if (ret)
goto fail;
@@ -832,12 +839,12 @@ nvc0_screen_create(struct nouveau_device *dev)
*/
nouveau_heap_init(&screen->text_heap, 0, (1 << 20) - 0x100);
- ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 12, 6 << 16, NULL,
+ ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 1 << 12, 6 << 16, NULL,
&screen->uniform_bo);
if (ret)
goto fail;
- PUSH_REFN (push, screen->uniform_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+ PUSH_REFN (push, screen->uniform_bo, NV_VRAM_DOMAIN(&screen->base) | NOUVEAU_BO_WR);
for (i = 0; i < 5; ++i) {
/* TIC and TSC entries for each unit (nve4+ only) */
@@ -908,7 +915,7 @@ nvc0_screen_create(struct nouveau_device *dev)
PUSH_DATA (push, 0);
if (screen->eng3d->oclass < GM107_3D_CLASS) {
- ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 17, 1 << 20, NULL,
+ ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 1 << 17, 1 << 20, NULL,
&screen->poly_cache);
if (ret)
goto fail;
@@ -919,7 +926,7 @@ nvc0_screen_create(struct nouveau_device *dev)
PUSH_DATA (push, 3);
}
- ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 17, 1 << 17, NULL,
+ ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 1 << 17, 1 << 17, NULL,
&screen->txc);
if (ret)
goto fail;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
index 1a7d5027a7c..ef2bd43f006 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
@@ -27,10 +27,35 @@ struct nvc0_context;
struct nvc0_blitter;
+struct nvc0_graph_state {
+ boolean flushed;
+ boolean rasterizer_discard;
+ boolean early_z_forced;
+ boolean prim_restart;
+ uint32_t instance_elts; /* bitmask of per-instance elements */
+ uint32_t instance_base;
+ uint32_t constant_vbos;
+ uint32_t constant_elts;
+ int32_t index_bias;
+ uint16_t scissor;
+ uint8_t vbo_mode; /* 0 = normal, 1 = translate, 3 = translate, forced */
+ uint8_t num_vtxbufs;
+ uint8_t num_vtxelts;
+ uint8_t num_textures[6];
+ uint8_t num_samplers[6];
+ uint8_t tls_required; /* bitmask of shader types using l[] */
+ uint8_t c14_bound; /* whether immediate array constbuf is bound */
+ uint8_t clip_enable;
+ uint32_t clip_mode;
+ uint32_t uniform_buffer_bound[5];
+ struct nvc0_transform_feedback_state *tfb;
+};
+
struct nvc0_screen {
struct nouveau_screen base;
struct nvc0_context *cur_ctx;
+ struct nvc0_graph_state save_state;
int num_occlusion_queries_active;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
index 516b33b76d5..e0842784a88 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
@@ -34,7 +34,7 @@ nvc0_program_update_context_state(struct nvc0_context *nvc0,
struct nouveau_pushbuf *push = nvc0->base.pushbuf;
if (prog && prog->need_tls) {
- const uint32_t flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RDWR;
+ const uint32_t flags = NV_VRAM_DOMAIN(&nvc0->screen->base) | NOUVEAU_BO_RDWR;
if (!nvc0->state.tls_required)
BCTX_REFN_bo(nvc0->bufctx_3d, TLS, flags, nvc0->screen->tls);
nvc0->state.tls_required |= 1 << stage;
@@ -262,11 +262,13 @@ nvc0_tfb_validate(struct nvc0_context *nvc0)
if (tfb)
targ->stride = tfb->stride[b];
+ buf = nv04_resource(targ->pipe.buffer);
+
+ BCTX_REFN(nvc0->bufctx_3d, TFB, buf, WR);
+
if (!(nvc0->tfbbuf_dirty & (1 << b)))
continue;
- buf = nv04_resource(targ->pipe.buffer);
-
if (!targ->clean)
nvc0_query_fifo_wait(push, targ->pq);
BEGIN_NVC0(push, NVC0_3D(TFB_BUFFER_ENABLE(b)), 5);
@@ -280,7 +282,6 @@ nvc0_tfb_validate(struct nvc0_context *nvc0)
PUSH_DATA(push, 0); /* TFB_BUFFER_OFFSET */
targ->clean = FALSE;
}
- BCTX_REFN(nvc0->bufctx_3d, TFB, buf, WR);
}
for (; b < 4; ++b)
IMMED_NVC0(push, NVC0_3D(TFB_BUFFER_ENABLE(b)), 0);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
index dca06f4cddb..6b7a211e71b 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
@@ -413,24 +413,6 @@ nvc0_zsa_state_delete(struct pipe_context *pipe, void *hwcso)
#define NV50_TSC_WRAP_CASE(n) \
case PIPE_TEX_WRAP_##n: return NV50_TSC_WRAP_##n
-static INLINE unsigned
-nv50_tsc_wrap_mode(unsigned wrap)
-{
- switch (wrap) {
- NV50_TSC_WRAP_CASE(REPEAT);
- NV50_TSC_WRAP_CASE(MIRROR_REPEAT);
- NV50_TSC_WRAP_CASE(CLAMP_TO_EDGE);
- NV50_TSC_WRAP_CASE(CLAMP_TO_BORDER);
- NV50_TSC_WRAP_CASE(CLAMP);
- NV50_TSC_WRAP_CASE(MIRROR_CLAMP_TO_EDGE);
- NV50_TSC_WRAP_CASE(MIRROR_CLAMP_TO_BORDER);
- NV50_TSC_WRAP_CASE(MIRROR_CLAMP);
- default:
- NOUVEAU_ERR("unknown wrap mode: %d\n", wrap);
- return NV50_TSC_WRAP_REPEAT;
- }
-}
-
static void
nvc0_sampler_state_delete(struct pipe_context *pipe, void *hwcso)
{
@@ -811,12 +793,12 @@ nvc0_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
nvc0->constbuf[s][i].user = (cb && cb->user_buffer) ? TRUE : FALSE;
if (nvc0->constbuf[s][i].user) {
nvc0->constbuf[s][i].u.data = cb->user_buffer;
- nvc0->constbuf[s][i].size = cb->buffer_size;
+ nvc0->constbuf[s][i].size = MIN2(cb->buffer_size, 0x10000);
nvc0->constbuf_valid[s] |= 1 << i;
} else
if (cb) {
nvc0->constbuf[s][i].offset = cb->buffer_offset;
- nvc0->constbuf[s][i].size = align(cb->buffer_size, 0x100);
+ nvc0->constbuf[s][i].size = MIN2(align(cb->buffer_size, 0x100), 0x10000);
nvc0->constbuf_valid[s] |= 1 << i;
}
else {
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
index 6051f128f66..c52399ab312 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
@@ -439,7 +439,7 @@ nvc0_constbufs_validate(struct nvc0_context *nvc0)
BEGIN_NVC0(push, NVC0_3D(CB_BIND(s)), 1);
PUSH_DATA (push, (0 << 4) | 1);
}
- nvc0_cb_push(&nvc0->base, bo, NOUVEAU_BO_VRAM,
+ nvc0_cb_push(&nvc0->base, bo, NV_VRAM_DOMAIN(&nvc0->screen->base),
base, nvc0->state.uniform_buffer_bound[s],
0, (size + 3) / 4,
nvc0->constbuf[s][0].u.data);
@@ -543,6 +543,8 @@ nvc0_switch_pipe_context(struct nvc0_context *ctx_to)
if (ctx_from)
ctx_to->state = ctx_from->state;
+ else
+ ctx_to->state = ctx_to->screen->save_state;
ctx_to->dirty = ~0;
ctx_to->viewports_dirty = ~0;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
index 4404d8c1a74..a820de7259a 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
@@ -1152,6 +1152,12 @@ nvc0_blit_3d(struct nvc0_context *nvc0, const struct pipe_blit_info *info)
NVC0_3D_VERTEX_ATTRIB_FORMAT_SIZE_32 |
NVC0_3D_VERTEX_ATTRIB_FORMAT_CONST);
}
+ if (nvc0->state.instance_elts) {
+ nvc0->state.instance_elts = 0;
+ BEGIN_NVC0(push, NVC0_3D(MACRO_VERTEX_ARRAY_PER_INSTANCE), 2);
+ PUSH_DATA (push, n);
+ PUSH_DATA (push, 0);
+ }
nvc0->state.num_vtxelts = 2;
for (i = 0; i < info->dst.box.depth; ++i, z += dz) {
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
index 457f27c8311..ddc0409ca86 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
@@ -396,7 +396,7 @@ nvc0_validate_tsc(struct nvc0_context *nvc0, int s)
tsc->id = nvc0_screen_tsc_alloc(nvc0->screen, tsc);
nvc0_m2mf_push_linear(&nvc0->base, nvc0->screen->txc,
- 65536 + tsc->id * 32, NOUVEAU_BO_VRAM,
+ 65536 + tsc->id * 32, NV_VRAM_DOMAIN(&nvc0->screen->base),
32, tsc->tsc);
need_flush = TRUE;
}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
index 657b8c0fe82..8cf2584b0ce 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
@@ -829,6 +829,7 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
}
PUSH_DATA(push, nvc0_prim_gl(info->mode));
#define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8))
+ PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain);
nouveau_pushbuf_space(push, 0, 0, 1);
nouveau_pushbuf_data(push,
buf->bo, offset, NVC0_IB_ENTRY_1_NO_PREFETCH | size);
diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
index f243316b899..fce02a7cc57 100644
--- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
+++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
@@ -63,7 +63,7 @@ nve4_screen_compute_setup(struct nvc0_screen *screen,
return ret;
}
- ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, NVE4_CP_PARAM_SIZE, NULL,
+ ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 0, NVE4_CP_PARAM_SIZE, NULL,
&screen->parm);
if (ret)
return ret;
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index a7b59d8bfbb..a7bca915f57 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -190,6 +190,7 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
case PIPE_CAP_POLYGON_OFFSET_CLAMP:
case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
+ case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
return 0;
/* SWTCL-only features. */
@@ -273,6 +274,7 @@ static int r300_get_shader_param(struct pipe_screen *pscreen, unsigned shader, e
case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE:
return (is_r500 ? 256 : 32) * sizeof(float[4]);
case PIPE_SHADER_CAP_MAX_CONST_BUFFERS:
+ case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
return 1;
case PIPE_SHADER_CAP_MAX_TEMPS:
return is_r500 ? 128 : is_r400 ? 64 : 32;
@@ -332,6 +334,7 @@ static int r300_get_shader_param(struct pipe_screen *pscreen, unsigned shader, e
case PIPE_SHADER_CAP_MAX_PREDS:
return 0; /* unused */
case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR:
+ case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
return 1;
case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS:
case PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS:
diff --git a/src/gallium/drivers/r300/r300_tgsi_to_rc.c b/src/gallium/drivers/r300/r300_tgsi_to_rc.c
index 69afb4caeaa..23ed2cf2532 100644
--- a/src/gallium/drivers/r300/r300_tgsi_to_rc.c
+++ b/src/gallium/drivers/r300/r300_tgsi_to_rc.c
@@ -133,13 +133,7 @@ static unsigned translate_opcode(unsigned opcode)
static unsigned translate_saturate(unsigned saturate)
{
- switch(saturate) {
- default:
- fprintf(stderr, "Unknown saturate mode: %i\n", saturate);
- /* fall-through */
- case TGSI_SAT_NONE: return RC_SATURATE_NONE;
- case TGSI_SAT_ZERO_ONE: return RC_SATURATE_ZERO_ONE;
- }
+ return saturate ? RC_SATURATE_ZERO_ONE : RC_SATURATE_NONE;
}
static unsigned translate_register_file(unsigned file)
diff --git a/src/gallium/drivers/r600/Android.mk b/src/gallium/drivers/r600/Android.mk
index e9357597a9b..bfe39873089 100644
--- a/src/gallium/drivers/r600/Android.mk
+++ b/src/gallium/drivers/r600/Android.mk
@@ -33,6 +33,10 @@ LOCAL_SRC_FILES := $(C_SOURCES) $(CXX_SOURCES)
LOCAL_SHARED_LIBRARIES := libdrm libdrm_radeon
LOCAL_MODULE := libmesa_pipe_r600
+ifeq ($(MESA_LOLLIPOP_BUILD),true)
+LOCAL_C_INCLUDES := external/libcxx/include
+else
include external/stlport/libstlport.mk
+endif
include $(GALLIUM_COMMON_MK)
include $(BUILD_STATIC_LIBRARY)
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index 21e5d42adc3..e122b607b86 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -332,6 +332,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
case PIPE_CAP_SAMPLER_VIEW_TARGET:
case PIPE_CAP_VERTEXID_NOBASE:
+ case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
return 0;
/* Stream output. */
@@ -475,6 +476,7 @@ static int r600_get_shader_param(struct pipe_screen* pscreen, unsigned shader, e
case PIPE_SHADER_CAP_SUBROUTINES:
return 0;
case PIPE_SHADER_CAP_INTEGERS:
+ case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
return 1;
case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index 87b6e6e06ec..af7622e9b34 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -617,98 +617,100 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx)
switch (d->Declaration.File) {
case TGSI_FILE_INPUT:
- i = ctx->shader->ninput;
- assert(i < Elements(ctx->shader->input));
- ctx->shader->ninput += count;
- ctx->shader->input[i].name = d->Semantic.Name;
- ctx->shader->input[i].sid = d->Semantic.Index;
- ctx->shader->input[i].interpolate = d->Interp.Interpolate;
- ctx->shader->input[i].interpolate_location = d->Interp.Location;
- ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First;
- if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
- ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
- switch (ctx->shader->input[i].name) {
- case TGSI_SEMANTIC_FACE:
- if (ctx->face_gpr != -1)
- ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */
- else
- ctx->face_gpr = ctx->shader->input[i].gpr;
- break;
- case TGSI_SEMANTIC_COLOR:
- ctx->colors_used++;
- break;
- case TGSI_SEMANTIC_POSITION:
- ctx->fragcoord_input = i;
- break;
- case TGSI_SEMANTIC_PRIMID:
- /* set this for now */
- ctx->shader->gs_prim_id_input = true;
- ctx->shader->ps_prim_id_input = i;
- break;
- }
- if (ctx->bc->chip_class >= EVERGREEN) {
- if ((r = evergreen_interp_input(ctx, i)))
- return r;
+ for (j = 0; j < count; j++) {
+ i = ctx->shader->ninput + j;
+ assert(i < Elements(ctx->shader->input));
+ ctx->shader->input[i].name = d->Semantic.Name;
+ ctx->shader->input[i].sid = d->Semantic.Index + j;
+ ctx->shader->input[i].interpolate = d->Interp.Interpolate;
+ ctx->shader->input[i].interpolate_location = d->Interp.Location;
+ ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First + j;
+ if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
+ ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
+ switch (ctx->shader->input[i].name) {
+ case TGSI_SEMANTIC_FACE:
+ if (ctx->face_gpr != -1)
+ ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */
+ else
+ ctx->face_gpr = ctx->shader->input[i].gpr;
+ break;
+ case TGSI_SEMANTIC_COLOR:
+ ctx->colors_used++;
+ break;
+ case TGSI_SEMANTIC_POSITION:
+ ctx->fragcoord_input = i;
+ break;
+ case TGSI_SEMANTIC_PRIMID:
+ /* set this for now */
+ ctx->shader->gs_prim_id_input = true;
+ ctx->shader->ps_prim_id_input = i;
+ break;
+ }
+ if (ctx->bc->chip_class >= EVERGREEN) {
+ if ((r = evergreen_interp_input(ctx, i)))
+ return r;
+ }
+ } else if (ctx->type == TGSI_PROCESSOR_GEOMETRY) {
+ /* FIXME probably skip inputs if they aren't passed in the ring */
+ ctx->shader->input[i].ring_offset = ctx->next_ring_offset;
+ ctx->next_ring_offset += 16;
+ if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID)
+ ctx->shader->gs_prim_id_input = true;
}
- } else if (ctx->type == TGSI_PROCESSOR_GEOMETRY) {
- /* FIXME probably skip inputs if they aren't passed in the ring */
- ctx->shader->input[i].ring_offset = ctx->next_ring_offset;
- ctx->next_ring_offset += 16;
- if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID)
- ctx->shader->gs_prim_id_input = true;
- }
- for (j = 1; j < count; ++j) {
- ctx->shader->input[i + j] = ctx->shader->input[i];
- ctx->shader->input[i + j].gpr += j;
}
+ ctx->shader->ninput += count;
break;
case TGSI_FILE_OUTPUT:
- i = ctx->shader->noutput++;
- assert(i < Elements(ctx->shader->output));
- ctx->shader->output[i].name = d->Semantic.Name;
- ctx->shader->output[i].sid = d->Semantic.Index;
- ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First;
- ctx->shader->output[i].interpolate = d->Interp.Interpolate;
- ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
- if (ctx->type == TGSI_PROCESSOR_VERTEX ||
- ctx->type == TGSI_PROCESSOR_GEOMETRY) {
- ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
- switch (d->Semantic.Name) {
- case TGSI_SEMANTIC_CLIPDIST:
- ctx->shader->clip_dist_write |= d->Declaration.UsageMask << (d->Semantic.Index << 2);
- break;
- case TGSI_SEMANTIC_PSIZE:
- ctx->shader->vs_out_misc_write = 1;
- ctx->shader->vs_out_point_size = 1;
- break;
- case TGSI_SEMANTIC_EDGEFLAG:
- ctx->shader->vs_out_misc_write = 1;
- ctx->shader->vs_out_edgeflag = 1;
- ctx->edgeflag_output = i;
- break;
- case TGSI_SEMANTIC_VIEWPORT_INDEX:
- ctx->shader->vs_out_misc_write = 1;
- ctx->shader->vs_out_viewport = 1;
- break;
- case TGSI_SEMANTIC_LAYER:
- ctx->shader->vs_out_misc_write = 1;
- ctx->shader->vs_out_layer = 1;
- break;
- case TGSI_SEMANTIC_CLIPVERTEX:
- ctx->clip_vertex_write = TRUE;
- ctx->cv_output = i;
- break;
- }
- if (ctx->type == TGSI_PROCESSOR_GEOMETRY) {
- ctx->gs_out_ring_offset += 16;
- }
- } else if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
- switch (d->Semantic.Name) {
- case TGSI_SEMANTIC_COLOR:
- ctx->shader->nr_ps_max_color_exports++;
- break;
+ for (j = 0; j < count; j++) {
+ i = ctx->shader->noutput + j;
+ assert(i < Elements(ctx->shader->output));
+ ctx->shader->output[i].name = d->Semantic.Name;
+ ctx->shader->output[i].sid = d->Semantic.Index + j;
+ ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First + j;
+ ctx->shader->output[i].interpolate = d->Interp.Interpolate;
+ ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
+ if (ctx->type == TGSI_PROCESSOR_VERTEX ||
+ ctx->type == TGSI_PROCESSOR_GEOMETRY) {
+ ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
+ switch (d->Semantic.Name) {
+ case TGSI_SEMANTIC_CLIPDIST:
+ ctx->shader->clip_dist_write |= d->Declaration.UsageMask <<
+ ((d->Semantic.Index + j) << 2);
+ break;
+ case TGSI_SEMANTIC_PSIZE:
+ ctx->shader->vs_out_misc_write = 1;
+ ctx->shader->vs_out_point_size = 1;
+ break;
+ case TGSI_SEMANTIC_EDGEFLAG:
+ ctx->shader->vs_out_misc_write = 1;
+ ctx->shader->vs_out_edgeflag = 1;
+ ctx->edgeflag_output = i;
+ break;
+ case TGSI_SEMANTIC_VIEWPORT_INDEX:
+ ctx->shader->vs_out_misc_write = 1;
+ ctx->shader->vs_out_viewport = 1;
+ break;
+ case TGSI_SEMANTIC_LAYER:
+ ctx->shader->vs_out_misc_write = 1;
+ ctx->shader->vs_out_layer = 1;
+ break;
+ case TGSI_SEMANTIC_CLIPVERTEX:
+ ctx->clip_vertex_write = TRUE;
+ ctx->cv_output = i;
+ break;
+ }
+ if (ctx->type == TGSI_PROCESSOR_GEOMETRY) {
+ ctx->gs_out_ring_offset += 16;
+ }
+ } else if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
+ switch (d->Semantic.Name) {
+ case TGSI_SEMANTIC_COLOR:
+ ctx->shader->nr_ps_max_color_exports++;
+ break;
+ }
}
}
+ ctx->shader->noutput += count;
break;
case TGSI_FILE_TEMPORARY:
if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
@@ -723,6 +725,7 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx)
case TGSI_FILE_CONSTANT:
case TGSI_FILE_SAMPLER:
+ case TGSI_FILE_SAMPLER_VIEW:
case TGSI_FILE_ADDRESS:
break;
@@ -1337,7 +1340,7 @@ static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output
int i, j, r;
/* Sanity checking. */
- if (so->num_outputs > PIPE_MAX_SHADER_OUTPUTS) {
+ if (so->num_outputs > PIPE_MAX_SO_OUTPUTS) {
R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
r = -EINVAL;
goto out_err;
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index c50c7055851..13dc9ee8c10 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -95,22 +95,23 @@ static void r600_texture_barrier(struct pipe_context *ctx)
static unsigned r600_conv_pipe_prim(unsigned prim)
{
static const unsigned prim_conv[] = {
- V_008958_DI_PT_POINTLIST,
- V_008958_DI_PT_LINELIST,
- V_008958_DI_PT_LINELOOP,
- V_008958_DI_PT_LINESTRIP,
- V_008958_DI_PT_TRILIST,
- V_008958_DI_PT_TRISTRIP,
- V_008958_DI_PT_TRIFAN,
- V_008958_DI_PT_QUADLIST,
- V_008958_DI_PT_QUADSTRIP,
- V_008958_DI_PT_POLYGON,
- V_008958_DI_PT_LINELIST_ADJ,
- V_008958_DI_PT_LINESTRIP_ADJ,
- V_008958_DI_PT_TRILIST_ADJ,
- V_008958_DI_PT_TRISTRIP_ADJ,
- V_008958_DI_PT_RECTLIST
+ [PIPE_PRIM_POINTS] = V_008958_DI_PT_POINTLIST,
+ [PIPE_PRIM_LINES] = V_008958_DI_PT_LINELIST,
+ [PIPE_PRIM_LINE_LOOP] = V_008958_DI_PT_LINELOOP,
+ [PIPE_PRIM_LINE_STRIP] = V_008958_DI_PT_LINESTRIP,
+ [PIPE_PRIM_TRIANGLES] = V_008958_DI_PT_TRILIST,
+ [PIPE_PRIM_TRIANGLE_STRIP] = V_008958_DI_PT_TRISTRIP,
+ [PIPE_PRIM_TRIANGLE_FAN] = V_008958_DI_PT_TRIFAN,
+ [PIPE_PRIM_QUADS] = V_008958_DI_PT_QUADLIST,
+ [PIPE_PRIM_QUAD_STRIP] = V_008958_DI_PT_QUADSTRIP,
+ [PIPE_PRIM_POLYGON] = V_008958_DI_PT_POLYGON,
+ [PIPE_PRIM_LINES_ADJACENCY] = V_008958_DI_PT_LINELIST_ADJ,
+ [PIPE_PRIM_LINE_STRIP_ADJACENCY] = V_008958_DI_PT_LINESTRIP_ADJ,
+ [PIPE_PRIM_TRIANGLES_ADJACENCY] = V_008958_DI_PT_TRILIST_ADJ,
+ [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = V_008958_DI_PT_TRISTRIP_ADJ,
+ [R600_PRIM_RECTANGLE_LIST] = V_008958_DI_PT_RECTLIST
};
+ assert(prim < Elements(prim_conv));
return prim_conv[prim];
}
diff --git a/src/gallium/drivers/radeon/Android.mk b/src/gallium/drivers/radeon/Android.mk
index d61579280ea..6997a6d3ec3 100644
--- a/src/gallium/drivers/radeon/Android.mk
+++ b/src/gallium/drivers/radeon/Android.mk
@@ -30,6 +30,10 @@ include $(CLEAR_VARS)
LOCAL_SRC_FILES := $(C_SOURCES)
+ifeq ($(MESA_ENABLE_LLVM),true)
+LOCAL_SRC_FILES += $(LLVM_C_FILES)
+endif
+
LOCAL_SHARED_LIBRARIES := libdrm libdrm_radeon
LOCAL_MODULE := libmesa_pipe_radeon
diff --git a/src/gallium/drivers/radeon/Makefile.sources b/src/gallium/drivers/radeon/Makefile.sources
index c655fe5787b..f63790c329e 100644
--- a/src/gallium/drivers/radeon/Makefile.sources
+++ b/src/gallium/drivers/radeon/Makefile.sources
@@ -12,6 +12,7 @@ C_SOURCES := \
radeon_uvd.c \
radeon_uvd.h \
radeon_vce_40_2_2.c \
+ radeon_vce_50.c \
radeon_vce.c \
radeon_vce.h \
radeon_video.c \
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 42e681dc7d2..3def4446882 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -107,11 +107,10 @@ void r600_draw_rectangle(struct blitter_context *blitter,
void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw)
{
- /* The number of dwords we already used in the DMA so far. */
- num_dw += ctx->rings.dma.cs->cdw;
/* Flush if there's not enough space. */
- if (num_dw > RADEON_MAX_CMDBUF_DWORDS) {
+ if ((num_dw + ctx->rings.dma.cs->cdw) > RADEON_MAX_CMDBUF_DWORDS) {
ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+ assert((num_dw + ctx->rings.dma.cs->cdw) <= RADEON_MAX_CMDBUF_DWORDS);
}
}
diff --git a/src/gallium/drivers/radeon/radeon_llvm.h b/src/gallium/drivers/radeon/radeon_llvm.h
index 8612ef8daf7..6a9557b0b73 100644
--- a/src/gallium/drivers/radeon/radeon_llvm.h
+++ b/src/gallium/drivers/radeon/radeon_llvm.h
@@ -33,7 +33,6 @@
#define RADEON_LLVM_MAX_INPUTS 32 * 4
#define RADEON_LLVM_MAX_OUTPUTS 32 * 4
-#define RADEON_LLVM_MAX_ARRAYS 16
#define RADEON_LLVM_INITIAL_CF_DEPTH 4
@@ -130,8 +129,7 @@ struct radeon_llvm_context {
unsigned loop_depth;
unsigned loop_depth_max;
- struct tgsi_declaration_range arrays[RADEON_LLVM_MAX_ARRAYS];
- unsigned num_arrays;
+ struct tgsi_declaration_range *arrays;
LLVMValueRef main_fn;
diff --git a/src/gallium/drivers/radeon/radeon_llvm_emit.c b/src/gallium/drivers/radeon/radeon_llvm_emit.c
index 624077c7465..25580b6bd4c 100644
--- a/src/gallium/drivers/radeon/radeon_llvm_emit.c
+++ b/src/gallium/drivers/radeon/radeon_llvm_emit.c
@@ -86,10 +86,18 @@ static void init_r600_target()
{
static unsigned initialized = 0;
if (!initialized) {
+#if HAVE_LLVM < 0x0307
LLVMInitializeR600TargetInfo();
LLVMInitializeR600Target();
LLVMInitializeR600TargetMC();
LLVMInitializeR600AsmPrinter();
+#else
+ LLVMInitializeAMDGPUTargetInfo();
+ LLVMInitializeAMDGPUTarget();
+ LLVMInitializeAMDGPUTargetMC();
+ LLVMInitializeAMDGPUAsmPrinter();
+
+#endif
initialized = 1;
}
}
diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
index 20e506b7c5e..c8c980d9d32 100644
--- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
@@ -85,8 +85,9 @@ get_array_range(struct lp_build_tgsi_context *bld_base,
unsigned File, const struct tgsi_ind_register *reg)
{
struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
+
if (File != TGSI_FILE_TEMPORARY || reg->ArrayID == 0 ||
- reg->ArrayID > RADEON_LLVM_MAX_ARRAYS) {
+ reg->ArrayID > bld_base->info->array_max[TGSI_FILE_TEMPORARY]) {
struct tgsi_declaration_range range;
range.First = 0;
range.Last = bld_base->info->file_max[File];
@@ -252,8 +253,14 @@ static void emit_declaration(
}
case TGSI_FILE_TEMPORARY:
- if (decl->Declaration.Array && decl->Array.ArrayID <= RADEON_LLVM_MAX_ARRAYS)
+ if (decl->Declaration.Array) {
+ if (!ctx->arrays) {
+ int size = bld_base->info->array_max[TGSI_FILE_TEMPORARY];
+ ctx->arrays = MALLOC(sizeof(ctx->arrays[0]) * size);
+ }
+
ctx->arrays[decl->Array.ArrayID - 1] = decl->Range;
+ }
if (uses_temp_indirect_addressing(bld_base)) {
lp_emit_declaration_soa(bld_base, decl);
break;
@@ -314,6 +321,21 @@ static void emit_declaration(
}
}
+static LLVMValueRef radeon_llvm_saturate(struct lp_build_tgsi_context *bld_base,
+ LLVMValueRef value)
+{
+ struct lp_build_emit_data clamp_emit_data;
+
+ memset(&clamp_emit_data, 0, sizeof(clamp_emit_data));
+ clamp_emit_data.arg_count = 3;
+ clamp_emit_data.args[0] = value;
+ clamp_emit_data.args[2] = bld_base->base.one;
+ clamp_emit_data.args[1] = bld_base->base.zero;
+
+ return lp_build_emit_llvm(bld_base, TGSI_OPCODE_CLAMP,
+ &clamp_emit_data);
+}
+
static void
emit_store(
struct lp_build_tgsi_context * bld_base,
@@ -324,7 +346,6 @@ emit_store(
struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
- struct lp_build_context base = bld->bld_base.base;
const struct tgsi_full_dst_register *reg = &inst->Dst[0];
LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
LLVMValueRef temp_ptr;
@@ -350,28 +371,8 @@ emit_store(
TGSI_FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
LLVMValueRef value = dst[chan_index];
- if (inst->Instruction.Saturate != TGSI_SAT_NONE) {
- struct lp_build_emit_data clamp_emit_data;
-
- memset(&clamp_emit_data, 0, sizeof(clamp_emit_data));
- clamp_emit_data.arg_count = 3;
- clamp_emit_data.args[0] = value;
- clamp_emit_data.args[2] = base.one;
-
- switch(inst->Instruction.Saturate) {
- case TGSI_SAT_ZERO_ONE:
- clamp_emit_data.args[1] = base.zero;
- break;
- case TGSI_SAT_MINUS_PLUS_ONE:
- clamp_emit_data.args[1] = LLVMConstReal(
- base.elem_type, -1.0f);
- break;
- default:
- assert(0);
- }
- value = lp_build_emit_llvm(bld_base, TGSI_OPCODE_CLAMP,
- &clamp_emit_data);
- }
+ if (inst->Instruction.Saturate)
+ value = radeon_llvm_saturate(bld_base, value);
if (reg->Register.File == TGSI_FILE_ADDRESS) {
temp_ptr = bld->addr[reg->Register.Index][chan_index];
@@ -1438,8 +1439,6 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
/* Allocate outputs */
ctx->soa.outputs = ctx->outputs;
- ctx->num_arrays = 0;
-
/* XXX: Is there a better way to initialize all this ? */
lp_set_default_actions(bld_base);
@@ -1628,8 +1627,11 @@ void radeon_llvm_dispose(struct radeon_llvm_context * ctx)
{
LLVMDisposeModule(ctx->soa.bld_base.base.gallivm->module);
LLVMContextDispose(ctx->soa.bld_base.base.gallivm->context);
+ FREE(ctx->arrays);
+ ctx->arrays = NULL;
FREE(ctx->temps);
ctx->temps = NULL;
+ ctx->temps_count = 0;
FREE(ctx->loop);
ctx->loop = NULL;
ctx->loop_depth_max = 0;
diff --git a/src/gallium/drivers/radeon/radeon_vce.c b/src/gallium/drivers/radeon/radeon_vce.c
index e220f40165b..a6567379fe3 100644
--- a/src/gallium/drivers/radeon/radeon_vce.c
+++ b/src/gallium/drivers/radeon/radeon_vce.c
@@ -44,6 +44,10 @@
#include "radeon_video.h"
#include "radeon_vce.h"
+#define FW_40_2_2 ((40 << 24) | (2 << 16) | (2 << 8))
+#define FW_50_0_1 ((50 << 24) | (0 << 16) | (1 << 8))
+#define FW_50_1_2 ((50 << 24) | (1 << 16) | (2 << 8))
+
/**
* flush commands to the hardware
*/
@@ -183,6 +187,44 @@ static unsigned get_cpb_num(struct rvce_encoder *enc)
}
/**
+ * Get the slot for the currently encoded frame
+ */
+struct rvce_cpb_slot *current_slot(struct rvce_encoder *enc)
+{
+ return LIST_ENTRY(struct rvce_cpb_slot, enc->cpb_slots.prev, list);
+}
+
+/**
+ * Get the slot for L0
+ */
+struct rvce_cpb_slot *l0_slot(struct rvce_encoder *enc)
+{
+ return LIST_ENTRY(struct rvce_cpb_slot, enc->cpb_slots.next, list);
+}
+
+/**
+ * Get the slot for L1
+ */
+struct rvce_cpb_slot *l1_slot(struct rvce_encoder *enc)
+{
+ return LIST_ENTRY(struct rvce_cpb_slot, enc->cpb_slots.next->next, list);
+}
+
+/**
+ * Calculate the offsets into the CPB
+ */
+void rvce_frame_offset(struct rvce_encoder *enc, struct rvce_cpb_slot *slot,
+ unsigned *luma_offset, unsigned *chroma_offset)
+{
+ unsigned pitch = align(enc->luma->level[0].pitch_bytes, 128);
+ unsigned vpitch = align(enc->luma->npix_y, 16);
+ unsigned fsize = pitch * (vpitch + vpitch / 2);
+
+ *luma_offset = slot->index * fsize;
+ *chroma_offset = *luma_offset + pitch * vpitch;
+}
+
+/**
* destroy this video encoder
*/
static void rvce_destroy(struct pipe_video_codec *encoder)
@@ -406,7 +448,19 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
reset_cpb(enc);
- radeon_vce_40_2_2_init(enc);
+ switch (rscreen->info.vce_fw_version) {
+ case FW_40_2_2:
+ radeon_vce_40_2_2_init(enc);
+ break;
+
+ case FW_50_0_1:
+ case FW_50_1_2:
+ radeon_vce_50_init(enc);
+ break;
+
+ default:
+ goto error;
+ }
return &enc->base;
@@ -426,5 +480,7 @@ error:
*/
bool rvce_is_fw_version_supported(struct r600_common_screen *rscreen)
{
- return rscreen->info.vce_fw_version == ((40 << 24) | (2 << 16) | (2 << 8));
+ return rscreen->info.vce_fw_version == FW_40_2_2 ||
+ rscreen->info.vce_fw_version == FW_50_0_1 ||
+ rscreen->info.vce_fw_version == FW_50_1_2;
}
diff --git a/src/gallium/drivers/radeon/radeon_vce.h b/src/gallium/drivers/radeon/radeon_vce.h
index 1cf018006a8..8319ef48cd5 100644
--- a/src/gallium/drivers/radeon/radeon_vce.h
+++ b/src/gallium/drivers/radeon/radeon_vce.h
@@ -104,6 +104,13 @@ struct rvce_encoder {
bool use_vui;
};
+/* CPB handling functions */
+struct rvce_cpb_slot *current_slot(struct rvce_encoder *enc);
+struct rvce_cpb_slot *l0_slot(struct rvce_encoder *enc);
+struct rvce_cpb_slot *l1_slot(struct rvce_encoder *enc);
+void rvce_frame_offset(struct rvce_encoder *enc, struct rvce_cpb_slot *slot,
+ unsigned *luma_offset, unsigned *chroma_offset);
+
struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
const struct pipe_video_codec *templat,
struct radeon_winsys* ws,
@@ -114,4 +121,7 @@ bool rvce_is_fw_version_supported(struct r600_common_screen *rscreen);
/* init vce fw 40.2.2 specific callbacks */
void radeon_vce_40_2_2_init(struct rvce_encoder *enc);
+/* init vce fw 50 specific callbacks */
+void radeon_vce_50_init(struct rvce_encoder *enc);
+
#endif
diff --git a/src/gallium/drivers/radeon/radeon_vce_40_2_2.c b/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
index 09029575547..51b17b5f6a8 100644
--- a/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
+++ b/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
@@ -46,32 +46,6 @@
static const unsigned profiles[7] = { 66, 77, 88, 100, 110, 122, 244 };
-static struct rvce_cpb_slot *current_slot(struct rvce_encoder *enc)
-{
- return LIST_ENTRY(struct rvce_cpb_slot, enc->cpb_slots.prev, list);
-}
-
-static struct rvce_cpb_slot *l0_slot(struct rvce_encoder *enc)
-{
- return LIST_ENTRY(struct rvce_cpb_slot, enc->cpb_slots.next, list);
-}
-
-static struct rvce_cpb_slot *l1_slot(struct rvce_encoder *enc)
-{
- return LIST_ENTRY(struct rvce_cpb_slot, enc->cpb_slots.next->next, list);
-}
-
-static void frame_offset(struct rvce_encoder *enc, struct rvce_cpb_slot *slot,
- unsigned *luma_offset, unsigned *chroma_offset)
-{
- unsigned pitch = align(enc->luma->level[0].pitch_bytes, 128);
- unsigned vpitch = align(enc->luma->npix_y, 16);
- unsigned fsize = pitch * (vpitch + vpitch / 2);
-
- *luma_offset = slot->index * fsize;
- *chroma_offset = *luma_offset + pitch * vpitch;
-}
-
static void session(struct rvce_encoder *enc)
{
RVCE_BEGIN(0x00000001); // session cmd
@@ -369,7 +343,7 @@ static void encode(struct rvce_encoder *enc)
if(enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_P ||
enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_B) {
struct rvce_cpb_slot *l0 = l0_slot(enc);
- frame_offset(enc, l0, &luma_offset, &chroma_offset);
+ rvce_frame_offset(enc, l0, &luma_offset, &chroma_offset);
RVCE_CS(l0->picture_type); // encPicType
RVCE_CS(l0->frame_num); // frameNumber
RVCE_CS(l0->pic_order_cnt); // pictureOrderCount
@@ -395,7 +369,7 @@ static void encode(struct rvce_encoder *enc)
RVCE_CS(0x00000000); // pictureStructure
if(enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_B) {
struct rvce_cpb_slot *l1 = l1_slot(enc);
- frame_offset(enc, l1, &luma_offset, &chroma_offset);
+ rvce_frame_offset(enc, l1, &luma_offset, &chroma_offset);
RVCE_CS(l1->picture_type); // encPicType
RVCE_CS(l1->frame_num); // frameNumber
RVCE_CS(l1->pic_order_cnt); // pictureOrderCount
@@ -409,7 +383,7 @@ static void encode(struct rvce_encoder *enc)
RVCE_CS(0xffffffff); // chromaOffset
}
- frame_offset(enc, current_slot(enc), &luma_offset, &chroma_offset);
+ rvce_frame_offset(enc, current_slot(enc), &luma_offset, &chroma_offset);
RVCE_CS(luma_offset); // encReconstructedLumaOffset
RVCE_CS(chroma_offset); // encReconstructedChromaOffset
RVCE_CS(0x00000000); // encColocBufferOffset
diff --git a/src/gallium/drivers/radeon/radeon_vce_50.c b/src/gallium/drivers/radeon/radeon_vce_50.c
new file mode 100644
index 00000000000..84a2bfb117e
--- /dev/null
+++ b/src/gallium/drivers/radeon/radeon_vce_50.c
@@ -0,0 +1,228 @@
+/**************************************************************************
+ *
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/*
+ * Authors:
+ * Christian König <[email protected]>
+ *
+ */
+
+#include <stdio.h>
+
+#include "pipe/p_video_codec.h"
+
+#include "util/u_video.h"
+#include "util/u_memory.h"
+
+#include "vl/vl_video_buffer.h"
+
+#include "r600_pipe_common.h"
+#include "radeon_video.h"
+#include "radeon_vce.h"
+
+static void task_info(struct rvce_encoder *enc, uint32_t taskOperation)
+{
+ RVCE_BEGIN(0x00000002); // task info
+ RVCE_CS(0xffffffff); // offsetOfNextTaskInfo
+ RVCE_CS(taskOperation); // taskOperation
+ RVCE_CS(0x00000000); // referencePictureDependency
+ RVCE_CS(0x00000000); // collocateFlagDependency
+ RVCE_CS(0x00000000); // feedbackIndex
+ RVCE_CS(0x00000000); // videoBitstreamRingIndex
+ RVCE_END();
+}
+
+static void rate_control(struct rvce_encoder *enc)
+{
+ RVCE_BEGIN(0x04000005); // rate control
+ RVCE_CS(enc->pic.rate_ctrl.rate_ctrl_method); // encRateControlMethod
+ RVCE_CS(enc->pic.rate_ctrl.target_bitrate); // encRateControlTargetBitRate
+ RVCE_CS(enc->pic.rate_ctrl.peak_bitrate); // encRateControlPeakBitRate
+ RVCE_CS(enc->pic.rate_ctrl.frame_rate_num); // encRateControlFrameRateNum
+ RVCE_CS(0x00000000); // encGOPSize
+ RVCE_CS(enc->pic.quant_i_frames); // encQP_I
+ RVCE_CS(enc->pic.quant_p_frames); // encQP_P
+ RVCE_CS(enc->pic.quant_b_frames); // encQP_B
+ RVCE_CS(enc->pic.rate_ctrl.vbv_buffer_size); // encVBVBufferSize
+ RVCE_CS(enc->pic.rate_ctrl.frame_rate_den); // encRateControlFrameRateDen
+ RVCE_CS(0x00000000); // encVBVBufferLevel
+ RVCE_CS(0x00000000); // encMaxAUSize
+ RVCE_CS(0x00000000); // encQPInitialMode
+ RVCE_CS(enc->pic.rate_ctrl.target_bits_picture); // encTargetBitsPerPicture
+ RVCE_CS(enc->pic.rate_ctrl.peak_bits_picture_integer); // encPeakBitsPerPictureInteger
+ RVCE_CS(enc->pic.rate_ctrl.peak_bits_picture_fraction); // encPeakBitsPerPictureFractional
+ RVCE_CS(0x00000000); // encMinQP
+ RVCE_CS(0x00000033); // encMaxQP
+ RVCE_CS(0x00000000); // encSkipFrameEnable
+ RVCE_CS(0x00000000); // encFillerDataEnable
+ RVCE_CS(0x00000000); // encEnforceHRD
+ RVCE_CS(0x00000000); // encBPicsDeltaQP
+ RVCE_CS(0x00000000); // encReferenceBPicsDeltaQP
+ RVCE_CS(0x00000000); // encRateControlReInitDisable
+ RVCE_CS(0x00000000); // encLCVBRInitQPFlag
+ RVCE_CS(0x00000000); // encLCVBRSATDBasedNonlinearBitBudgetFlag
+ RVCE_END();
+}
+
+static void encode(struct rvce_encoder *enc)
+{
+ int i;
+ unsigned luma_offset, chroma_offset;
+
+ task_info(enc, 0x00000003);
+
+ RVCE_BEGIN(0x05000001); // context buffer
+ RVCE_READWRITE(enc->cpb.res->cs_buf, enc->cpb.res->domains); // encodeContextAddressHi
+ RVCE_CS(0x00000000); // encodeContextAddressLo
+ RVCE_END();
+
+ RVCE_BEGIN(0x05000004); // video bitstream buffer
+ RVCE_WRITE(enc->bs_handle, RADEON_DOMAIN_GTT); // videoBitstreamRingAddressHi
+ RVCE_CS(0x00000000); // videoBitstreamRingAddressLo
+ RVCE_CS(enc->bs_size); // videoBitstreamRingSize
+ RVCE_END();
+
+ RVCE_BEGIN(0x03000001); // encode
+ RVCE_CS(enc->pic.frame_num ? 0x0 : 0x11); // insertHeaders
+ RVCE_CS(0x00000000); // pictureStructure
+ RVCE_CS(enc->bs_size); // allowedMaxBitstreamSize
+ RVCE_CS(0x00000000); // forceRefreshMap
+ RVCE_CS(0x00000000); // insertAUD
+ RVCE_CS(0x00000000); // endOfSequence
+ RVCE_CS(0x00000000); // endOfStream
+ RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM); // inputPictureLumaAddressHi
+ RVCE_CS(enc->luma->level[0].offset); // inputPictureLumaAddressLo
+ RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM); // inputPictureChromaAddressHi
+ RVCE_CS(enc->chroma->level[0].offset); // inputPictureChromaAddressLo
+ RVCE_CS(align(enc->luma->npix_y, 16)); // encInputFrameYPitch
+ RVCE_CS(enc->luma->level[0].pitch_bytes); // encInputPicLumaPitch
+ RVCE_CS(enc->chroma->level[0].pitch_bytes); // encInputPicChromaPitch
+ RVCE_CS(0x00010000); // encInputPic(Addr|Array)Mode,encDisable(TwoPipeMode|MBOffloading)
+ RVCE_CS(0x00000000); // encInputPicTileConfig
+ RVCE_CS(enc->pic.picture_type); // encPicType
+ RVCE_CS(enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_IDR); // encIdrFlag
+ RVCE_CS(0x00000000); // encIdrPicId
+ RVCE_CS(0x00000000); // encMGSKeyPic
+ RVCE_CS(!enc->pic.not_referenced); // encReferenceFlag
+ RVCE_CS(0x00000000); // encTemporalLayerIndex
+ RVCE_CS(0x00000000); // num_ref_idx_active_override_flag
+ RVCE_CS(0x00000000); // num_ref_idx_l0_active_minus1
+ RVCE_CS(0x00000000); // num_ref_idx_l1_active_minus1
+
+ i = enc->pic.frame_num - enc->pic.ref_idx_l0;
+ if (i > 1 && enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_P) {
+ RVCE_CS(0x00000001); // encRefListModificationOp
+ RVCE_CS(i - 1); // encRefListModificationNum
+ } else {
+ RVCE_CS(0x00000000); // encRefListModificationOp
+ RVCE_CS(0x00000000); // encRefListModificationNum
+ }
+
+ for (i = 0; i < 3; ++i) {
+ RVCE_CS(0x00000000); // encRefListModificationOp
+ RVCE_CS(0x00000000); // encRefListModificationNum
+ }
+ for (i = 0; i < 4; ++i) {
+ RVCE_CS(0x00000000); // encDecodedPictureMarkingOp
+ RVCE_CS(0x00000000); // encDecodedPictureMarkingNum
+ RVCE_CS(0x00000000); // encDecodedPictureMarkingIdx
+ RVCE_CS(0x00000000); // encDecodedRefBasePictureMarkingOp
+ RVCE_CS(0x00000000); // encDecodedRefBasePictureMarkingNum
+ }
+
+ // encReferencePictureL0[0]
+ RVCE_CS(0x00000000); // pictureStructure
+ if(enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_P ||
+ enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_B) {
+ struct rvce_cpb_slot *l0 = l0_slot(enc);
+ rvce_frame_offset(enc, l0, &luma_offset, &chroma_offset);
+ RVCE_CS(l0->picture_type); // encPicType
+ RVCE_CS(l0->frame_num); // frameNumber
+ RVCE_CS(l0->pic_order_cnt); // pictureOrderCount
+ RVCE_CS(luma_offset); // lumaOffset
+ RVCE_CS(chroma_offset); // chromaOffset
+ } else {
+ RVCE_CS(0x00000000); // encPicType
+ RVCE_CS(0x00000000); // frameNumber
+ RVCE_CS(0x00000000); // pictureOrderCount
+ RVCE_CS(0xffffffff); // lumaOffset
+ RVCE_CS(0xffffffff); // chromaOffset
+ }
+
+ // encReferencePictureL0[1]
+ RVCE_CS(0x00000000); // pictureStructure
+ RVCE_CS(0x00000000); // encPicType
+ RVCE_CS(0x00000000); // frameNumber
+ RVCE_CS(0x00000000); // pictureOrderCount
+ RVCE_CS(0xffffffff); // lumaOffset
+ RVCE_CS(0xffffffff); // chromaOffset
+
+ // encReferencePictureL1[0]
+ RVCE_CS(0x00000000); // pictureStructure
+ if(enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_B) {
+ struct rvce_cpb_slot *l1 = l1_slot(enc);
+ rvce_frame_offset(enc, l1, &luma_offset, &chroma_offset);
+ RVCE_CS(l1->picture_type); // encPicType
+ RVCE_CS(l1->frame_num); // frameNumber
+ RVCE_CS(l1->pic_order_cnt); // pictureOrderCount
+ RVCE_CS(luma_offset); // lumaOffset
+ RVCE_CS(chroma_offset); // chromaOffset
+ } else {
+ RVCE_CS(0x00000000); // encPicType
+ RVCE_CS(0x00000000); // frameNumber
+ RVCE_CS(0x00000000); // pictureOrderCount
+ RVCE_CS(0xffffffff); // lumaOffset
+ RVCE_CS(0xffffffff); // chromaOffset
+ }
+
+ rvce_frame_offset(enc, current_slot(enc), &luma_offset, &chroma_offset);
+ RVCE_CS(luma_offset); // encReconstructedLumaOffset
+ RVCE_CS(chroma_offset); // encReconstructedChromaOffset
+ RVCE_CS(0x00000000); // encColocBufferOffset
+ RVCE_CS(0x00000000); // encReconstructedRefBasePictureLumaOffset
+ RVCE_CS(0x00000000); // encReconstructedRefBasePictureChromaOffset
+ RVCE_CS(0x00000000); // encReferenceRefBasePictureLumaOffset
+ RVCE_CS(0x00000000); // encReferenceRefBasePictureChromaOffset
+ RVCE_CS(0x00000000); // pictureCount
+ RVCE_CS(enc->pic.frame_num); // frameNumber
+ RVCE_CS(enc->pic.pic_order_cnt); // pictureOrderCount
+ RVCE_CS(0x00000000); // numIPicRemainInRCGOP
+ RVCE_CS(0x00000000); // numPPicRemainInRCGOP
+ RVCE_CS(0x00000000); // numBPicRemainInRCGOP
+ RVCE_CS(0x00000000); // numIRPicRemainInRCGOP
+ RVCE_CS(0x00000000); // enableIntraRefresh
+ RVCE_END();
+}
+
+void radeon_vce_50_init(struct rvce_encoder *enc)
+{
+ radeon_vce_40_2_2_init(enc);
+
+ /* only the two below are different */
+ enc->rate_control = rate_control;
+ enc->encode = encode;
+}
diff --git a/src/gallium/drivers/radeonsi/Makefile.sources b/src/gallium/drivers/radeonsi/Makefile.sources
index 774dc2285c0..2876c0ae735 100644
--- a/src/gallium/drivers/radeonsi/Makefile.sources
+++ b/src/gallium/drivers/radeonsi/Makefile.sources
@@ -1,4 +1,5 @@
C_SOURCES := \
+ cik_sdma.c \
si_blit.c \
si_commands.c \
si_compute.c \
diff --git a/src/gallium/drivers/radeonsi/cik_sdma.c b/src/gallium/drivers/radeonsi/cik_sdma.c
new file mode 100644
index 00000000000..86111cb86e8
--- /dev/null
+++ b/src/gallium/drivers/radeonsi/cik_sdma.c
@@ -0,0 +1,364 @@
+/*
+ * Copyright 2010 Jerome Glisse <[email protected]>
+ * Copyright 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Jerome Glisse
+ */
+
+#include "sid.h"
+#include "si_pipe.h"
+#include "../radeon/r600_cs.h"
+
+#include "util/u_format.h"
+
+static uint32_t cik_micro_tile_mode(struct si_screen *sscreen, unsigned tile_mode)
+{
+ if (sscreen->b.info.si_tile_mode_array_valid) {
+ uint32_t gb_tile_mode = sscreen->b.info.si_tile_mode_array[tile_mode];
+
+ return G_009910_MICRO_TILE_MODE_NEW(gb_tile_mode);
+ }
+
+ /* The kernel cannot return the tile mode array. Guess? */
+ return V_009910_ADDR_SURF_THIN_MICRO_TILING;
+}
+
+static void cik_sdma_do_copy_buffer(struct si_context *ctx,
+ struct pipe_resource *dst,
+ struct pipe_resource *src,
+ uint64_t dst_offset,
+ uint64_t src_offset,
+ uint64_t size)
+{
+ struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs;
+ unsigned i, ncopy, csize;
+ struct r600_resource *rdst = (struct r600_resource*)dst;
+ struct r600_resource *rsrc = (struct r600_resource*)src;
+
+ dst_offset += r600_resource(dst)->gpu_address;
+ src_offset += r600_resource(src)->gpu_address;
+
+ ncopy = (size + CIK_SDMA_COPY_MAX_SIZE - 1) / CIK_SDMA_COPY_MAX_SIZE;
+ r600_need_dma_space(&ctx->b, ncopy * 7);
+
+ r600_context_bo_reloc(&ctx->b, &ctx->b.rings.dma, rsrc, RADEON_USAGE_READ,
+ RADEON_PRIO_MIN);
+ r600_context_bo_reloc(&ctx->b, &ctx->b.rings.dma, rdst, RADEON_USAGE_WRITE,
+ RADEON_PRIO_MIN);
+
+ for (i = 0; i < ncopy; i++) {
+ csize = size < CIK_SDMA_COPY_MAX_SIZE ? size : CIK_SDMA_COPY_MAX_SIZE;
+ cs->buf[cs->cdw++] = CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY,
+ CIK_SDMA_COPY_SUB_OPCODE_LINEAR,
+ 0);
+ cs->buf[cs->cdw++] = csize;
+ cs->buf[cs->cdw++] = 0; /* src/dst endian swap */
+ cs->buf[cs->cdw++] = src_offset;
+ cs->buf[cs->cdw++] = src_offset >> 32;
+ cs->buf[cs->cdw++] = dst_offset;
+ cs->buf[cs->cdw++] = dst_offset >> 32;
+ dst_offset += csize;
+ src_offset += csize;
+ size -= csize;
+ }
+}
+
+static void cik_sdma_copy_buffer(struct si_context *ctx,
+ struct pipe_resource *dst,
+ struct pipe_resource *src,
+ uint64_t dst_offset,
+ uint64_t src_offset,
+ uint64_t size)
+{
+ struct r600_resource *rdst = (struct r600_resource*)dst;
+
+ /* Mark the buffer range of destination as valid (initialized),
+ * so that transfer_map knows it should wait for the GPU when mapping
+ * that range. */
+ util_range_add(&rdst->valid_buffer_range, dst_offset,
+ dst_offset + size);
+
+ cik_sdma_do_copy_buffer(ctx, dst, src, dst_offset, src_offset, size);
+}
+
+static void cik_sdma_copy_tile(struct si_context *ctx,
+ struct pipe_resource *dst,
+ unsigned dst_level,
+ struct pipe_resource *src,
+ unsigned src_level,
+ unsigned y,
+ unsigned copy_height,
+ unsigned y_align,
+ unsigned pitch,
+ unsigned bpe)
+{
+ struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs;
+ struct si_screen *sscreen = ctx->screen;
+ struct r600_texture *rsrc = (struct r600_texture*)src;
+ struct r600_texture *rdst = (struct r600_texture*)dst;
+ struct r600_texture *rlinear, *rtiled;
+ unsigned linear_lvl, tiled_lvl;
+ unsigned array_mode, lbpe, pitch_tile_max, slice_tile_max, size;
+ unsigned ncopy, height, cheight, detile, i, src_mode, dst_mode;
+ unsigned sub_op, bank_h, bank_w, mt_aspect, nbanks, tile_split, mt;
+ uint64_t base, addr;
+ unsigned pipe_config, tile_mode_index;
+
+ dst_mode = rdst->surface.level[dst_level].mode;
+ src_mode = rsrc->surface.level[src_level].mode;
+ /* downcast linear aligned to linear to simplify test */
+ src_mode = src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : src_mode;
+ dst_mode = dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : dst_mode;
+ assert(dst_mode != src_mode);
+ assert(src_mode == RADEON_SURF_MODE_LINEAR || dst_mode == RADEON_SURF_MODE_LINEAR);
+
+ sub_op = CIK_SDMA_COPY_SUB_OPCODE_TILED;
+ lbpe = util_logbase2(bpe);
+ pitch_tile_max = ((pitch / bpe) / 8) - 1;
+
+ detile = dst_mode == RADEON_SURF_MODE_LINEAR;
+ rlinear = detile ? rdst : rsrc;
+ rtiled = detile ? rsrc : rdst;
+ linear_lvl = detile ? dst_level : src_level;
+ tiled_lvl = detile ? src_level : dst_level;
+
+ assert(!util_format_is_depth_and_stencil(rtiled->resource.b.b.format));
+
+ array_mode = si_array_mode(rtiled->surface.level[tiled_lvl].mode);
+ slice_tile_max = (rtiled->surface.level[tiled_lvl].nblk_x *
+ rtiled->surface.level[tiled_lvl].nblk_y) / (8*8) - 1;
+ height = rlinear->surface.level[linear_lvl].nblk_y;
+ base = rtiled->surface.level[tiled_lvl].offset;
+ addr = rlinear->surface.level[linear_lvl].offset;
+ bank_h = cik_bank_wh(rtiled->surface.bankh);
+ bank_w = cik_bank_wh(rtiled->surface.bankw);
+ mt_aspect = cik_macro_tile_aspect(rtiled->surface.mtilea);
+ tile_split = cik_tile_split(rtiled->surface.tile_split);
+ tile_mode_index = si_tile_mode_index(rtiled, tiled_lvl, false);
+ nbanks = si_num_banks(sscreen, rtiled);
+ base += rtiled->resource.gpu_address;
+ addr += rlinear->resource.gpu_address;
+
+ pipe_config = cik_db_pipe_config(sscreen, tile_mode_index);
+ mt = cik_micro_tile_mode(sscreen, tile_mode_index);
+
+ size = (copy_height * pitch) / 4;
+ cheight = copy_height;
+ if (((cheight * pitch) / 4) > CIK_SDMA_COPY_MAX_SIZE) {
+ cheight = (CIK_SDMA_COPY_MAX_SIZE * 4) / pitch;
+ cheight &= ~(y_align - 1);
+ }
+ ncopy = (copy_height + cheight - 1) / cheight;
+ r600_need_dma_space(&ctx->b, ncopy * 12);
+
+ r600_context_bo_reloc(&ctx->b, &ctx->b.rings.dma, &rsrc->resource,
+ RADEON_USAGE_READ, RADEON_PRIO_MIN);
+ r600_context_bo_reloc(&ctx->b, &ctx->b.rings.dma, &rdst->resource,
+ RADEON_USAGE_WRITE, RADEON_PRIO_MIN);
+
+ copy_height = size * 4 / pitch;
+ for (i = 0; i < ncopy; i++) {
+ cheight = copy_height;
+ if (((cheight * pitch) / 4) > CIK_SDMA_COPY_MAX_SIZE) {
+ cheight = (CIK_SDMA_COPY_MAX_SIZE * 4) / pitch;
+ cheight &= ~(y_align - 1);
+ }
+ size = (cheight * pitch) / 4;
+
+ cs->buf[cs->cdw++] = CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY,
+ sub_op, detile << 15);
+ cs->buf[cs->cdw++] = base;
+ cs->buf[cs->cdw++] = base >> 32;
+ cs->buf[cs->cdw++] = ((height - 1) << 16) | pitch_tile_max;
+ cs->buf[cs->cdw++] = slice_tile_max;
+ cs->buf[cs->cdw++] = (pipe_config << 26) | (mt_aspect << 24) |
+ (nbanks << 21) | (bank_h << 18) | (bank_w << 15) |
+ (tile_split << 11) | (mt << 8) | (array_mode << 3) |
+ lbpe;
+ cs->buf[cs->cdw++] = y << 16; /* | x */
+ cs->buf[cs->cdw++] = 0; /* z */;
+ cs->buf[cs->cdw++] = addr & 0xfffffffc;
+ cs->buf[cs->cdw++] = addr >> 32;
+ cs->buf[cs->cdw++] = (pitch / bpe) - 1;
+ cs->buf[cs->cdw++] = size;
+
+ copy_height -= cheight;
+ y += cheight;
+ }
+}
+
+void cik_sdma_copy(struct pipe_context *ctx,
+ struct pipe_resource *dst,
+ unsigned dst_level,
+ unsigned dstx, unsigned dsty, unsigned dstz,
+ struct pipe_resource *src,
+ unsigned src_level,
+ const struct pipe_box *src_box)
+{
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct r600_texture *rsrc = (struct r600_texture*)src;
+ struct r600_texture *rdst = (struct r600_texture*)dst;
+ unsigned dst_pitch, src_pitch, bpe, dst_mode, src_mode;
+ unsigned src_w, dst_w;
+ unsigned src_x, src_y;
+ unsigned copy_height, y_align;
+ unsigned dst_x = dstx, dst_y = dsty, dst_z = dstz;
+
+ if (sctx->b.rings.dma.cs == NULL) {
+ goto fallback;
+ }
+
+ if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
+ cik_sdma_copy_buffer(sctx, dst, src, dst_x, src_box->x, src_box->width);
+ return;
+ }
+
+ /* Before re-enabling this, please make sure you can hit all newly
+ * enabled paths in your testing, preferably with both piglit (in
+ * particular the streaming-texture-leak test) and real world apps
+ * (e.g. the UE4 Elemental demo).
+ */
+ goto fallback;
+
+ if (src->format != dst->format ||
+ rdst->surface.nsamples > 1 || rsrc->surface.nsamples > 1 ||
+ rdst->dirty_level_mask & (1 << dst_level)) {
+ goto fallback;
+ }
+
+ if (rsrc->dirty_level_mask & (1 << src_level)) {
+ if (rsrc->htile_buffer)
+ goto fallback;
+
+ ctx->flush_resource(ctx, src);
+ }
+
+ src_x = util_format_get_nblocksx(src->format, src_box->x);
+ dst_x = util_format_get_nblocksx(src->format, dst_x);
+ src_y = util_format_get_nblocksy(src->format, src_box->y);
+ dst_y = util_format_get_nblocksy(src->format, dst_y);
+
+ dst_pitch = rdst->surface.level[dst_level].pitch_bytes;
+ src_pitch = rsrc->surface.level[src_level].pitch_bytes;
+ src_w = rsrc->surface.level[src_level].npix_x;
+ dst_w = rdst->surface.level[dst_level].npix_x;
+
+ if (src_pitch != dst_pitch || src_box->x || dst_x || src_w != dst_w ||
+ src_box->width != src_w ||
+ rsrc->surface.level[src_level].nblk_y !=
+ rdst->surface.level[dst_level].nblk_y) {
+ /* FIXME CIK can do partial blit */
+ goto fallback;
+ }
+
+ bpe = rdst->surface.bpe;
+ copy_height = src_box->height / rsrc->surface.blk_h;
+ dst_mode = rdst->surface.level[dst_level].mode;
+ src_mode = rsrc->surface.level[src_level].mode;
+ /* downcast linear aligned to linear to simplify test */
+ src_mode = src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : src_mode;
+ dst_mode = dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : dst_mode;
+
+ /* Dimensions must be aligned to (macro)tiles */
+ switch (src_mode == RADEON_SURF_MODE_LINEAR ? dst_mode : src_mode) {
+ case RADEON_SURF_MODE_1D:
+ if ((src_x % 8) || (src_y % 8) || (dst_x % 8) || (dst_y % 8) ||
+ (copy_height % 8))
+ goto fallback;
+ y_align = 8;
+ break;
+ case RADEON_SURF_MODE_2D: {
+ unsigned mtilew, mtileh, num_banks;
+
+ switch (si_num_banks(sctx->screen, rsrc)) {
+ case V_02803C_ADDR_SURF_2_BANK:
+ default:
+ num_banks = 2;
+ break;
+ case V_02803C_ADDR_SURF_4_BANK:
+ num_banks = 4;
+ break;
+ case V_02803C_ADDR_SURF_8_BANK:
+ num_banks = 8;
+ break;
+ case V_02803C_ADDR_SURF_16_BANK:
+ num_banks = 16;
+ break;
+ }
+
+ mtilew = (8 * rsrc->surface.bankw *
+ sctx->screen->b.tiling_info.num_channels) *
+ rsrc->surface.mtilea;
+ assert(!(mtilew & (mtilew - 1)));
+ mtileh = (8 * rsrc->surface.bankh * num_banks) /
+ rsrc->surface.mtilea;
+ assert(!(mtileh & (mtileh - 1)));
+
+ if ((src_x & (mtilew - 1)) || (src_y & (mtileh - 1)) ||
+ (dst_x & (mtilew - 1)) || (dst_y & (mtileh - 1)) ||
+ (copy_height & (mtileh - 1)))
+ goto fallback;
+
+ y_align = mtileh;
+ break;
+ }
+ default:
+ y_align = 1;
+ }
+
+ if (src_mode == dst_mode) {
+ uint64_t dst_offset, src_offset;
+ unsigned src_h, dst_h;
+
+ src_h = rsrc->surface.level[src_level].npix_y;
+ dst_h = rdst->surface.level[dst_level].npix_y;
+
+ if (src_box->depth > 1 &&
+ (src_y || dst_y || src_h != dst_h || src_box->height != src_h))
+ goto fallback;
+
+ /* A simple DMA blit will do. NOTE: the code here assumes
+ * dst_pitch == src_pitch
+ */
+ src_offset= rsrc->surface.level[src_level].offset;
+ src_offset += rsrc->surface.level[src_level].slice_size * src_box->z;
+ src_offset += src_y * src_pitch + src_x * bpe;
+ dst_offset = rdst->surface.level[dst_level].offset;
+ dst_offset += rdst->surface.level[dst_level].slice_size * dst_z;
+ dst_offset += dst_y * dst_pitch + dst_x * bpe;
+ cik_sdma_do_copy_buffer(sctx, dst, src, dst_offset, src_offset,
+ src_box->depth *
+ rsrc->surface.level[src_level].slice_size);
+ } else {
+ if (dst_y != src_y || src_box->depth > 1 || src_box->z || dst_z)
+ goto fallback;
+
+ cik_sdma_copy_tile(sctx, dst, dst_level, src, src_level,
+ src_y, copy_height, y_align, dst_pitch, bpe);
+ }
+ return;
+
+fallback:
+ si_resource_copy_region(ctx, dst, dst_level, dstx, dsty, dstz,
+ src, src_level, src_box);
+}
diff --git a/src/gallium/drivers/radeonsi/si_dma.c b/src/gallium/drivers/radeonsi/si_dma.c
index db523eef318..7a0076e7aa9 100644
--- a/src/gallium/drivers/radeonsi/si_dma.c
+++ b/src/gallium/drivers/radeonsi/si_dma.c
@@ -30,21 +30,6 @@
#include "util/u_format.h"
-static unsigned si_array_mode(unsigned mode)
-{
- switch (mode) {
- case RADEON_SURF_MODE_LINEAR_ALIGNED:
- return V_009910_ARRAY_LINEAR_ALIGNED;
- case RADEON_SURF_MODE_1D:
- return V_009910_ARRAY_1D_TILED_THIN1;
- case RADEON_SURF_MODE_2D:
- return V_009910_ARRAY_2D_TILED_THIN1;
- default:
- case RADEON_SURF_MODE_LINEAR:
- return V_009910_ARRAY_LINEAR_GENERAL;
- }
-}
-
static uint32_t si_micro_tile_mode(struct si_screen *sscreen, unsigned tile_mode)
{
if (sscreen->b.info.si_tile_mode_array_valid) {
@@ -240,11 +225,6 @@ void si_dma_copy(struct pipe_context *ctx,
goto fallback;
}
- /* TODO: Implement DMA copy for CIK */
- if (sctx->b.chip_class >= CIK) {
- goto fallback;
- }
-
if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
si_dma_copy_buffer(sctx, dst, src, dst_x, src_box->x, src_box->width);
return;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index e68c30e8c7c..53ae71a8c92 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -251,6 +251,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
case PIPE_CAP_POLYGON_OFFSET_CLAMP:
case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION:
+ case PIPE_CAP_TGSI_TEXCOORD:
return 1;
case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
@@ -286,13 +287,13 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
case PIPE_CAP_FRAGMENT_COLOR_CLAMPED:
case PIPE_CAP_VERTEX_COLOR_CLAMPED:
case PIPE_CAP_USER_VERTEX_BUFFERS:
- case PIPE_CAP_TGSI_TEXCOORD:
case PIPE_CAP_FAKE_SW_MSAA:
case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
case PIPE_CAP_SAMPLER_VIEW_TARGET:
case PIPE_CAP_VERTEXID_NOBASE:
+ case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
return 0;
case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK:
@@ -451,6 +452,7 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu
case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
return 0;
case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+ case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
return 1;
}
return 0;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index f98c7a83744..2d67342f160 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -237,6 +237,15 @@ struct si_context {
unsigned spi_tmpring_size;
};
+/* cik_sdma.c */
+void cik_sdma_copy(struct pipe_context *ctx,
+ struct pipe_resource *dst,
+ unsigned dst_level,
+ unsigned dstx, unsigned dsty, unsigned dstz,
+ struct pipe_resource *src,
+ unsigned src_level,
+ const struct pipe_box *src_box);
+
/* si_blit.c */
void si_init_blit_functions(struct si_context *sctx);
void si_flush_depth_textures(struct si_context *sctx,
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 89f02ab0410..47e5f96cbed 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -128,21 +128,10 @@ unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
case TGSI_SEMANTIC_CLIPDIST:
assert(index <= 1);
return 2 + index;
- case TGSI_SEMANTIC_CLIPVERTEX:
- return 4;
- case TGSI_SEMANTIC_COLOR:
- assert(index <= 1);
- return 5 + index;
- case TGSI_SEMANTIC_BCOLOR:
- assert(index <= 1);
- return 7 + index;
- case TGSI_SEMANTIC_FOG:
- return 9;
- case TGSI_SEMANTIC_EDGEFLAG:
- return 10;
case TGSI_SEMANTIC_GENERIC:
- assert(index <= 63-11);
- return 11 + index;
+ assert(index <= 63-4);
+ return 4 + index;
+
default:
assert(0);
return 63;
@@ -1183,6 +1172,7 @@ handle_semantic:
continue;
case TGSI_SEMANTIC_PRIMID:
case TGSI_SEMANTIC_FOG:
+ case TGSI_SEMANTIC_TEXCOORD:
case TGSI_SEMANTIC_GENERIC:
target = V_008DFC_SQ_EXP_PARAM + param_count;
shader->vs_output_param_offset[i] = param_count;
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 7f0fdd599dc..6c18836d189 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -44,6 +44,21 @@ static void si_init_atom(struct r600_atom *atom, struct r600_atom **list_elem,
*list_elem = atom;
}
+unsigned si_array_mode(unsigned mode)
+{
+ switch (mode) {
+ case RADEON_SURF_MODE_LINEAR_ALIGNED:
+ return V_009910_ARRAY_LINEAR_ALIGNED;
+ case RADEON_SURF_MODE_1D:
+ return V_009910_ARRAY_1D_TILED_THIN1;
+ case RADEON_SURF_MODE_2D:
+ return V_009910_ARRAY_2D_TILED_THIN1;
+ default:
+ case RADEON_SURF_MODE_LINEAR:
+ return V_009910_ARRAY_LINEAR_GENERAL;
+ }
+}
+
uint32_t si_num_banks(struct si_screen *sscreen, struct r600_texture *tex)
{
if (sscreen->b.chip_class == CIK &&
@@ -636,18 +651,14 @@ static void *si_create_rs_state(struct pipe_context *ctx,
rs->offset_units = state->offset_units;
rs->offset_scale = state->offset_scale * 12.0f;
- tmp = S_0286D4_FLAT_SHADE_ENA(1);
- if (state->sprite_coord_enable) {
- tmp |= S_0286D4_PNT_SPRITE_ENA(1) |
- S_0286D4_PNT_SPRITE_OVRD_X(V_0286D4_SPI_PNT_SPRITE_SEL_S) |
- S_0286D4_PNT_SPRITE_OVRD_Y(V_0286D4_SPI_PNT_SPRITE_SEL_T) |
- S_0286D4_PNT_SPRITE_OVRD_Z(V_0286D4_SPI_PNT_SPRITE_SEL_0) |
- S_0286D4_PNT_SPRITE_OVRD_W(V_0286D4_SPI_PNT_SPRITE_SEL_1);
- if (state->sprite_coord_mode != PIPE_SPRITE_COORD_UPPER_LEFT) {
- tmp |= S_0286D4_PNT_SPRITE_TOP_1(1);
- }
- }
- si_pm4_set_reg(pm4, R_0286D4_SPI_INTERP_CONTROL_0, tmp);
+ si_pm4_set_reg(pm4, R_0286D4_SPI_INTERP_CONTROL_0,
+ S_0286D4_FLAT_SHADE_ENA(1) |
+ S_0286D4_PNT_SPRITE_ENA(1) |
+ S_0286D4_PNT_SPRITE_OVRD_X(V_0286D4_SPI_PNT_SPRITE_SEL_S) |
+ S_0286D4_PNT_SPRITE_OVRD_Y(V_0286D4_SPI_PNT_SPRITE_SEL_T) |
+ S_0286D4_PNT_SPRITE_OVRD_Z(V_0286D4_SPI_PNT_SPRITE_SEL_0) |
+ S_0286D4_PNT_SPRITE_OVRD_W(V_0286D4_SPI_PNT_SPRITE_SEL_1) |
+ S_0286D4_PNT_SPRITE_TOP_1(state->sprite_coord_mode != PIPE_SPRITE_COORD_UPPER_LEFT));
/* point size 12.4 fixed point */
tmp = (unsigned)(state->point_size * 8.0);
@@ -2910,11 +2921,16 @@ void si_init_state_functions(struct si_context *sctx)
sctx->b.b.set_polygon_stipple = si_set_polygon_stipple;
sctx->b.b.set_min_samples = si_set_min_samples;
- sctx->b.dma_copy = si_dma_copy;
sctx->b.set_occlusion_query_state = si_set_occlusion_query_state;
sctx->b.need_gfx_cs_space = si_need_gfx_cs_space;
sctx->b.b.draw_vbo = si_draw_vbo;
+
+ if (sctx->b.chip_class >= CIK) {
+ sctx->b.dma_copy = cik_sdma_copy;
+ } else {
+ sctx->b.dma_copy = si_dma_copy;
+ }
}
static void
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 2f8a943846a..5e68b162137 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -261,6 +261,7 @@ unsigned cik_bank_wh(unsigned bankwh);
unsigned cik_db_pipe_config(struct si_screen *sscreen, unsigned tile_mode);
unsigned cik_macro_tile_aspect(unsigned macro_tile_aspect);
unsigned cik_tile_split(unsigned tile_split);
+unsigned si_array_mode(unsigned mode);
uint32_t si_num_banks(struct si_screen *sscreen, struct r600_texture *tex);
unsigned si_tile_mode_index(struct r600_texture *rtex, unsigned level, bool stencil);
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 1bbc6b3ca7a..208c8523ef1 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -182,8 +182,13 @@ static void si_shader_vs(struct si_shader *shader)
for (nparams = 0, i = 0 ; i < info->num_outputs; i++) {
switch (info->output_semantic_name[i]) {
case TGSI_SEMANTIC_CLIPVERTEX:
+ case TGSI_SEMANTIC_CLIPDIST:
+ case TGSI_SEMANTIC_CULLDIST:
case TGSI_SEMANTIC_POSITION:
case TGSI_SEMANTIC_PSIZE:
+ case TGSI_SEMANTIC_EDGEFLAG:
+ case TGSI_SEMANTIC_VIEWPORT_INDEX:
+ case TGSI_SEMANTIC_LAYER:
break;
default:
nparams++;
@@ -351,21 +356,25 @@ static INLINE void si_shader_selector_key(struct pipe_context *ctx,
union si_shader_key *key)
{
struct si_context *sctx = (struct si_context *)ctx;
- memset(key, 0, sizeof(*key));
+ unsigned i;
- if (sel->type == PIPE_SHADER_VERTEX) {
- unsigned i;
- if (!sctx->vertex_elements)
- return;
+ memset(key, 0, sizeof(*key));
- for (i = 0; i < sctx->vertex_elements->count; ++i)
- key->vs.instance_divisors[i] = sctx->vertex_elements->elements[i].instance_divisor;
+ switch (sel->type) {
+ case PIPE_SHADER_VERTEX:
+ if (sctx->vertex_elements)
+ for (i = 0; i < sctx->vertex_elements->count; ++i)
+ key->vs.instance_divisors[i] =
+ sctx->vertex_elements->elements[i].instance_divisor;
if (sctx->gs_shader) {
key->vs.as_es = 1;
key->vs.gs_used_inputs = sctx->gs_shader->gs_used_inputs;
}
- } else if (sel->type == PIPE_SHADER_FRAGMENT) {
+ break;
+ case PIPE_SHADER_GEOMETRY:
+ break;
+ case PIPE_SHADER_FRAGMENT: {
struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
if (sel->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS])
@@ -393,11 +402,14 @@ static INLINE void si_shader_selector_key(struct pipe_context *ctx,
}
key->ps.alpha_func = PIPE_FUNC_ALWAYS;
-
/* Alpha-test should be disabled if colorbuffer 0 is integer. */
if (sctx->queued.named.dsa &&
!sctx->framebuffer.cb0_is_integer)
key->ps.alpha_func = sctx->queued.named.dsa->alpha_func;
+ break;
+ }
+ default:
+ assert(0);
}
}
@@ -580,15 +592,22 @@ static void si_delete_shader_selector(struct pipe_context *ctx,
while (p) {
c = p->next_variant;
- if (sel->type == PIPE_SHADER_GEOMETRY) {
+ switch (sel->type) {
+ case PIPE_SHADER_VERTEX:
+ if (p->key.vs.as_es)
+ si_pm4_delete_state(sctx, es, p->pm4);
+ else
+ si_pm4_delete_state(sctx, vs, p->pm4);
+ break;
+ case PIPE_SHADER_GEOMETRY:
si_pm4_delete_state(sctx, gs, p->pm4);
si_pm4_delete_state(sctx, vs, p->gs_copy_shader->pm4);
- } else if (sel->type == PIPE_SHADER_FRAGMENT)
+ break;
+ case PIPE_SHADER_FRAGMENT:
si_pm4_delete_state(sctx, ps, p->pm4);
- else if (p->key.vs.as_es)
- si_pm4_delete_state(sctx, es, p->pm4);
- else
- si_pm4_delete_state(sctx, vs, p->pm4);
+ break;
+ }
+
si_shader_destroy(ctx, p);
free(p);
p = c;
@@ -661,8 +680,9 @@ bcolor:
(interpolate == TGSI_INTERPOLATE_COLOR && sctx->flatshade))
tmp |= S_028644_FLAT_SHADE(1);
- if (name == TGSI_SEMANTIC_GENERIC &&
- sctx->sprite_coord_enable & (1 << index)) {
+ if (name == TGSI_SEMANTIC_PCOORD ||
+ (name == TGSI_SEMANTIC_TEXCOORD &&
+ sctx->sprite_coord_enable & (1 << index))) {
tmp |= S_028644_PT_SPRITE_TEX(1);
}
@@ -835,8 +855,15 @@ static void si_update_spi_tmpring_size(struct si_context *sctx)
si_pm4_bind_state(sctx, ps, sctx->ps_shader->current->pm4);
if (si_update_scratch_buffer(sctx, sctx->gs_shader))
si_pm4_bind_state(sctx, gs, sctx->gs_shader->current->pm4);
- if (si_update_scratch_buffer(sctx, sctx->vs_shader))
- si_pm4_bind_state(sctx, vs, sctx->vs_shader->current->pm4);
+
+ /* VS can be bound as ES or VS. */
+ if (sctx->gs_shader) {
+ if (si_update_scratch_buffer(sctx, sctx->vs_shader))
+ si_pm4_bind_state(sctx, es, sctx->vs_shader->current->pm4);
+ } else {
+ if (si_update_scratch_buffer(sctx, sctx->vs_shader))
+ si_pm4_bind_state(sctx, vs, sctx->vs_shader->current->pm4);
+ }
}
/* The LLVM shader backend should be reporting aligned scratch_sizes. */
diff --git a/src/gallium/drivers/radeonsi/sid.h b/src/gallium/drivers/radeonsi/sid.h
index afe011b15c7..35d5ee232a0 100644
--- a/src/gallium/drivers/radeonsi/sid.h
+++ b/src/gallium/drivers/radeonsi/sid.h
@@ -4516,6 +4516,13 @@
#define V_009910_ADDR_SURF_8_BANK 0x02
#define V_009910_ADDR_SURF_16_BANK 0x03
/* CIK */
+#define S_009910_MICRO_TILE_MODE_NEW(x) (((x) & 0x07) << 22)
+#define G_009910_MICRO_TILE_MODE_NEW(x) (((x) >> 22) & 0x07)
+#define C_009910_MICRO_TILE_MODE_NEW(x) 0xFE3FFFFF
+#define V_009910_ADDR_SURF_DISPLAY_MICRO_TILING 0x00
+#define V_009910_ADDR_SURF_THIN_MICRO_TILING 0x01
+#define V_009910_ADDR_SURF_DEPTH_MICRO_TILING 0x02
+#define V_009910_ADDR_SURF_ROTATED_MICRO_TILING 0x03
#define R_00B01C_SPI_SHADER_PGM_RSRC3_PS 0x00B01C
#define S_00B01C_CU_EN(x) (((x) & 0xFFFF) << 0)
#define G_00B01C_CU_EN(x) (((x) >> 0) & 0xFFFF)
@@ -8696,5 +8703,29 @@
#define SI_DMA_PACKET_CONSTANT_FILL 0xd
#define SI_DMA_PACKET_NOP 0xf
+/* CIK async DMA packets */
+#define CIK_SDMA_PACKET(op, sub_op, n) ((((n) & 0xFFFF) << 16) | \
+ (((sub_op) & 0xFF) << 8) | \
+ (((op) & 0xFF) << 0))
+/* CIK async DMA packet types */
+#define CIK_SDMA_OPCODE_NOP 0x0
+#define CIK_SDMA_OPCODE_COPY 0x1
+#define CIK_SDMA_COPY_SUB_OPCODE_LINEAR 0x0
+#define CIK_SDMA_COPY_SUB_OPCODE_TILED 0x1
+#define CIK_SDMA_COPY_SUB_OPCODE_SOA 0x3
+#define CIK_SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW 0x4
+#define CIK_SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW 0x5
+#define CIK_SDMA_COPY_SUB_OPCODE_T2T_SUB_WINDOW 0x6
+#define CIK_SDMA_OPCODE_WRITE 0x2
+#define SDMA_WRITE_SUB_OPCODE_LINEAR 0x0
+#define SDMA_WRTIE_SUB_OPCODE_TILED 0x1
+#define CIK_SDMA_OPCODE_INDIRECT_BUFFER 0x4
+#define CIK_SDMA_PACKET_FENCE 0x5
+#define CIK_SDMA_PACKET_TRAP 0x6
+#define CIK_SDMA_PACKET_SEMAPHORE 0x7
+#define CIK_SDMA_PACKET_CONSTANT_FILL 0xb
+#define CIK_SDMA_PACKET_SRBM_WRITE 0xe
+#define CIK_SDMA_COPY_MAX_SIZE 0x1fffff
+
#endif /* _SID_H */
diff --git a/src/gallium/drivers/rbug/rbug_public.h b/src/gallium/drivers/rbug/rbug_public.h
index b66740b49cd..83f9c94e31f 100644
--- a/src/gallium/drivers/rbug/rbug_public.h
+++ b/src/gallium/drivers/rbug/rbug_public.h
@@ -28,6 +28,10 @@
#ifndef RBUG_PUBLIC_H
#define RBUG_PUBLIC_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
struct pipe_screen;
struct pipe_context;
@@ -37,4 +41,8 @@ rbug_screen_create(struct pipe_screen *screen);
boolean
rbug_enabled(void);
+#ifdef __cplusplus
+}
+#endif
+
#endif /* RBUG_PUBLIC_H */
diff --git a/src/gallium/drivers/softpipe/sp_public.h b/src/gallium/drivers/softpipe/sp_public.h
index 62d0903d87a..88a9b5e6643 100644
--- a/src/gallium/drivers/softpipe/sp_public.h
+++ b/src/gallium/drivers/softpipe/sp_public.h
@@ -1,10 +1,18 @@
#ifndef SP_PUBLIC_H
#define SP_PUBLIC_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
struct pipe_screen;
struct sw_winsys;
struct pipe_screen *
softpipe_create_screen(struct sw_winsys *winsys);
+#ifdef __cplusplus
+}
+#endif
+
#endif
diff --git a/src/gallium/drivers/softpipe/sp_query.c b/src/gallium/drivers/softpipe/sp_query.c
index e77387082bc..76105b4c0ec 100644
--- a/src/gallium/drivers/softpipe/sp_query.c
+++ b/src/gallium/drivers/softpipe/sp_query.c
@@ -277,7 +277,7 @@ softpipe_check_render_cond(struct softpipe_context *sp)
b = pipe->get_query_result(pipe, sp->render_cond_query, wait,
(void*)&result);
if (b)
- return (!result == sp->render_cond_cond);
+ return (!result) == sp->render_cond_cond;
else
return TRUE;
}
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index d289e28a6f8..a688d319bb8 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -191,7 +191,9 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_ENDIANNESS:
return PIPE_ENDIAN_NATIVE;
case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS:
+ return 4;
case PIPE_CAP_TEXTURE_GATHER_SM5:
+ return 1;
case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT:
case PIPE_CAP_TEXTURE_QUERY_LOD:
case PIPE_CAP_SAMPLE_SHADING:
@@ -200,13 +202,15 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
return 1;
case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
- case PIPE_CAP_SAMPLER_VIEW_TARGET:
return 0;
+ case PIPE_CAP_SAMPLER_VIEW_TARGET:
+ return 1;
case PIPE_CAP_FAKE_SW_MSAA:
return 1;
case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET:
+ return -32;
case PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET:
- return 0;
+ return 31;
case PIPE_CAP_DRAW_INDIRECT:
return 1;
@@ -237,6 +241,7 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
return 0;
case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
+ case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
return 0;
}
/* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/softpipe/sp_state_sampler.c b/src/gallium/drivers/softpipe/sp_state_sampler.c
index e56fb5b1485..d7a3360713f 100644
--- a/src/gallium/drivers/softpipe/sp_state_sampler.c
+++ b/src/gallium/drivers/softpipe/sp_state_sampler.c
@@ -202,7 +202,7 @@ prepare_shader_sampling(
struct pipe_resource *res = view->texture;
int j;
- if (res->target != PIPE_BUFFER) {
+ if (view->target != PIPE_BUFFER) {
first_level = view->u.tex.first_level;
last_level = view->u.tex.last_level;
assert(first_level <= last_level);
@@ -214,15 +214,17 @@ prepare_shader_sampling(
row_stride[j] = sp_tex->stride[j];
img_stride[j] = sp_tex->img_stride[j];
}
- if (res->target == PIPE_TEXTURE_1D_ARRAY ||
- res->target == PIPE_TEXTURE_2D_ARRAY ||
- res->target == PIPE_TEXTURE_CUBE_ARRAY) {
+ if (view->target == PIPE_TEXTURE_1D_ARRAY ||
+ view->target == PIPE_TEXTURE_2D_ARRAY ||
+ view->target == PIPE_TEXTURE_CUBE ||
+ view->target == PIPE_TEXTURE_CUBE_ARRAY) {
num_layers = view->u.tex.last_layer - view->u.tex.first_layer + 1;
for (j = first_level; j <= last_level; j++) {
mip_offsets[j] += view->u.tex.first_layer *
sp_tex->img_stride[j];
}
- if (res->target == PIPE_TEXTURE_CUBE_ARRAY) {
+ if (view->target == PIPE_TEXTURE_CUBE ||
+ view->target == PIPE_TEXTURE_CUBE_ARRAY) {
assert(num_layers % 6 == 0);
}
assert(view->u.tex.first_layer <= view->u.tex.last_layer);
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c b/src/gallium/drivers/softpipe/sp_tex_sample.c
index 68dcf57240d..1010b63de2c 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.c
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.c
@@ -131,68 +131,80 @@ repeat(int coord, unsigned size)
* \param icoord returns the integer texcoords
*/
static void
-wrap_nearest_repeat(float s, unsigned size, int *icoord)
+wrap_nearest_repeat(float s, unsigned size, int offset, int *icoord)
{
/* s limited to [0,1) */
/* i limited to [0,size-1] */
int i = util_ifloor(s * size);
- *icoord = repeat(i, size);
+ *icoord = repeat(i + offset, size);
}
static void
-wrap_nearest_clamp(float s, unsigned size, int *icoord)
+wrap_nearest_clamp(float s, unsigned size, int offset, int *icoord)
{
/* s limited to [0,1] */
/* i limited to [0,size-1] */
+ s *= size;
+ s += offset;
if (s <= 0.0F)
*icoord = 0;
- else if (s >= 1.0F)
+ else if (s >= size)
*icoord = size - 1;
else
- *icoord = util_ifloor(s * size);
+ *icoord = util_ifloor(s);
}
static void
-wrap_nearest_clamp_to_edge(float s, unsigned size, int *icoord)
+wrap_nearest_clamp_to_edge(float s, unsigned size, int offset, int *icoord)
{
/* s limited to [min,max] */
/* i limited to [0, size-1] */
- const float min = 1.0F / (2.0F * size);
- const float max = 1.0F - min;
+ const float min = 0.5F;
+ const float max = (float)size - 0.5F;
+
+ s *= size;
+ s += offset;
+
if (s < min)
*icoord = 0;
else if (s > max)
*icoord = size - 1;
else
- *icoord = util_ifloor(s * size);
+ *icoord = util_ifloor(s);
}
static void
-wrap_nearest_clamp_to_border(float s, unsigned size, int *icoord)
+wrap_nearest_clamp_to_border(float s, unsigned size, int offset, int *icoord)
{
/* s limited to [min,max] */
/* i limited to [-1, size] */
- const float min = -1.0F / (2.0F * size);
- const float max = 1.0F - min;
+ const float min = -0.5F;
+ const float max = size + 0.5F;
+
+ s *= size;
+ s += offset;
if (s <= min)
*icoord = -1;
else if (s >= max)
*icoord = size;
else
- *icoord = util_ifloor(s * size);
+ *icoord = util_ifloor(s);
}
-
static void
-wrap_nearest_mirror_repeat(float s, unsigned size, int *icoord)
+wrap_nearest_mirror_repeat(float s, unsigned size, int offset, int *icoord)
{
const float min = 1.0F / (2.0F * size);
const float max = 1.0F - min;
- const int flr = util_ifloor(s);
- float u = frac(s);
+ int flr;
+ float u;
+
+ s += (float)offset / size;
+ flr = util_ifloor(s);
+ u = frac(s);
if (flr & 1)
u = 1.0F - u;
if (u < min)
@@ -205,51 +217,52 @@ wrap_nearest_mirror_repeat(float s, unsigned size, int *icoord)
static void
-wrap_nearest_mirror_clamp(float s, unsigned size, int *icoord)
+wrap_nearest_mirror_clamp(float s, unsigned size, int offset, int *icoord)
{
/* s limited to [0,1] */
/* i limited to [0,size-1] */
- const float u = fabsf(s);
+ const float u = fabsf(s * size + offset);
if (u <= 0.0F)
*icoord = 0;
- else if (u >= 1.0F)
+ else if (u >= size)
*icoord = size - 1;
else
- *icoord = util_ifloor(u * size);
+ *icoord = util_ifloor(u);
}
static void
-wrap_nearest_mirror_clamp_to_edge(float s, unsigned size, int *icoord)
+wrap_nearest_mirror_clamp_to_edge(float s, unsigned size, int offset, int *icoord)
{
/* s limited to [min,max] */
/* i limited to [0, size-1] */
- const float min = 1.0F / (2.0F * size);
- const float max = 1.0F - min;
- const float u = fabsf(s);
+ const float min = 0.5F;
+ const float max = (float)size - 0.5F;
+ const float u = fabsf(s * size + offset);
+
if (u < min)
*icoord = 0;
else if (u > max)
*icoord = size - 1;
else
- *icoord = util_ifloor(u * size);
+ *icoord = util_ifloor(u);
}
static void
-wrap_nearest_mirror_clamp_to_border(float s, unsigned size, int *icoord)
+wrap_nearest_mirror_clamp_to_border(float s, unsigned size, int offset, int *icoord)
{
- /* s limited to [min,max] */
- /* i limited to [0, size-1] */
- const float min = -1.0F / (2.0F * size);
- const float max = 1.0F - min;
- const float u = fabsf(s);
+ /* u limited to [-0.5, size-0.5] */
+ const float min = -0.5F;
+ const float max = (float)size + 0.5F;
+ const float u = fabsf(s * size + offset);
+
if (u < min)
*icoord = -1;
else if (u > max)
*icoord = size;
else
- *icoord = util_ifloor(u * size);
+ *icoord = util_ifloor(u);
}
@@ -264,22 +277,23 @@ wrap_nearest_mirror_clamp_to_border(float s, unsigned size, int *icoord)
* \param icoord returns the computed integer texture coord
*/
static void
-wrap_linear_repeat(float s, unsigned size,
+wrap_linear_repeat(float s, unsigned size, int offset,
int *icoord0, int *icoord1, float *w)
{
float u = s * size - 0.5F;
- *icoord0 = repeat(util_ifloor(u), size);
+ *icoord0 = repeat(util_ifloor(u) + offset, size);
*icoord1 = repeat(*icoord0 + 1, size);
*w = frac(u);
}
static void
-wrap_linear_clamp(float s, unsigned size,
+wrap_linear_clamp(float s, unsigned size, int offset,
int *icoord0, int *icoord1, float *w)
{
- float u = CLAMP(s, 0.0F, 1.0F);
- u = u * size - 0.5f;
+ float u = CLAMP(s * size + offset, 0.0F, (float)size);
+
+ u = u - 0.5f;
*icoord0 = util_ifloor(u);
*icoord1 = *icoord0 + 1;
*w = frac(u);
@@ -287,11 +301,11 @@ wrap_linear_clamp(float s, unsigned size,
static void
-wrap_linear_clamp_to_edge(float s, unsigned size,
+wrap_linear_clamp_to_edge(float s, unsigned size, int offset,
int *icoord0, int *icoord1, float *w)
{
- float u = CLAMP(s, 0.0F, 1.0F);
- u = u * size - 0.5f;
+ float u = CLAMP(s * size + offset, 0.0F, (float)size);
+ u = u - 0.5f;
*icoord0 = util_ifloor(u);
*icoord1 = *icoord0 + 1;
if (*icoord0 < 0)
@@ -303,13 +317,13 @@ wrap_linear_clamp_to_edge(float s, unsigned size,
static void
-wrap_linear_clamp_to_border(float s, unsigned size,
+wrap_linear_clamp_to_border(float s, unsigned size, int offset,
int *icoord0, int *icoord1, float *w)
{
- const float min = -1.0F / (2.0F * size);
- const float max = 1.0F - min;
- float u = CLAMP(s, min, max);
- u = u * size - 0.5f;
+ const float min = -0.5F;
+ const float max = (float)size + 0.5F;
+ float u = CLAMP(s * size + offset, min, max);
+ u = u - 0.5f;
*icoord0 = util_ifloor(u);
*icoord1 = *icoord0 + 1;
*w = frac(u);
@@ -317,11 +331,15 @@ wrap_linear_clamp_to_border(float s, unsigned size,
static void
-wrap_linear_mirror_repeat(float s, unsigned size,
+wrap_linear_mirror_repeat(float s, unsigned size, int offset,
int *icoord0, int *icoord1, float *w)
{
- const int flr = util_ifloor(s);
- float u = frac(s);
+ int flr;
+ float u;
+
+ s += (float)offset / size;
+ flr = util_ifloor(s);
+ u = frac(s);
if (flr & 1)
u = 1.0F - u;
u = u * size - 0.5F;
@@ -336,14 +354,12 @@ wrap_linear_mirror_repeat(float s, unsigned size,
static void
-wrap_linear_mirror_clamp(float s, unsigned size,
+wrap_linear_mirror_clamp(float s, unsigned size, int offset,
int *icoord0, int *icoord1, float *w)
{
- float u = fabsf(s);
- if (u >= 1.0F)
+ float u = fabsf(s * size + offset);
+ if (u >= size)
u = (float) size;
- else
- u *= size;
u -= 0.5F;
*icoord0 = util_ifloor(u);
*icoord1 = *icoord0 + 1;
@@ -352,14 +368,12 @@ wrap_linear_mirror_clamp(float s, unsigned size,
static void
-wrap_linear_mirror_clamp_to_edge(float s, unsigned size,
+wrap_linear_mirror_clamp_to_edge(float s, unsigned size, int offset,
int *icoord0, int *icoord1, float *w)
{
- float u = fabsf(s);
- if (u >= 1.0F)
+ float u = fabsf(s * size + offset);
+ if (u >= size)
u = (float) size;
- else
- u *= size;
u -= 0.5F;
*icoord0 = util_ifloor(u);
*icoord1 = *icoord0 + 1;
@@ -372,18 +386,16 @@ wrap_linear_mirror_clamp_to_edge(float s, unsigned size,
static void
-wrap_linear_mirror_clamp_to_border(float s, unsigned size,
+wrap_linear_mirror_clamp_to_border(float s, unsigned size, int offset,
int *icoord0, int *icoord1, float *w)
{
- const float min = -1.0F / (2.0F * size);
- const float max = 1.0F - min;
- float u = fabsf(s);
+ const float min = -0.5F;
+ const float max = size + 0.5F;
+ float u = fabsf(s * size + offset);
if (u <= min)
- u = min * size;
+ u = min;
else if (u >= max)
- u = max * size;
- else
- u *= size;
+ u = max;
u -= 0.5F;
*icoord0 = util_ifloor(u);
*icoord1 = *icoord0 + 1;
@@ -395,10 +407,10 @@ wrap_linear_mirror_clamp_to_border(float s, unsigned size,
* PIPE_TEX_WRAP_CLAMP for nearest sampling, unnormalized coords.
*/
static void
-wrap_nearest_unorm_clamp(float s, unsigned size, int *icoord)
+wrap_nearest_unorm_clamp(float s, unsigned size, int offset, int *icoord)
{
int i = util_ifloor(s);
- *icoord = CLAMP(i, 0, (int) size-1);
+ *icoord = CLAMP(i + offset, 0, (int) size-1);
}
@@ -406,9 +418,9 @@ wrap_nearest_unorm_clamp(float s, unsigned size, int *icoord)
* PIPE_TEX_WRAP_CLAMP_TO_BORDER for nearest sampling, unnormalized coords.
*/
static void
-wrap_nearest_unorm_clamp_to_border(float s, unsigned size, int *icoord)
+wrap_nearest_unorm_clamp_to_border(float s, unsigned size, int offset, int *icoord)
{
- *icoord = util_ifloor( CLAMP(s, -0.5F, (float) size + 0.5F) );
+ *icoord = util_ifloor( CLAMP(s + offset, -0.5F, (float) size + 0.5F) );
}
@@ -416,9 +428,9 @@ wrap_nearest_unorm_clamp_to_border(float s, unsigned size, int *icoord)
* PIPE_TEX_WRAP_CLAMP_TO_EDGE for nearest sampling, unnormalized coords.
*/
static void
-wrap_nearest_unorm_clamp_to_edge(float s, unsigned size, int *icoord)
+wrap_nearest_unorm_clamp_to_edge(float s, unsigned size, int offset, int *icoord)
{
- *icoord = util_ifloor( CLAMP(s, 0.5F, (float) size - 0.5F) );
+ *icoord = util_ifloor( CLAMP(s + offset, 0.5F, (float) size - 0.5F) );
}
@@ -426,11 +438,11 @@ wrap_nearest_unorm_clamp_to_edge(float s, unsigned size, int *icoord)
* PIPE_TEX_WRAP_CLAMP for linear sampling, unnormalized coords.
*/
static void
-wrap_linear_unorm_clamp(float s, unsigned size,
+wrap_linear_unorm_clamp(float s, unsigned size, int offset,
int *icoord0, int *icoord1, float *w)
{
/* Not exactly what the spec says, but it matches NVIDIA output */
- float u = CLAMP(s - 0.5F, 0.0f, (float) size - 1.0f);
+ float u = CLAMP(s + offset - 0.5F, 0.0f, (float) size - 1.0f);
*icoord0 = util_ifloor(u);
*icoord1 = *icoord0 + 1;
*w = frac(u);
@@ -441,10 +453,10 @@ wrap_linear_unorm_clamp(float s, unsigned size,
* PIPE_TEX_WRAP_CLAMP_TO_BORDER for linear sampling, unnormalized coords.
*/
static void
-wrap_linear_unorm_clamp_to_border(float s, unsigned size,
+wrap_linear_unorm_clamp_to_border(float s, unsigned size, int offset,
int *icoord0, int *icoord1, float *w)
{
- float u = CLAMP(s, -0.5F, (float) size + 0.5F);
+ float u = CLAMP(s + offset, -0.5F, (float) size + 0.5F);
u -= 0.5F;
*icoord0 = util_ifloor(u);
*icoord1 = *icoord0 + 1;
@@ -458,10 +470,10 @@ wrap_linear_unorm_clamp_to_border(float s, unsigned size,
* PIPE_TEX_WRAP_CLAMP_TO_EDGE for linear sampling, unnormalized coords.
*/
static void
-wrap_linear_unorm_clamp_to_edge(float s, unsigned size,
+wrap_linear_unorm_clamp_to_edge(float s, unsigned size, int offset,
int *icoord0, int *icoord1, float *w)
{
- float u = CLAMP(s, +0.5F, (float) size - 0.5F);
+ float u = CLAMP(s + offset, +0.5F, (float) size - 0.5F);
u -= 0.5F;
*icoord0 = util_ifloor(u);
*icoord1 = *icoord0 + 1;
@@ -474,11 +486,11 @@ wrap_linear_unorm_clamp_to_edge(float s, unsigned size,
/**
* Do coordinate to array index conversion. For array textures.
*/
-static INLINE void
-wrap_array_layer(float coord, unsigned size, int *layer)
+static INLINE int
+coord_to_layer(float coord, unsigned first_layer, unsigned last_layer)
{
int c = util_ifloor(coord + 0.5F);
- *layer = CLAMP(c, 0, (int) size - 1);
+ return CLAMP(c, (int)first_layer, (int)last_layer);
}
@@ -757,61 +769,6 @@ get_next_ycoord(unsigned face, unsigned fall_off_index, int max, int xc, int yc)
}
-static INLINE const float *
-get_texel_cube_seamless(const struct sp_sampler_view *sp_sview,
- union tex_tile_address addr, int x, int y,
- float *corner)
-{
- const struct pipe_resource *texture = sp_sview->base.texture;
- unsigned level = addr.bits.level;
- unsigned face = addr.bits.face;
- int new_x, new_y, max_x;
-
- max_x = (int) u_minify(texture->width0, level);
-
- assert(texture->width0 == texture->height0);
- new_x = x;
- new_y = y;
-
- /* change the face */
- if (x < 0) {
- /*
- * Cheat with corners. They are difficult and I believe because we don't get
- * per-pixel faces we can actually have multiple corner texels per pixel,
- * which screws things up majorly in any case (as the per spec behavior is
- * to average the 3 remaining texels, which we might not have).
- * Hence just make sure that the 2nd coord is clamped, will simply pick the
- * sample which would have fallen off the x coord, but not y coord.
- * So the filter weight of the samples will be wrong, but at least this
- * ensures that only valid texels near the corner are used.
- */
- if (y < 0 || y >= max_x) {
- y = CLAMP(y, 0, max_x - 1);
- }
- new_x = get_next_xcoord(face, 0, max_x -1, x, y);
- new_y = get_next_ycoord(face, 0, max_x -1, x, y);
- face = get_next_face(face, 0);
- } else if (x >= max_x) {
- if (y < 0 || y >= max_x) {
- y = CLAMP(y, 0, max_x - 1);
- }
- new_x = get_next_xcoord(face, 1, max_x -1, x, y);
- new_y = get_next_ycoord(face, 1, max_x -1, x, y);
- face = get_next_face(face, 1);
- } else if (y < 0) {
- new_x = get_next_xcoord(face, 2, max_x -1, x, y);
- new_y = get_next_ycoord(face, 2, max_x -1, x, y);
- face = get_next_face(face, 2);
- } else if (y >= max_x) {
- new_x = get_next_xcoord(face, 3, max_x -1, x, y);
- new_y = get_next_ycoord(face, 3, max_x -1, x, y);
- face = get_next_face(face, 3);
- }
-
- addr.bits.face = face;
- return get_texel_2d_no_border( sp_sview, addr, new_x, new_y );
-}
-
/* Gather a quad of adjacent texels within a tile:
*/
static INLINE void
@@ -948,6 +905,60 @@ get_texel_2d_array(const struct sp_sampler_view *sp_sview,
}
+static INLINE const float *
+get_texel_cube_seamless(const struct sp_sampler_view *sp_sview,
+ union tex_tile_address addr, int x, int y,
+ float *corner, int layer, unsigned face)
+{
+ const struct pipe_resource *texture = sp_sview->base.texture;
+ unsigned level = addr.bits.level;
+ int new_x, new_y, max_x;
+
+ max_x = (int) u_minify(texture->width0, level);
+
+ assert(texture->width0 == texture->height0);
+ new_x = x;
+ new_y = y;
+
+ /* change the face */
+ if (x < 0) {
+ /*
+ * Cheat with corners. They are difficult and I believe because we don't get
+ * per-pixel faces we can actually have multiple corner texels per pixel,
+ * which screws things up majorly in any case (as the per spec behavior is
+ * to average the 3 remaining texels, which we might not have).
+ * Hence just make sure that the 2nd coord is clamped, will simply pick the
+ * sample which would have fallen off the x coord, but not y coord.
+ * So the filter weight of the samples will be wrong, but at least this
+ * ensures that only valid texels near the corner are used.
+ */
+ if (y < 0 || y >= max_x) {
+ y = CLAMP(y, 0, max_x - 1);
+ }
+ new_x = get_next_xcoord(face, 0, max_x -1, x, y);
+ new_y = get_next_ycoord(face, 0, max_x -1, x, y);
+ face = get_next_face(face, 0);
+ } else if (x >= max_x) {
+ if (y < 0 || y >= max_x) {
+ y = CLAMP(y, 0, max_x - 1);
+ }
+ new_x = get_next_xcoord(face, 1, max_x -1, x, y);
+ new_y = get_next_ycoord(face, 1, max_x -1, x, y);
+ face = get_next_face(face, 1);
+ } else if (y < 0) {
+ new_x = get_next_xcoord(face, 2, max_x -1, x, y);
+ new_y = get_next_ycoord(face, 2, max_x -1, x, y);
+ face = get_next_face(face, 2);
+ } else if (y >= max_x) {
+ new_x = get_next_xcoord(face, 3, max_x -1, x, y);
+ new_y = get_next_ycoord(face, 3, max_x -1, x, y);
+ face = get_next_face(face, 3);
+ }
+
+ return get_texel_3d_no_border(sp_sview, addr, new_x, new_y, layer + face);
+}
+
+
/* Get texel pointer for cube array texture */
static INLINE const float *
get_texel_cube_array(const struct sp_sampler_view *sp_sview,
@@ -1008,22 +1019,18 @@ print_sample_4(const char *function, float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZ
static INLINE void
img_filter_2d_linear_repeat_POT(struct sp_sampler_view *sp_sview,
struct sp_sampler *sp_samp,
- float s,
- float t,
- float p,
- unsigned level,
- unsigned face_id,
+ const struct img_filter_args *args,
float *rgba)
{
- unsigned xpot = pot_level_size(sp_sview->xpot, level);
- unsigned ypot = pot_level_size(sp_sview->ypot, level);
+ unsigned xpot = pot_level_size(sp_sview->xpot, args->level);
+ unsigned ypot = pot_level_size(sp_sview->ypot, args->level);
int xmax = (xpot - 1) & (TEX_TILE_SIZE - 1); /* MIN2(TEX_TILE_SIZE, xpot) - 1; */
int ymax = (ypot - 1) & (TEX_TILE_SIZE - 1); /* MIN2(TEX_TILE_SIZE, ypot) - 1; */
union tex_tile_address addr;
int c;
- float u = s * xpot - 0.5F;
- float v = t * ypot - 0.5F;
+ float u = (args->s * xpot - 0.5F) + args->offset[0];
+ float v = (args->t * ypot - 0.5F) + args->offset[1];
int uflr = util_ifloor(u);
int vflr = util_ifloor(v);
@@ -1037,7 +1044,7 @@ img_filter_2d_linear_repeat_POT(struct sp_sampler_view *sp_sview,
const float *tx[4];
addr.value = 0;
- addr.bits.level = level;
+ addr.bits.level = args->level;
/* Can we fetch all four at once:
*/
@@ -1066,21 +1073,17 @@ img_filter_2d_linear_repeat_POT(struct sp_sampler_view *sp_sview,
static INLINE void
img_filter_2d_nearest_repeat_POT(struct sp_sampler_view *sp_sview,
struct sp_sampler *sp_samp,
- float s,
- float t,
- float p,
- unsigned level,
- unsigned face_id,
+ const struct img_filter_args *args,
float rgba[TGSI_QUAD_SIZE])
{
- unsigned xpot = pot_level_size(sp_sview->xpot, level);
- unsigned ypot = pot_level_size(sp_sview->ypot, level);
+ unsigned xpot = pot_level_size(sp_sview->xpot, args->level);
+ unsigned ypot = pot_level_size(sp_sview->ypot, args->level);
const float *out;
union tex_tile_address addr;
int c;
- float u = s * xpot;
- float v = t * ypot;
+ float u = args->s * xpot + args->offset[0];
+ float v = args->t * ypot + args->offset[1];
int uflr = util_ifloor(u);
int vflr = util_ifloor(v);
@@ -1089,7 +1092,7 @@ img_filter_2d_nearest_repeat_POT(struct sp_sampler_view *sp_sview,
int y0 = vflr & (ypot - 1);
addr.value = 0;
- addr.bits.level = level;
+ addr.bits.level = args->level;
out = get_texel_2d_no_border(sp_sview, addr, x0, y0);
for (c = 0; c < TGSI_QUAD_SIZE; c++)
@@ -1104,26 +1107,22 @@ img_filter_2d_nearest_repeat_POT(struct sp_sampler_view *sp_sview,
static INLINE void
img_filter_2d_nearest_clamp_POT(struct sp_sampler_view *sp_sview,
struct sp_sampler *sp_samp,
- float s,
- float t,
- float p,
- unsigned level,
- unsigned face_id,
+ const struct img_filter_args *args,
float rgba[TGSI_QUAD_SIZE])
{
- unsigned xpot = pot_level_size(sp_sview->xpot, level);
- unsigned ypot = pot_level_size(sp_sview->ypot, level);
+ unsigned xpot = pot_level_size(sp_sview->xpot, args->level);
+ unsigned ypot = pot_level_size(sp_sview->ypot, args->level);
union tex_tile_address addr;
int c;
- float u = s * xpot;
- float v = t * ypot;
+ float u = args->s * xpot + args->offset[0];
+ float v = args->t * ypot + args->offset[1];
int x0, y0;
const float *out;
addr.value = 0;
- addr.bits.level = level;
+ addr.bits.level = args->level;
x0 = util_ifloor(u);
if (x0 < 0)
@@ -1150,11 +1149,7 @@ img_filter_2d_nearest_clamp_POT(struct sp_sampler_view *sp_sview,
static void
img_filter_1d_nearest(struct sp_sampler_view *sp_sview,
struct sp_sampler *sp_samp,
- float s,
- float t,
- float p,
- unsigned level,
- unsigned face_id,
+ const struct img_filter_args *args,
float rgba[TGSI_QUAD_SIZE])
{
const struct pipe_resource *texture = sp_sview->base.texture;
@@ -1164,14 +1159,14 @@ img_filter_1d_nearest(struct sp_sampler_view *sp_sview,
const float *out;
int c;
- width = u_minify(texture->width0, level);
+ width = u_minify(texture->width0, args->level);
assert(width > 0);
addr.value = 0;
- addr.bits.level = level;
+ addr.bits.level = args->level;
- sp_samp->nearest_texcoord_s(s, width, &x);
+ sp_samp->nearest_texcoord_s(args->s, width, args->offset[0], &x);
out = get_texel_2d(sp_sview, sp_samp, addr, x, 0);
for (c = 0; c < TGSI_QUAD_SIZE; c++)
@@ -1186,11 +1181,7 @@ img_filter_1d_nearest(struct sp_sampler_view *sp_sview,
static void
img_filter_1d_array_nearest(struct sp_sampler_view *sp_sview,
struct sp_sampler *sp_samp,
- float s,
- float t,
- float p,
- unsigned level,
- unsigned face_id,
+ const struct img_filter_args *args,
float *rgba)
{
const struct pipe_resource *texture = sp_sview->base.texture;
@@ -1200,15 +1191,16 @@ img_filter_1d_array_nearest(struct sp_sampler_view *sp_sview,
const float *out;
int c;
- width = u_minify(texture->width0, level);
+ width = u_minify(texture->width0, args->level);
assert(width > 0);
addr.value = 0;
- addr.bits.level = level;
+ addr.bits.level = args->level;
- sp_samp->nearest_texcoord_s(s, width, &x);
- wrap_array_layer(t, texture->array_size, &layer);
+ sp_samp->nearest_texcoord_s(args->s, width, args->offset[0], &x);
+ layer = coord_to_layer(args->t, sp_sview->base.u.tex.first_layer,
+ sp_sview->base.u.tex.last_layer);
out = get_texel_1d_array(sp_sview, sp_samp, addr, x, layer);
for (c = 0; c < TGSI_QUAD_SIZE; c++)
@@ -1223,11 +1215,7 @@ img_filter_1d_array_nearest(struct sp_sampler_view *sp_sview,
static void
img_filter_2d_nearest(struct sp_sampler_view *sp_sview,
struct sp_sampler *sp_samp,
- float s,
- float t,
- float p,
- unsigned level,
- unsigned face_id,
+ const struct img_filter_args *args,
float *rgba)
{
const struct pipe_resource *texture = sp_sview->base.texture;
@@ -1237,17 +1225,17 @@ img_filter_2d_nearest(struct sp_sampler_view *sp_sview,
const float *out;
int c;
- width = u_minify(texture->width0, level);
- height = u_minify(texture->height0, level);
+ width = u_minify(texture->width0, args->level);
+ height = u_minify(texture->height0, args->level);
assert(width > 0);
assert(height > 0);
addr.value = 0;
- addr.bits.level = level;
+ addr.bits.level = args->level;
- sp_samp->nearest_texcoord_s(s, width, &x);
- sp_samp->nearest_texcoord_t(t, height, &y);
+ sp_samp->nearest_texcoord_s(args->s, width, args->offset[0], &x);
+ sp_samp->nearest_texcoord_t(args->t, height, args->offset[1], &y);
out = get_texel_2d(sp_sview, sp_samp, addr, x, y);
for (c = 0; c < TGSI_QUAD_SIZE; c++)
@@ -1262,11 +1250,7 @@ img_filter_2d_nearest(struct sp_sampler_view *sp_sview,
static void
img_filter_2d_array_nearest(struct sp_sampler_view *sp_sview,
struct sp_sampler *sp_samp,
- float s,
- float t,
- float p,
- unsigned level,
- unsigned face_id,
+ const struct img_filter_args *args,
float *rgba)
{
const struct pipe_resource *texture = sp_sview->base.texture;
@@ -1276,18 +1260,19 @@ img_filter_2d_array_nearest(struct sp_sampler_view *sp_sview,
const float *out;
int c;
- width = u_minify(texture->width0, level);
- height = u_minify(texture->height0, level);
+ width = u_minify(texture->width0, args->level);
+ height = u_minify(texture->height0, args->level);
assert(width > 0);
assert(height > 0);
addr.value = 0;
- addr.bits.level = level;
+ addr.bits.level = args->level;
- sp_samp->nearest_texcoord_s(s, width, &x);
- sp_samp->nearest_texcoord_t(t, height, &y);
- wrap_array_layer(p, texture->array_size, &layer);
+ sp_samp->nearest_texcoord_s(args->s, width, args->offset[0], &x);
+ sp_samp->nearest_texcoord_t(args->t, height, args->offset[1], &y);
+ layer = coord_to_layer(args->p, sp_sview->base.u.tex.first_layer,
+ sp_sview->base.u.tex.last_layer);
out = get_texel_2d_array(sp_sview, sp_samp, addr, x, y, layer);
for (c = 0; c < TGSI_QUAD_SIZE; c++)
@@ -1299,54 +1284,43 @@ img_filter_2d_array_nearest(struct sp_sampler_view *sp_sview,
}
-static INLINE union tex_tile_address
-face(union tex_tile_address addr, unsigned face )
-{
- addr.bits.face = face;
- return addr;
-}
-
-
static void
img_filter_cube_nearest(struct sp_sampler_view *sp_sview,
struct sp_sampler *sp_samp,
- float s,
- float t,
- float p,
- unsigned level,
- unsigned face_id,
+ const struct img_filter_args *args,
float *rgba)
{
const struct pipe_resource *texture = sp_sview->base.texture;
int width, height;
- int x, y;
+ int x, y, layerface;
union tex_tile_address addr;
const float *out;
int c;
- width = u_minify(texture->width0, level);
- height = u_minify(texture->height0, level);
+ width = u_minify(texture->width0, args->level);
+ height = u_minify(texture->height0, args->level);
assert(width > 0);
assert(height > 0);
addr.value = 0;
- addr.bits.level = level;
+ addr.bits.level = args->level;
/*
* If NEAREST filtering is done within a miplevel, always apply wrap
* mode CLAMP_TO_EDGE.
*/
if (sp_samp->base.seamless_cube_map) {
- wrap_nearest_clamp_to_edge(s, width, &x);
- wrap_nearest_clamp_to_edge(t, height, &y);
+ wrap_nearest_clamp_to_edge(args->s, width, args->offset[0], &x);
+ wrap_nearest_clamp_to_edge(args->t, height, args->offset[1], &y);
} else {
/* Would probably make sense to ignore mode and just do edge clamp */
- sp_samp->nearest_texcoord_s(s, width, &x);
- sp_samp->nearest_texcoord_t(t, height, &y);
+ sp_samp->nearest_texcoord_s(args->s, width, args->offset[0], &x);
+ sp_samp->nearest_texcoord_t(args->t, height, args->offset[1], &y);
}
- out = get_texel_2d(sp_sview, sp_samp, face(addr, face_id), x, y);
+ layerface = args->face_id + sp_sview->base.u.tex.first_layer;
+ out = get_texel_cube_array(sp_sview, sp_samp, addr, x, y, layerface);
for (c = 0; c < TGSI_QUAD_SIZE; c++)
rgba[TGSI_NUM_CHANNELS*c] = out[c];
@@ -1358,34 +1332,32 @@ img_filter_cube_nearest(struct sp_sampler_view *sp_sview,
static void
img_filter_cube_array_nearest(struct sp_sampler_view *sp_sview,
struct sp_sampler *sp_samp,
- float s,
- float t,
- float p,
- unsigned level,
- unsigned face_id,
+ const struct img_filter_args *args,
float *rgba)
{
const struct pipe_resource *texture = sp_sview->base.texture;
int width, height;
- int x, y, layer;
+ int x, y, layerface;
union tex_tile_address addr;
const float *out;
int c;
- width = u_minify(texture->width0, level);
- height = u_minify(texture->height0, level);
+ width = u_minify(texture->width0, args->level);
+ height = u_minify(texture->height0, args->level);
assert(width > 0);
assert(height > 0);
addr.value = 0;
- addr.bits.level = level;
+ addr.bits.level = args->level;
- sp_samp->nearest_texcoord_s(s, width, &x);
- sp_samp->nearest_texcoord_t(t, height, &y);
- wrap_array_layer(p, texture->array_size, &layer);
+ sp_samp->nearest_texcoord_s(args->s, width, args->offset[0], &x);
+ sp_samp->nearest_texcoord_t(args->t, height, args->offset[1], &y);
+ layerface = coord_to_layer(6 * args->p + sp_sview->base.u.tex.first_layer,
+ sp_sview->base.u.tex.first_layer,
+ sp_sview->base.u.tex.last_layer - 5) + args->face_id;
- out = get_texel_cube_array(sp_sview, sp_samp, addr, x, y, layer * 6 + face_id);
+ out = get_texel_cube_array(sp_sview, sp_samp, addr, x, y, layerface);
for (c = 0; c < TGSI_QUAD_SIZE; c++)
rgba[TGSI_NUM_CHANNELS*c] = out[c];
@@ -1397,11 +1369,7 @@ img_filter_cube_array_nearest(struct sp_sampler_view *sp_sview,
static void
img_filter_3d_nearest(struct sp_sampler_view *sp_sview,
struct sp_sampler *sp_samp,
- float s,
- float t,
- float p,
- unsigned level,
- unsigned face_id,
+ const struct img_filter_args *args,
float *rgba)
{
const struct pipe_resource *texture = sp_sview->base.texture;
@@ -1411,20 +1379,20 @@ img_filter_3d_nearest(struct sp_sampler_view *sp_sview,
const float *out;
int c;
- width = u_minify(texture->width0, level);
- height = u_minify(texture->height0, level);
- depth = u_minify(texture->depth0, level);
+ width = u_minify(texture->width0, args->level);
+ height = u_minify(texture->height0, args->level);
+ depth = u_minify(texture->depth0, args->level);
assert(width > 0);
assert(height > 0);
assert(depth > 0);
- sp_samp->nearest_texcoord_s(s, width, &x);
- sp_samp->nearest_texcoord_t(t, height, &y);
- sp_samp->nearest_texcoord_p(p, depth, &z);
+ sp_samp->nearest_texcoord_s(args->s, width, args->offset[0], &x);
+ sp_samp->nearest_texcoord_t(args->t, height, args->offset[1], &y);
+ sp_samp->nearest_texcoord_p(args->p, depth, args->offset[2], &z);
addr.value = 0;
- addr.bits.level = level;
+ addr.bits.level = args->level;
out = get_texel_3d(sp_sview, sp_samp, addr, x, y, z);
for (c = 0; c < TGSI_QUAD_SIZE; c++)
@@ -1435,11 +1403,7 @@ img_filter_3d_nearest(struct sp_sampler_view *sp_sview,
static void
img_filter_1d_linear(struct sp_sampler_view *sp_sview,
struct sp_sampler *sp_samp,
- float s,
- float t,
- float p,
- unsigned level,
- unsigned face_id,
+ const struct img_filter_args *args,
float *rgba)
{
const struct pipe_resource *texture = sp_sview->base.texture;
@@ -1450,14 +1414,14 @@ img_filter_1d_linear(struct sp_sampler_view *sp_sview,
const float *tx0, *tx1;
int c;
- width = u_minify(texture->width0, level);
+ width = u_minify(texture->width0, args->level);
assert(width > 0);
addr.value = 0;
- addr.bits.level = level;
+ addr.bits.level = args->level;
- sp_samp->linear_texcoord_s(s, width, &x0, &x1, &xw);
+ sp_samp->linear_texcoord_s(args->s, width, args->offset[0], &x0, &x1, &xw);
tx0 = get_texel_2d(sp_sview, sp_samp, addr, x0, 0);
tx1 = get_texel_2d(sp_sview, sp_samp, addr, x1, 0);
@@ -1471,11 +1435,7 @@ img_filter_1d_linear(struct sp_sampler_view *sp_sview,
static void
img_filter_1d_array_linear(struct sp_sampler_view *sp_sview,
struct sp_sampler *sp_samp,
- float s,
- float t,
- float p,
- unsigned level,
- unsigned face_id,
+ const struct img_filter_args *args,
float *rgba)
{
const struct pipe_resource *texture = sp_sview->base.texture;
@@ -1486,15 +1446,16 @@ img_filter_1d_array_linear(struct sp_sampler_view *sp_sview,
const float *tx0, *tx1;
int c;
- width = u_minify(texture->width0, level);
+ width = u_minify(texture->width0, args->level);
assert(width > 0);
addr.value = 0;
- addr.bits.level = level;
+ addr.bits.level = args->level;
- sp_samp->linear_texcoord_s(s, width, &x0, &x1, &xw);
- wrap_array_layer(t, texture->array_size, &layer);
+ sp_samp->linear_texcoord_s(args->s, width, args->offset[0], &x0, &x1, &xw);
+ layer = coord_to_layer(args->t, sp_sview->base.u.tex.first_layer,
+ sp_sview->base.u.tex.last_layer);
tx0 = get_texel_1d_array(sp_sview, sp_samp, addr, x0, layer);
tx1 = get_texel_1d_array(sp_sview, sp_samp, addr, x1, layer);
@@ -1504,15 +1465,77 @@ img_filter_1d_array_linear(struct sp_sampler_view *sp_sview,
rgba[TGSI_NUM_CHANNELS*c] = lerp(xw, tx0[c], tx1[c]);
}
+/*
+ * Retrieve the gathered value, need to convert to the
+ * TGSI expected interface, and take component select
+ * and swizzling into account.
+ */
+static float
+get_gather_value(const struct sp_sampler_view *sp_sview,
+ int chan_in, int comp_sel,
+ const float *tx[4])
+{
+ int chan;
+ unsigned swizzle;
+
+ /*
+ * softpipe samples in a different order
+ * to TGSI expects, so we need to swizzle,
+ * the samples into the correct slots.
+ */
+ switch (chan_in) {
+ case 0:
+ chan = 2;
+ break;
+ case 1:
+ chan = 3;
+ break;
+ case 2:
+ chan = 1;
+ break;
+ case 3:
+ chan = 0;
+ break;
+ default:
+ assert(0);
+ return 0.0;
+ }
+
+ /* pick which component to use for the swizzle */
+ switch (comp_sel) {
+ case 0:
+ swizzle = sp_sview->base.swizzle_r;
+ break;
+ case 1:
+ swizzle = sp_sview->base.swizzle_g;
+ break;
+ case 2:
+ swizzle = sp_sview->base.swizzle_b;
+ break;
+ case 3:
+ swizzle = sp_sview->base.swizzle_a;
+ break;
+ default:
+ assert(0);
+ return 0.0;
+ }
+
+ /* get correct result using the channel and swizzle */
+ switch (swizzle) {
+ case PIPE_SWIZZLE_ZERO:
+ return 0.0;
+ case PIPE_SWIZZLE_ONE:
+ return 1.0;
+ default:
+ return tx[chan][swizzle];
+ }
+}
+
static void
img_filter_2d_linear(struct sp_sampler_view *sp_sview,
struct sp_sampler *sp_samp,
- float s,
- float t,
- float p,
- unsigned level,
- unsigned face_id,
+ const struct img_filter_args *args,
float *rgba)
{
const struct pipe_resource *texture = sp_sview->base.texture;
@@ -1520,42 +1543,45 @@ img_filter_2d_linear(struct sp_sampler_view *sp_sview,
int x0, y0, x1, y1;
float xw, yw; /* weights */
union tex_tile_address addr;
- const float *tx0, *tx1, *tx2, *tx3;
+ const float *tx[4];
int c;
- width = u_minify(texture->width0, level);
- height = u_minify(texture->height0, level);
+ width = u_minify(texture->width0, args->level);
+ height = u_minify(texture->height0, args->level);
assert(width > 0);
assert(height > 0);
addr.value = 0;
- addr.bits.level = level;
+ addr.bits.level = args->level;
- sp_samp->linear_texcoord_s(s, width, &x0, &x1, &xw);
- sp_samp->linear_texcoord_t(t, height, &y0, &y1, &yw);
+ sp_samp->linear_texcoord_s(args->s, width, args->offset[0], &x0, &x1, &xw);
+ sp_samp->linear_texcoord_t(args->t, height, args->offset[1], &y0, &y1, &yw);
- tx0 = get_texel_2d(sp_sview, sp_samp, addr, x0, y0);
- tx1 = get_texel_2d(sp_sview, sp_samp, addr, x1, y0);
- tx2 = get_texel_2d(sp_sview, sp_samp, addr, x0, y1);
- tx3 = get_texel_2d(sp_sview, sp_samp, addr, x1, y1);
+ tx[0] = get_texel_2d(sp_sview, sp_samp, addr, x0, y0);
+ tx[1] = get_texel_2d(sp_sview, sp_samp, addr, x1, y0);
+ tx[2] = get_texel_2d(sp_sview, sp_samp, addr, x0, y1);
+ tx[3] = get_texel_2d(sp_sview, sp_samp, addr, x1, y1);
- /* interpolate R, G, B, A */
- for (c = 0; c < TGSI_QUAD_SIZE; c++)
- rgba[TGSI_NUM_CHANNELS*c] = lerp_2d(xw, yw,
- tx0[c], tx1[c],
- tx2[c], tx3[c]);
+ if (args->gather_only) {
+ for (c = 0; c < TGSI_QUAD_SIZE; c++)
+ rgba[TGSI_NUM_CHANNELS*c] = get_gather_value(sp_sview, c,
+ args->gather_comp,
+ tx);
+ } else {
+ /* interpolate R, G, B, A */
+ for (c = 0; c < TGSI_QUAD_SIZE; c++)
+ rgba[TGSI_NUM_CHANNELS*c] = lerp_2d(xw, yw,
+ tx[0][c], tx[1][c],
+ tx[2][c], tx[3][c]);
+ }
}
static void
img_filter_2d_array_linear(struct sp_sampler_view *sp_sview,
struct sp_sampler *sp_samp,
- float s,
- float t,
- float p,
- unsigned level,
- unsigned face_id,
+ const struct img_filter_args *args,
float *rgba)
{
const struct pipe_resource *texture = sp_sview->base.texture;
@@ -1563,63 +1589,67 @@ img_filter_2d_array_linear(struct sp_sampler_view *sp_sview,
int x0, y0, x1, y1, layer;
float xw, yw; /* weights */
union tex_tile_address addr;
- const float *tx0, *tx1, *tx2, *tx3;
+ const float *tx[4];
int c;
- width = u_minify(texture->width0, level);
- height = u_minify(texture->height0, level);
+ width = u_minify(texture->width0, args->level);
+ height = u_minify(texture->height0, args->level);
assert(width > 0);
assert(height > 0);
addr.value = 0;
- addr.bits.level = level;
-
- sp_samp->linear_texcoord_s(s, width, &x0, &x1, &xw);
- sp_samp->linear_texcoord_t(t, height, &y0, &y1, &yw);
- wrap_array_layer(p, texture->array_size, &layer);
-
- tx0 = get_texel_2d_array(sp_sview, sp_samp, addr, x0, y0, layer);
- tx1 = get_texel_2d_array(sp_sview, sp_samp, addr, x1, y0, layer);
- tx2 = get_texel_2d_array(sp_sview, sp_samp, addr, x0, y1, layer);
- tx3 = get_texel_2d_array(sp_sview, sp_samp, addr, x1, y1, layer);
-
- /* interpolate R, G, B, A */
- for (c = 0; c < TGSI_QUAD_SIZE; c++)
- rgba[TGSI_NUM_CHANNELS*c] = lerp_2d(xw, yw,
- tx0[c], tx1[c],
- tx2[c], tx3[c]);
+ addr.bits.level = args->level;
+
+ sp_samp->linear_texcoord_s(args->s, width, args->offset[0], &x0, &x1, &xw);
+ sp_samp->linear_texcoord_t(args->t, height, args->offset[1], &y0, &y1, &yw);
+ layer = coord_to_layer(args->p, sp_sview->base.u.tex.first_layer,
+ sp_sview->base.u.tex.last_layer);
+
+ tx[0] = get_texel_2d_array(sp_sview, sp_samp, addr, x0, y0, layer);
+ tx[1] = get_texel_2d_array(sp_sview, sp_samp, addr, x1, y0, layer);
+ tx[2] = get_texel_2d_array(sp_sview, sp_samp, addr, x0, y1, layer);
+ tx[3] = get_texel_2d_array(sp_sview, sp_samp, addr, x1, y1, layer);
+
+ if (args->gather_only) {
+ for (c = 0; c < TGSI_QUAD_SIZE; c++)
+ rgba[TGSI_NUM_CHANNELS*c] = get_gather_value(sp_sview, c,
+ args->gather_comp,
+ tx);
+ } else {
+ /* interpolate R, G, B, A */
+ for (c = 0; c < TGSI_QUAD_SIZE; c++)
+ rgba[TGSI_NUM_CHANNELS*c] = lerp_2d(xw, yw,
+ tx[0][c], tx[1][c],
+ tx[2][c], tx[3][c]);
+ }
}
static void
img_filter_cube_linear(struct sp_sampler_view *sp_sview,
struct sp_sampler *sp_samp,
- float s,
- float t,
- float p,
- unsigned level,
- unsigned face_id,
+ const struct img_filter_args *args,
float *rgba)
{
const struct pipe_resource *texture = sp_sview->base.texture;
int width, height;
- int x0, y0, x1, y1;
+ int x0, y0, x1, y1, layer;
float xw, yw; /* weights */
- union tex_tile_address addr, addrj;
- const float *tx0, *tx1, *tx2, *tx3;
+ union tex_tile_address addr;
+ const float *tx[4];
float corner0[TGSI_QUAD_SIZE], corner1[TGSI_QUAD_SIZE],
corner2[TGSI_QUAD_SIZE], corner3[TGSI_QUAD_SIZE];
int c;
- width = u_minify(texture->width0, level);
- height = u_minify(texture->height0, level);
+ width = u_minify(texture->width0, args->level);
+ height = u_minify(texture->height0, args->level);
assert(width > 0);
assert(height > 0);
addr.value = 0;
- addr.bits.level = level;
+ addr.bits.level = args->level;
/*
* For seamless if LINEAR filtering is done within a miplevel,
@@ -1627,43 +1657,47 @@ img_filter_cube_linear(struct sp_sampler_view *sp_sview,
*/
if (sp_samp->base.seamless_cube_map) {
/* Note this is a bit overkill, actual clamping is not required */
- wrap_linear_clamp_to_border(s, width, &x0, &x1, &xw);
- wrap_linear_clamp_to_border(t, height, &y0, &y1, &yw);
+ wrap_linear_clamp_to_border(args->s, width, args->offset[0], &x0, &x1, &xw);
+ wrap_linear_clamp_to_border(args->t, height, args->offset[1], &y0, &y1, &yw);
} else {
/* Would probably make sense to ignore mode and just do edge clamp */
- sp_samp->linear_texcoord_s(s, width, &x0, &x1, &xw);
- sp_samp->linear_texcoord_t(t, height, &y0, &y1, &yw);
+ sp_samp->linear_texcoord_s(args->s, width, args->offset[0], &x0, &x1, &xw);
+ sp_samp->linear_texcoord_t(args->t, height, args->offset[1], &y0, &y1, &yw);
}
- addrj = face(addr, face_id);
+ layer = sp_sview->base.u.tex.first_layer;
if (sp_samp->base.seamless_cube_map) {
- tx0 = get_texel_cube_seamless(sp_sview, addrj, x0, y0, corner0);
- tx1 = get_texel_cube_seamless(sp_sview, addrj, x1, y0, corner1);
- tx2 = get_texel_cube_seamless(sp_sview, addrj, x0, y1, corner2);
- tx3 = get_texel_cube_seamless(sp_sview, addrj, x1, y1, corner3);
+ tx[0] = get_texel_cube_seamless(sp_sview, addr, x0, y0, corner0, layer, args->face_id);
+ tx[1] = get_texel_cube_seamless(sp_sview, addr, x1, y0, corner1, layer, args->face_id);
+ tx[2] = get_texel_cube_seamless(sp_sview, addr, x0, y1, corner2, layer, args->face_id);
+ tx[3] = get_texel_cube_seamless(sp_sview, addr, x1, y1, corner3, layer, args->face_id);
} else {
- tx0 = get_texel_2d(sp_sview, sp_samp, addrj, x0, y0);
- tx1 = get_texel_2d(sp_sview, sp_samp, addrj, x1, y0);
- tx2 = get_texel_2d(sp_sview, sp_samp, addrj, x0, y1);
- tx3 = get_texel_2d(sp_sview, sp_samp, addrj, x1, y1);
+ tx[0] = get_texel_cube_array(sp_sview, sp_samp, addr, x0, y0, layer + args->face_id);
+ tx[1] = get_texel_cube_array(sp_sview, sp_samp, addr, x1, y0, layer + args->face_id);
+ tx[2] = get_texel_cube_array(sp_sview, sp_samp, addr, x0, y1, layer + args->face_id);
+ tx[3] = get_texel_cube_array(sp_sview, sp_samp, addr, x1, y1, layer + args->face_id);
+ }
+
+ if (args->gather_only) {
+ for (c = 0; c < TGSI_QUAD_SIZE; c++)
+ rgba[TGSI_NUM_CHANNELS*c] = get_gather_value(sp_sview, c,
+ args->gather_comp,
+ tx);
+ } else {
+ /* interpolate R, G, B, A */
+ for (c = 0; c < TGSI_QUAD_SIZE; c++)
+ rgba[TGSI_NUM_CHANNELS*c] = lerp_2d(xw, yw,
+ tx[0][c], tx[1][c],
+ tx[2][c], tx[3][c]);
}
- /* interpolate R, G, B, A */
- for (c = 0; c < TGSI_QUAD_SIZE; c++)
- rgba[TGSI_NUM_CHANNELS*c] = lerp_2d(xw, yw,
- tx0[c], tx1[c],
- tx2[c], tx3[c]);
}
static void
img_filter_cube_array_linear(struct sp_sampler_view *sp_sview,
struct sp_sampler *sp_samp,
- float s,
- float t,
- float p,
- unsigned level,
- unsigned face_id,
+ const struct img_filter_args *args,
float *rgba)
{
const struct pipe_resource *texture = sp_sview->base.texture;
@@ -1671,42 +1705,68 @@ img_filter_cube_array_linear(struct sp_sampler_view *sp_sview,
int x0, y0, x1, y1, layer;
float xw, yw; /* weights */
union tex_tile_address addr;
- const float *tx0, *tx1, *tx2, *tx3;
+ const float *tx[4];
+ float corner0[TGSI_QUAD_SIZE], corner1[TGSI_QUAD_SIZE],
+ corner2[TGSI_QUAD_SIZE], corner3[TGSI_QUAD_SIZE];
int c;
- width = u_minify(texture->width0, level);
- height = u_minify(texture->height0, level);
+ width = u_minify(texture->width0, args->level);
+ height = u_minify(texture->height0, args->level);
assert(width > 0);
assert(height > 0);
addr.value = 0;
- addr.bits.level = level;
+ addr.bits.level = args->level;
- sp_samp->linear_texcoord_s(s, width, &x0, &x1, &xw);
- sp_samp->linear_texcoord_t(t, height, &y0, &y1, &yw);
- wrap_array_layer(p, texture->array_size, &layer);
+ /*
+ * For seamless if LINEAR filtering is done within a miplevel,
+ * always apply wrap mode CLAMP_TO_BORDER.
+ */
+ if (sp_samp->base.seamless_cube_map) {
+ /* Note this is a bit overkill, actual clamping is not required */
+ wrap_linear_clamp_to_border(args->s, width, args->offset[0], &x0, &x1, &xw);
+ wrap_linear_clamp_to_border(args->t, height, args->offset[1], &y0, &y1, &yw);
+ } else {
+ /* Would probably make sense to ignore mode and just do edge clamp */
+ sp_samp->linear_texcoord_s(args->s, width, args->offset[0], &x0, &x1, &xw);
+ sp_samp->linear_texcoord_t(args->t, height, args->offset[1], &y0, &y1, &yw);
+ }
- tx0 = get_texel_cube_array(sp_sview, sp_samp, addr, x0, y0, layer * 6 + face_id);
- tx1 = get_texel_cube_array(sp_sview, sp_samp, addr, x1, y0, layer * 6 + face_id);
- tx2 = get_texel_cube_array(sp_sview, sp_samp, addr, x0, y1, layer * 6 + face_id);
- tx3 = get_texel_cube_array(sp_sview, sp_samp, addr, x1, y1, layer * 6 + face_id);
+ layer = coord_to_layer(6 * args->p + sp_sview->base.u.tex.first_layer,
+ sp_sview->base.u.tex.first_layer,
+ sp_sview->base.u.tex.last_layer - 5);
- /* interpolate R, G, B, A */
- for (c = 0; c < TGSI_QUAD_SIZE; c++)
- rgba[TGSI_NUM_CHANNELS*c] = lerp_2d(xw, yw,
- tx0[c], tx1[c],
- tx2[c], tx3[c]);
+ if (sp_samp->base.seamless_cube_map) {
+ tx[0] = get_texel_cube_seamless(sp_sview, addr, x0, y0, corner0, layer, args->face_id);
+ tx[1] = get_texel_cube_seamless(sp_sview, addr, x1, y0, corner1, layer, args->face_id);
+ tx[2] = get_texel_cube_seamless(sp_sview, addr, x0, y1, corner2, layer, args->face_id);
+ tx[3] = get_texel_cube_seamless(sp_sview, addr, x1, y1, corner3, layer, args->face_id);
+ } else {
+ tx[0] = get_texel_cube_array(sp_sview, sp_samp, addr, x0, y0, layer + args->face_id);
+ tx[1] = get_texel_cube_array(sp_sview, sp_samp, addr, x1, y0, layer + args->face_id);
+ tx[2] = get_texel_cube_array(sp_sview, sp_samp, addr, x0, y1, layer + args->face_id);
+ tx[3] = get_texel_cube_array(sp_sview, sp_samp, addr, x1, y1, layer + args->face_id);
+ }
+
+ if (args->gather_only) {
+ for (c = 0; c < TGSI_QUAD_SIZE; c++)
+ rgba[TGSI_NUM_CHANNELS*c] = get_gather_value(sp_sview, c,
+ args->gather_comp,
+ tx);
+ } else {
+ /* interpolate R, G, B, A */
+ for (c = 0; c < TGSI_QUAD_SIZE; c++)
+ rgba[TGSI_NUM_CHANNELS*c] = lerp_2d(xw, yw,
+ tx[0][c], tx[1][c],
+ tx[2][c], tx[3][c]);
+ }
}
static void
img_filter_3d_linear(struct sp_sampler_view *sp_sview,
struct sp_sampler *sp_samp,
- float s,
- float t,
- float p,
- unsigned level,
- unsigned face_id,
+ const struct img_filter_args *args,
float *rgba)
{
const struct pipe_resource *texture = sp_sview->base.texture;
@@ -1717,21 +1777,20 @@ img_filter_3d_linear(struct sp_sampler_view *sp_sview,
const float *tx00, *tx01, *tx02, *tx03, *tx10, *tx11, *tx12, *tx13;
int c;
- width = u_minify(texture->width0, level);
- height = u_minify(texture->height0, level);
- depth = u_minify(texture->depth0, level);
+ width = u_minify(texture->width0, args->level);
+ height = u_minify(texture->height0, args->level);
+ depth = u_minify(texture->depth0, args->level);
addr.value = 0;
- addr.bits.level = level;
+ addr.bits.level = args->level;
assert(width > 0);
assert(height > 0);
assert(depth > 0);
- sp_samp->linear_texcoord_s(s, width, &x0, &x1, &xw);
- sp_samp->linear_texcoord_t(t, height, &y0, &y1, &yw);
- sp_samp->linear_texcoord_p(p, depth, &z0, &z1, &zw);
-
+ sp_samp->linear_texcoord_s(args->s, width, args->offset[0], &x0, &x1, &xw);
+ sp_samp->linear_texcoord_t(args->t, height, args->offset[1], &y0, &y1, &yw);
+ sp_samp->linear_texcoord_p(args->p, depth, args->offset[2], &z0, &z1, &zw);
tx00 = get_texel_3d(sp_sview, sp_samp, addr, x0, y0, z0);
tx01 = get_texel_3d(sp_sview, sp_samp, addr, x1, y0, z0);
@@ -1837,6 +1896,7 @@ compute_lambda_lod(struct sp_sampler_view *sp_sview,
}
break;
case tgsi_sampler_lod_zero:
+ case tgsi_sampler_gather:
/* this is all static state in the sampler really need clamp here? */
lod[0] = lod[1] = lod[2] = lod[3] = CLAMP(lod_bias, min_lod, max_lod);
break;
@@ -1846,6 +1906,12 @@ compute_lambda_lod(struct sp_sampler_view *sp_sview,
}
}
+static INLINE unsigned
+get_gather_component(const float lod_in[TGSI_QUAD_SIZE])
+{
+ /* gather component is stored in lod_in slot as unsigned */
+ return (*(unsigned int *)lod_in) & 0x3;
+}
static void
mip_filter_linear(struct sp_sampler_view *sp_sview,
@@ -1857,36 +1923,45 @@ mip_filter_linear(struct sp_sampler_view *sp_sview,
const float p[TGSI_QUAD_SIZE],
const float c0[TGSI_QUAD_SIZE],
const float lod_in[TGSI_QUAD_SIZE],
- enum tgsi_sampler_control control,
+ const struct filter_args *filt_args,
float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
{
const struct pipe_sampler_view *psview = &sp_sview->base;
int j;
float lod[TGSI_QUAD_SIZE];
+ struct img_filter_args args;
- compute_lambda_lod(sp_sview, sp_samp, s, t, p, lod_in, control, lod);
+ compute_lambda_lod(sp_sview, sp_samp, s, t, p, lod_in, filt_args->control, lod);
+
+ args.offset = filt_args->offset;
+ args.gather_only = filt_args->control == tgsi_sampler_gather;
+ args.gather_comp = get_gather_component(lod_in);
for (j = 0; j < TGSI_QUAD_SIZE; j++) {
int level0 = psview->u.tex.first_level + (int)lod[j];
- if (lod[j] < 0.0)
- mag_filter(sp_sview, sp_samp, s[j], t[j], p[j],
- psview->u.tex.first_level,
- sp_sview->faces[j], &rgba[0][j]);
-
- else if (level0 >= (int) psview->u.tex.last_level)
- min_filter(sp_sview, sp_samp, s[j], t[j], p[j], psview->u.tex.last_level,
- sp_sview->faces[j], &rgba[0][j]);
+ args.s = s[j];
+ args.t = t[j];
+ args.p = p[j];
+ args.face_id = sp_sview->faces[j];
+ if (lod[j] < 0.0) {
+ args.level = psview->u.tex.first_level;
+ mag_filter(sp_sview, sp_samp, &args, &rgba[0][j]);
+ }
+ else if (level0 >= (int) psview->u.tex.last_level) {
+ args.level = psview->u.tex.last_level;
+ min_filter(sp_sview, sp_samp, &args, &rgba[0][j]);
+ }
else {
float levelBlend = frac(lod[j]);
float rgbax[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
int c;
- min_filter(sp_sview, sp_samp, s[j], t[j], p[j], level0,
- sp_sview->faces[j], &rgbax[0][0]);
- min_filter(sp_sview, sp_samp, s[j], t[j], p[j], level0+1,
- sp_sview->faces[j], &rgbax[0][1]);
+ args.level = level0;
+ min_filter(sp_sview, sp_samp, &args, &rgbax[0][0]);
+ args.level = level0+1;
+ min_filter(sp_sview, sp_samp, &args, &rgbax[0][1]);
for (c = 0; c < 4; c++) {
rgba[c][j] = lerp(levelBlend, rgbax[c][0], rgbax[c][1]);
@@ -1915,25 +1990,33 @@ mip_filter_nearest(struct sp_sampler_view *sp_sview,
const float p[TGSI_QUAD_SIZE],
const float c0[TGSI_QUAD_SIZE],
const float lod_in[TGSI_QUAD_SIZE],
- enum tgsi_sampler_control control,
+ const struct filter_args *filt_args,
float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
{
const struct pipe_sampler_view *psview = &sp_sview->base;
float lod[TGSI_QUAD_SIZE];
int j;
+ struct img_filter_args args;
+
+ args.offset = filt_args->offset;
+ args.gather_only = filt_args->control == tgsi_sampler_gather;
+ args.gather_comp = get_gather_component(lod_in);
- compute_lambda_lod(sp_sview, sp_samp, s, t, p, lod_in, control, lod);
+ compute_lambda_lod(sp_sview, sp_samp, s, t, p, lod_in, filt_args->control, lod);
for (j = 0; j < TGSI_QUAD_SIZE; j++) {
- if (lod[j] < 0.0)
- mag_filter(sp_sview, sp_samp, s[j], t[j], p[j],
- psview->u.tex.first_level,
- sp_sview->faces[j], &rgba[0][j]);
- else {
+ args.s = s[j];
+ args.t = t[j];
+ args.p = p[j];
+ args.face_id = sp_sview->faces[j];
+
+ if (lod[j] < 0.0) {
+ args.level = psview->u.tex.first_level;
+ mag_filter(sp_sview, sp_samp, &args, &rgba[0][j]);
+ } else {
int level = psview->u.tex.first_level + (int)(lod[j] + 0.5F);
- level = MIN2(level, (int)psview->u.tex.last_level);
- min_filter(sp_sview, sp_samp, s[j], t[j], p[j],
- level, sp_sview->faces[j], &rgba[0][j]);
+ args.level = MIN2(level, (int)psview->u.tex.last_level);
+ min_filter(sp_sview, sp_samp, &args, &rgba[0][j]);
}
}
@@ -1953,24 +2036,29 @@ mip_filter_none(struct sp_sampler_view *sp_sview,
const float p[TGSI_QUAD_SIZE],
const float c0[TGSI_QUAD_SIZE],
const float lod_in[TGSI_QUAD_SIZE],
- enum tgsi_sampler_control control,
+ const struct filter_args *filt_args,
float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
{
float lod[TGSI_QUAD_SIZE];
int j;
+ struct img_filter_args args;
+
+ args.level = sp_sview->base.u.tex.first_level;
+ args.offset = filt_args->offset;
+ args.gather_only = filt_args->control == tgsi_sampler_gather;
- compute_lambda_lod(sp_sview, sp_samp, s, t, p, lod_in, control, lod);
+ compute_lambda_lod(sp_sview, sp_samp, s, t, p, lod_in, filt_args->control, lod);
for (j = 0; j < TGSI_QUAD_SIZE; j++) {
- if (lod[j] < 0.0) {
- mag_filter(sp_sview, sp_samp, s[j], t[j], p[j],
- sp_sview->base.u.tex.first_level,
- sp_sview->faces[j], &rgba[0][j]);
+ args.s = s[j];
+ args.t = t[j];
+ args.p = p[j];
+ args.face_id = sp_sview->faces[j];
+ if (lod[j] < 0.0) {
+ mag_filter(sp_sview, sp_samp, &args, &rgba[0][j]);
}
else {
- min_filter(sp_sview, sp_samp, s[j], t[j], p[j],
- sp_sview->base.u.tex.first_level,
- sp_sview->faces[j], &rgba[0][j]);
+ min_filter(sp_sview, sp_samp, &args, &rgba[0][j]);
}
}
}
@@ -1986,15 +2074,21 @@ mip_filter_none_no_filter_select(struct sp_sampler_view *sp_sview,
const float p[TGSI_QUAD_SIZE],
const float c0[TGSI_QUAD_SIZE],
const float lod_in[TGSI_QUAD_SIZE],
- enum tgsi_sampler_control control,
+ const struct filter_args *filt_args,
float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
{
int j;
-
- for (j = 0; j < TGSI_QUAD_SIZE; j++)
- mag_filter(sp_sview, sp_samp, s[j], t[j], p[j],
- sp_sview->base.u.tex.first_level,
- sp_sview->faces[j], &rgba[0][j]);
+ struct img_filter_args args;
+ args.level = sp_sview->base.u.tex.first_level;
+ args.offset = filt_args->offset;
+ args.gather_only = filt_args->control == tgsi_sampler_gather;
+ for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+ args.s = s[j];
+ args.t = t[j];
+ args.p = p[j];
+ args.face_id = sp_sview->faces[j];
+ mag_filter(sp_sview, sp_samp, &args, &rgba[0][j]);
+ }
}
@@ -2050,7 +2144,7 @@ img_filter_2d_ewa(struct sp_sampler_view *sp_sview,
float scaling = 1.0f / (1 << level0);
int width = u_minify(texture->width0, level0);
int height = u_minify(texture->height0, level0);
-
+ struct img_filter_args args;
float ux = dudx * scaling;
float vx = dvdx * scaling;
float uy = dudy * scaling;
@@ -2100,7 +2194,8 @@ img_filter_2d_ewa(struct sp_sampler_view *sp_sview,
* full, then the pixel values are read from the image.
*/
ddq = 2 * A;
-
+
+ args.level = level;
for (j = 0; j < TGSI_QUAD_SIZE; j++) {
/* Heckbert MS thesis, p. 59; scan over the bounding box of the ellipse
* and incrementally update the value of Ax^2+Bxy*Cy^2; when this
@@ -2117,6 +2212,8 @@ img_filter_2d_ewa(struct sp_sampler_view *sp_sview,
float num[4] = {0.0F, 0.0F, 0.0F, 0.0F};
buffer_next = 0;
den = 0;
+ args.face_id = sp_sview->faces[j];
+
U = u0 - tex_u;
for (v = v0; v <= v1; ++v) {
float V = v - tex_v;
@@ -2148,8 +2245,10 @@ img_filter_2d_ewa(struct sp_sampler_view *sp_sview,
* accelerated img_filter_2d_nearest_XXX functions.
*/
for (jj = 0; jj < buffer_next; jj++) {
- min_filter(sp_sview, sp_samp, s_buffer[jj], t_buffer[jj], p[jj],
- level, sp_sview->faces[j], &rgba_temp[0][jj]);
+ args.s = s_buffer[jj];
+ args.t = t_buffer[jj];
+ args.p = p[jj];
+ min_filter(sp_sview, sp_samp, &args, &rgba_temp[0][jj]);
num[0] += weight_buffer[jj] * rgba_temp[0][jj];
num[1] += weight_buffer[jj] * rgba_temp[1][jj];
num[2] += weight_buffer[jj] * rgba_temp[2][jj];
@@ -2176,8 +2275,10 @@ img_filter_2d_ewa(struct sp_sampler_view *sp_sview,
* accelerated img_filter_2d_nearest_XXX functions.
*/
for (jj = 0; jj < buffer_next; jj++) {
- min_filter(sp_sview, sp_samp, s_buffer[jj], t_buffer[jj], p[jj],
- level, sp_sview->faces[j], &rgba_temp[0][jj]);
+ args.s = s_buffer[jj];
+ args.t = t_buffer[jj];
+ args.p = p[jj];
+ min_filter(sp_sview, sp_samp, &args, &rgba_temp[0][jj]);
num[0] += weight_buffer[jj] * rgba_temp[0][jj];
num[1] += weight_buffer[jj] * rgba_temp[1][jj];
num[2] += weight_buffer[jj] * rgba_temp[2][jj];
@@ -2196,8 +2297,10 @@ img_filter_2d_ewa(struct sp_sampler_view *sp_sview,
rgba[2]=0;
rgba[3]=0;*/
/* not enough pixels in resampling, resort to direct interpolation */
- min_filter(sp_sview, sp_samp, s[j], t[j], p[j], level,
- sp_sview->faces[j], &rgba_temp[0][j]);
+ args.s = s[j];
+ args.t = t[j];
+ args.p = p[j];
+ min_filter(sp_sview, sp_samp, &args, &rgba_temp[0][j]);
den = 1;
num[0] = rgba_temp[0][j];
num[1] = rgba_temp[1][j];
@@ -2226,7 +2329,7 @@ mip_filter_linear_aniso(struct sp_sampler_view *sp_sview,
const float p[TGSI_QUAD_SIZE],
const float c0[TGSI_QUAD_SIZE],
const float lod_in[TGSI_QUAD_SIZE],
- enum tgsi_sampler_control control,
+ const struct filter_args *filt_args,
float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
{
const struct pipe_resource *texture = sp_sview->base.texture;
@@ -2241,11 +2344,12 @@ mip_filter_linear_aniso(struct sp_sampler_view *sp_sview,
float dudy = (s[QUAD_TOP_LEFT] - s[QUAD_BOTTOM_LEFT]) * s_to_u;
float dvdx = (t[QUAD_BOTTOM_RIGHT] - t[QUAD_BOTTOM_LEFT]) * t_to_v;
float dvdy = (t[QUAD_TOP_LEFT] - t[QUAD_BOTTOM_LEFT]) * t_to_v;
-
- if (control == tgsi_sampler_lod_bias ||
- control == tgsi_sampler_lod_none ||
+ struct img_filter_args args;
+
+ if (filt_args->control == tgsi_sampler_lod_bias ||
+ filt_args->control == tgsi_sampler_lod_none ||
/* XXX FIXME */
- control == tgsi_sampler_derivs_explicit) {
+ filt_args->control == tgsi_sampler_derivs_explicit) {
/* note: instead of working with Px and Py, we will use the
* squared length instead, to avoid sqrt.
*/
@@ -2282,12 +2386,12 @@ mip_filter_linear_aniso(struct sp_sampler_view *sp_sview,
* this since 0.5*log(x) = log(sqrt(x))
*/
lambda = 0.5F * util_fast_log2(Pmin2) + sp_samp->base.lod_bias;
- compute_lod(&sp_samp->base, control, lambda, lod_in, lod);
+ compute_lod(&sp_samp->base, filt_args->control, lambda, lod_in, lod);
}
else {
- assert(control == tgsi_sampler_lod_explicit ||
- control == tgsi_sampler_lod_zero);
- compute_lod(&sp_samp->base, control, sp_samp->base.lod_bias, lod_in, lod);
+ assert(filt_args->control == tgsi_sampler_lod_explicit ||
+ filt_args->control == tgsi_sampler_lod_zero);
+ compute_lod(&sp_samp->base, filt_args->control, sp_samp->base.lod_bias, lod_in, lod);
}
/* XXX: Take into account all lod values.
@@ -2300,9 +2404,14 @@ mip_filter_linear_aniso(struct sp_sampler_view *sp_sview,
*/
if (level0 >= (int) psview->u.tex.last_level) {
int j;
- for (j = 0; j < TGSI_QUAD_SIZE; j++)
- min_filter(sp_sview, sp_samp, s[j], t[j], p[j], psview->u.tex.last_level,
- sp_sview->faces[j], &rgba[0][j]);
+ for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+ args.s = s[j];
+ args.t = t[j];
+ args.p = p[j];
+ args.level = psview->u.tex.last_level;
+ args.face_id = sp_sview->faces[j];
+ min_filter(sp_sview, sp_samp, &args, &rgba[0][j]);
+ }
}
else {
/* don't bother interpolating between multiple LODs; it doesn't
@@ -2334,29 +2443,33 @@ mip_filter_linear_2d_linear_repeat_POT(
const float p[TGSI_QUAD_SIZE],
const float c0[TGSI_QUAD_SIZE],
const float lod_in[TGSI_QUAD_SIZE],
- enum tgsi_sampler_control control,
+ const struct filter_args *filt_args,
float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
{
const struct pipe_sampler_view *psview = &sp_sview->base;
int j;
float lod[TGSI_QUAD_SIZE];
- compute_lambda_lod(sp_sview, sp_samp, s, t, p, lod_in, control, lod);
+ compute_lambda_lod(sp_sview, sp_samp, s, t, p, lod_in, filt_args->control, lod);
for (j = 0; j < TGSI_QUAD_SIZE; j++) {
int level0 = psview->u.tex.first_level + (int)lod[j];
-
+ struct img_filter_args args;
/* Catches both negative and large values of level0:
*/
+ args.s = s[j];
+ args.t = t[j];
+ args.p = p[j];
+ args.face_id = sp_sview->faces[j];
+ args.offset = filt_args->offset;
+ args.gather_only = filt_args->control == tgsi_sampler_gather;
if ((unsigned)level0 >= psview->u.tex.last_level) {
if (level0 < 0)
- img_filter_2d_linear_repeat_POT(sp_sview, sp_samp, s[j], t[j], p[j],
- psview->u.tex.first_level,
- sp_sview->faces[j], &rgba[0][j]);
+ args.level = psview->u.tex.first_level;
else
- img_filter_2d_linear_repeat_POT(sp_sview, sp_samp, s[j], t[j], p[j],
- psview->u.tex.last_level,
- sp_sview->faces[j], &rgba[0][j]);
+ args.level = psview->u.tex.last_level;
+ img_filter_2d_linear_repeat_POT(sp_sview, sp_samp, &args,
+ &rgba[0][j]);
}
else {
@@ -2364,10 +2477,10 @@ mip_filter_linear_2d_linear_repeat_POT(
float rgbax[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
int c;
- img_filter_2d_linear_repeat_POT(sp_sview, sp_samp, s[j], t[j], p[j], level0,
- sp_sview->faces[j], &rgbax[0][0]);
- img_filter_2d_linear_repeat_POT(sp_sview, sp_samp, s[j], t[j], p[j], level0+1,
- sp_sview->faces[j], &rgbax[0][1]);
+ args.level = level0;
+ img_filter_2d_linear_repeat_POT(sp_sview, sp_samp, &args, &rgbax[0][0]);
+ args.level = level0+1;
+ img_filter_2d_linear_repeat_POT(sp_sview, sp_samp, &args, &rgbax[0][1]);
for (c = 0; c < TGSI_NUM_CHANNELS; c++)
rgba[c][j] = lerp(levelBlend, rgbax[c][0], rgbax[c][1]);
@@ -2395,11 +2508,12 @@ sample_compare(struct sp_sampler_view *sp_sview,
float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
{
const struct pipe_sampler_state *sampler = &sp_samp->base;
- int j;
- int k[4];
+ int j, v;
+ int k[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
float pc[4];
const struct util_format_description *format_desc;
unsigned chan_type;
+ bool is_gather = (control == tgsi_sampler_gather);
/**
* Compare texcoord 'p' (aka R) against texture value 'rgba[0]'
@@ -2408,13 +2522,13 @@ sample_compare(struct sp_sampler_view *sp_sview,
* RGBA channels. We look at the red channel here.
*/
- if (sp_sview->base.texture->target == PIPE_TEXTURE_2D_ARRAY ||
- sp_sview->base.texture->target == PIPE_TEXTURE_CUBE) {
+ if (sp_sview->base.target == PIPE_TEXTURE_2D_ARRAY ||
+ sp_sview->base.target == PIPE_TEXTURE_CUBE) {
pc[0] = c0[0];
pc[1] = c0[1];
pc[2] = c0[2];
pc[3] = c0[3];
- } else if (sp_sview->base.texture->target == PIPE_TEXTURE_CUBE_ARRAY) {
+ } else if (sp_sview->base.target == PIPE_TEXTURE_CUBE_ARRAY) {
pc[0] = c1[0];
pc[1] = c1[1];
pc[2] = c1[2];
@@ -2443,65 +2557,74 @@ sample_compare(struct sp_sampler_view *sp_sview,
pc[3] = CLAMP(pc[3], 0.0F, 1.0F);
}
- /* compare four texcoords vs. four texture samples */
- switch (sampler->compare_func) {
- case PIPE_FUNC_LESS:
- k[0] = pc[0] < rgba[0][0];
- k[1] = pc[1] < rgba[0][1];
- k[2] = pc[2] < rgba[0][2];
- k[3] = pc[3] < rgba[0][3];
- break;
- case PIPE_FUNC_LEQUAL:
- k[0] = pc[0] <= rgba[0][0];
- k[1] = pc[1] <= rgba[0][1];
- k[2] = pc[2] <= rgba[0][2];
- k[3] = pc[3] <= rgba[0][3];
- break;
- case PIPE_FUNC_GREATER:
- k[0] = pc[0] > rgba[0][0];
- k[1] = pc[1] > rgba[0][1];
- k[2] = pc[2] > rgba[0][2];
- k[3] = pc[3] > rgba[0][3];
- break;
- case PIPE_FUNC_GEQUAL:
- k[0] = pc[0] >= rgba[0][0];
- k[1] = pc[1] >= rgba[0][1];
- k[2] = pc[2] >= rgba[0][2];
- k[3] = pc[3] >= rgba[0][3];
- break;
- case PIPE_FUNC_EQUAL:
- k[0] = pc[0] == rgba[0][0];
- k[1] = pc[1] == rgba[0][1];
- k[2] = pc[2] == rgba[0][2];
- k[3] = pc[3] == rgba[0][3];
- break;
- case PIPE_FUNC_NOTEQUAL:
- k[0] = pc[0] != rgba[0][0];
- k[1] = pc[1] != rgba[0][1];
- k[2] = pc[2] != rgba[0][2];
- k[3] = pc[3] != rgba[0][3];
- break;
- case PIPE_FUNC_ALWAYS:
- k[0] = k[1] = k[2] = k[3] = 1;
- break;
- case PIPE_FUNC_NEVER:
- k[0] = k[1] = k[2] = k[3] = 0;
- break;
- default:
- k[0] = k[1] = k[2] = k[3] = 0;
- assert(0);
- break;
+ for (v = 0; v < (is_gather ? TGSI_NUM_CHANNELS : 1); v++) {
+ /* compare four texcoords vs. four texture samples */
+ switch (sampler->compare_func) {
+ case PIPE_FUNC_LESS:
+ k[v][0] = pc[0] < rgba[v][0];
+ k[v][1] = pc[1] < rgba[v][1];
+ k[v][2] = pc[2] < rgba[v][2];
+ k[v][3] = pc[3] < rgba[v][3];
+ break;
+ case PIPE_FUNC_LEQUAL:
+ k[v][0] = pc[0] <= rgba[v][0];
+ k[v][1] = pc[1] <= rgba[v][1];
+ k[v][2] = pc[2] <= rgba[v][2];
+ k[v][3] = pc[3] <= rgba[v][3];
+ break;
+ case PIPE_FUNC_GREATER:
+ k[v][0] = pc[0] > rgba[v][0];
+ k[v][1] = pc[1] > rgba[v][1];
+ k[v][2] = pc[2] > rgba[v][2];
+ k[v][3] = pc[3] > rgba[v][3];
+ break;
+ case PIPE_FUNC_GEQUAL:
+ k[v][0] = pc[0] >= rgba[v][0];
+ k[v][1] = pc[1] >= rgba[v][1];
+ k[v][2] = pc[2] >= rgba[v][2];
+ k[v][3] = pc[3] >= rgba[v][3];
+ break;
+ case PIPE_FUNC_EQUAL:
+ k[v][0] = pc[0] == rgba[v][0];
+ k[v][1] = pc[1] == rgba[v][1];
+ k[v][2] = pc[2] == rgba[v][2];
+ k[v][3] = pc[3] == rgba[v][3];
+ break;
+ case PIPE_FUNC_NOTEQUAL:
+ k[v][0] = pc[0] != rgba[v][0];
+ k[v][1] = pc[1] != rgba[v][1];
+ k[v][2] = pc[2] != rgba[v][2];
+ k[v][3] = pc[3] != rgba[v][3];
+ break;
+ case PIPE_FUNC_ALWAYS:
+ k[v][0] = k[v][1] = k[v][2] = k[v][3] = 1;
+ break;
+ case PIPE_FUNC_NEVER:
+ k[v][0] = k[v][1] = k[v][2] = k[v][3] = 0;
+ break;
+ default:
+ k[v][0] = k[v][1] = k[v][2] = k[v][3] = 0;
+ assert(0);
+ break;
+ }
}
- for (j = 0; j < TGSI_QUAD_SIZE; j++) {
- rgba[0][j] = k[j];
- rgba[1][j] = k[j];
- rgba[2][j] = k[j];
- rgba[3][j] = 1.0F;
+ if (is_gather) {
+ for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+ for (v = 0; v < TGSI_NUM_CHANNELS; v++) {
+ rgba[v][j] = k[v][j];
+ }
+ }
+ } else {
+ for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+ rgba[0][j] = k[0][j];
+ rgba[1][j] = k[0][j];
+ rgba[2][j] = k[0][j];
+ rgba[3][j] = 1.0F;
+ }
}
}
-
static void
do_swizzling(const struct pipe_sampler_view *sview,
float in[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE],
@@ -2679,9 +2802,9 @@ any_swizzle(const struct pipe_sampler_view *view)
static img_filter_func
get_img_filter(const struct sp_sampler_view *sp_sview,
const struct pipe_sampler_state *sampler,
- unsigned filter)
+ unsigned filter, bool gather)
{
- switch (sp_sview->base.texture->target) {
+ switch (sp_sview->base.target) {
case PIPE_BUFFER:
case PIPE_TEXTURE_1D:
if (filter == PIPE_TEX_FILTER_NEAREST)
@@ -2699,7 +2822,7 @@ get_img_filter(const struct sp_sampler_view *sp_sview,
case PIPE_TEXTURE_RECT:
/* Try for fast path:
*/
- if (sp_sview->pot2d &&
+ if (!gather && sp_sview->pot2d &&
sampler->wrap_s == sampler->wrap_t &&
sampler->normalized_coords)
{
@@ -2769,35 +2892,38 @@ sample_mip(struct sp_sampler_view *sp_sview,
const float p[TGSI_QUAD_SIZE],
const float c0[TGSI_QUAD_SIZE],
const float lod[TGSI_QUAD_SIZE],
- enum tgsi_sampler_control control,
+ const struct filter_args *filt_args,
float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
{
mip_filter_func mip_filter;
img_filter_func min_img_filter = NULL;
img_filter_func mag_img_filter = NULL;
- if (sp_sview->pot2d & sp_samp->min_mag_equal_repeat_linear) {
+ if (filt_args->control == tgsi_sampler_gather) {
+ mip_filter = mip_filter_nearest;
+ min_img_filter = get_img_filter(sp_sview, &sp_samp->base, PIPE_TEX_FILTER_LINEAR, true);
+ } else if (sp_sview->pot2d & sp_samp->min_mag_equal_repeat_linear) {
mip_filter = mip_filter_linear_2d_linear_repeat_POT;
}
else {
mip_filter = sp_samp->mip_filter;
- min_img_filter = get_img_filter(sp_sview, &sp_samp->base, sp_samp->min_img_filter);
+ min_img_filter = get_img_filter(sp_sview, &sp_samp->base, sp_samp->min_img_filter, false);
if (sp_samp->min_mag_equal) {
mag_img_filter = min_img_filter;
}
else {
- mag_img_filter = get_img_filter(sp_sview, &sp_samp->base, sp_samp->base.mag_img_filter);
+ mag_img_filter = get_img_filter(sp_sview, &sp_samp->base, sp_samp->base.mag_img_filter, false);
}
}
mip_filter(sp_sview, sp_samp, min_img_filter, mag_img_filter,
- s, t, p, c0, lod, control, rgba);
+ s, t, p, c0, lod, filt_args, rgba);
if (sp_samp->base.compare_mode != PIPE_TEX_COMPARE_NONE) {
- sample_compare(sp_sview, sp_samp, s, t, p, c0, lod, control, rgba);
+ sample_compare(sp_sview, sp_samp, s, t, p, c0, lod, filt_args->control, rgba);
}
- if (sp_sview->need_swizzle) {
+ if (sp_sview->need_swizzle && filt_args->control != tgsi_sampler_gather) {
float rgba_temp[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
memcpy(rgba_temp, rgba, sizeof(rgba_temp));
do_swizzling(&sp_sview->base, rgba_temp, rgba);
@@ -2818,7 +2944,7 @@ sample_cube(struct sp_sampler_view *sp_sview,
const float p[TGSI_QUAD_SIZE],
const float c0[TGSI_QUAD_SIZE],
const float c1[TGSI_QUAD_SIZE],
- enum tgsi_sampler_control control,
+ const struct filter_args *filt_args,
float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
{
unsigned j;
@@ -2896,7 +3022,7 @@ sample_cube(struct sp_sampler_view *sp_sview,
}
}
- sample_mip(sp_sview, sp_samp, ssss, tttt, pppp, c0, c1, control, rgba);
+ sample_mip(sp_sview, sp_samp, ssss, tttt, pppp, c0, c1, filt_args, rgba);
}
@@ -2907,7 +3033,7 @@ sp_get_dims(struct sp_sampler_view *sp_sview, int level,
const struct pipe_sampler_view *view = &sp_sview->base;
const struct pipe_resource *texture = view->texture;
- if (texture->target == PIPE_BUFFER) {
+ if (view->target == PIPE_BUFFER) {
dims[0] = (view->u.buf.last_element - view->u.buf.first_element) + 1;
/* the other values are undefined, but let's avoid potential valgrind
* warnings.
@@ -2924,7 +3050,7 @@ sp_get_dims(struct sp_sampler_view *sp_sview, int level,
dims[3] = view->u.tex.last_level - view->u.tex.first_level + 1;
dims[0] = u_minify(texture->width0, level);
- switch(texture->target) {
+ switch (view->target) {
case PIPE_TEXTURE_1D_ARRAY:
dims[1] = view->u.tex.last_layer - view->u.tex.first_layer + 1;
/* fallthrough */
@@ -2975,13 +3101,16 @@ sp_get_texels(struct sp_sampler_view *sp_sview,
addr.value = 0;
/* TODO write a better test for LOD */
- addr.bits.level = lod[0];
+ addr.bits.level = sp_sview->base.target == PIPE_BUFFER ? 0 :
+ CLAMP(lod[0] + sp_sview->base.u.tex.first_level,
+ sp_sview->base.u.tex.first_level,
+ sp_sview->base.u.tex.last_level);
width = u_minify(texture->width0, addr.bits.level);
height = u_minify(texture->height0, addr.bits.level);
depth = u_minify(texture->depth0, addr.bits.level);
- switch(texture->target) {
+ switch (sp_sview->base.target) {
case PIPE_BUFFER:
case PIPE_TEXTURE_1D:
for (j = 0; j < TGSI_QUAD_SIZE; j++) {
@@ -2995,7 +3124,8 @@ sp_get_texels(struct sp_sampler_view *sp_sview,
case PIPE_TEXTURE_1D_ARRAY:
for (j = 0; j < TGSI_QUAD_SIZE; j++) {
int x = CLAMP(v_i[j] + offset[0], 0, width - 1);
- int y = CLAMP(v_j[j], sp_sview->base.u.tex.first_layer, sp_sview->base.u.tex.last_layer);
+ int y = CLAMP(v_j[j], sp_sview->base.u.tex.first_layer,
+ sp_sview->base.u.tex.last_layer);
tx = get_texel_2d_no_border(sp_sview, addr, x, y);
for (c = 0; c < 4; c++) {
rgba[c][j] = tx[c];
@@ -3017,7 +3147,8 @@ sp_get_texels(struct sp_sampler_view *sp_sview,
for (j = 0; j < TGSI_QUAD_SIZE; j++) {
int x = CLAMP(v_i[j] + offset[0], 0, width - 1);
int y = CLAMP(v_j[j] + offset[1], 0, height - 1);
- int layer = CLAMP(v_k[j], sp_sview->base.u.tex.first_layer, sp_sview->base.u.tex.last_layer);
+ int layer = CLAMP(v_k[j], sp_sview->base.u.tex.first_layer,
+ sp_sview->base.u.tex.last_layer);
tx = get_texel_3d_no_border(sp_sview, addr, x, y, layer);
for (c = 0; c < 4; c++) {
rgba[c][j] = tx[c];
@@ -3140,7 +3271,7 @@ softpipe_get_lambda_func(const struct pipe_sampler_view *view, unsigned shader)
if (shader != PIPE_SHADER_FRAGMENT)
return compute_lambda_vert;
- switch (view->texture->target) {
+ switch (view->target) {
case PIPE_BUFFER:
case PIPE_TEXTURE_1D:
case PIPE_TEXTURE_1D_ARRAY:
@@ -3176,19 +3307,49 @@ softpipe_create_sampler_view(struct pipe_context *pipe,
pipe_resource_reference(&view->texture, resource);
view->context = pipe;
+#ifdef DEBUG
+ /*
+ * This is possibly too lenient, but the primary reason is just
+ * to catch state trackers which forget to initialize this, so
+ * it only catches clearly impossible view targets.
+ */
+ if (view->target != resource->target) {
+ if (view->target == PIPE_TEXTURE_1D)
+ assert(resource->target == PIPE_TEXTURE_1D_ARRAY);
+ else if (view->target == PIPE_TEXTURE_1D_ARRAY)
+ assert(resource->target == PIPE_TEXTURE_1D);
+ else if (view->target == PIPE_TEXTURE_2D)
+ assert(resource->target == PIPE_TEXTURE_2D_ARRAY ||
+ resource->target == PIPE_TEXTURE_CUBE ||
+ resource->target == PIPE_TEXTURE_CUBE_ARRAY);
+ else if (view->target == PIPE_TEXTURE_2D_ARRAY)
+ assert(resource->target == PIPE_TEXTURE_2D ||
+ resource->target == PIPE_TEXTURE_CUBE ||
+ resource->target == PIPE_TEXTURE_CUBE_ARRAY);
+ else if (view->target == PIPE_TEXTURE_CUBE)
+ assert(resource->target == PIPE_TEXTURE_CUBE_ARRAY ||
+ resource->target == PIPE_TEXTURE_2D_ARRAY);
+ else if (view->target == PIPE_TEXTURE_CUBE_ARRAY)
+ assert(resource->target == PIPE_TEXTURE_CUBE ||
+ resource->target == PIPE_TEXTURE_2D_ARRAY);
+ else
+ assert(0);
+ }
+#endif
+
if (any_swizzle(view)) {
sview->need_swizzle = TRUE;
}
- if (resource->target == PIPE_TEXTURE_CUBE ||
- resource->target == PIPE_TEXTURE_CUBE_ARRAY)
+ if (view->target == PIPE_TEXTURE_CUBE ||
+ view->target == PIPE_TEXTURE_CUBE_ARRAY)
sview->get_samples = sample_cube;
else {
sview->get_samples = sample_mip;
}
sview->pot2d = spr->pot &&
- (resource->target == PIPE_TEXTURE_2D ||
- resource->target == PIPE_TEXTURE_RECT);
+ (view->target == PIPE_TEXTURE_2D ||
+ view->target == PIPE_TEXTURE_RECT);
sview->xpot = util_logbase2( resource->width0 );
sview->ypot = util_logbase2( resource->height0 );
@@ -3230,7 +3391,7 @@ sp_tgsi_get_samples(struct tgsi_sampler *tgsi_sampler,
float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
{
struct sp_tgsi_sampler *sp_samp = (struct sp_tgsi_sampler *)tgsi_sampler;
-
+ struct filter_args filt_args;
assert(sview_index < PIPE_MAX_SHADER_SAMPLER_VIEWS);
assert(sampler_index < PIPE_MAX_SAMPLERS);
assert(sp_samp->sp_sampler[sampler_index]);
@@ -3244,9 +3405,12 @@ sp_tgsi_get_samples(struct tgsi_sampler *tgsi_sampler,
}
return;
}
+
+ filt_args.control = control;
+ filt_args.offset = offset;
sp_samp->sp_sview[sview_index].get_samples(&sp_samp->sp_sview[sview_index],
sp_samp->sp_sampler[sampler_index],
- s, t, p, c0, lod, control, rgba);
+ s, t, p, c0, lod, &filt_args, rgba);
}
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.h b/src/gallium/drivers/softpipe/sp_tex_sample.h
index 00a97c5186b..7d1aafc4473 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.h
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.h
@@ -38,10 +38,12 @@ struct sp_sampler;
typedef void (*wrap_nearest_func)(float s,
unsigned size,
+ int offset,
int *icoord);
typedef void (*wrap_linear_func)(float s,
unsigned size,
+ int offset,
int *icoord0,
int *icoord1,
float *w);
@@ -51,15 +53,27 @@ typedef float (*compute_lambda_func)(const struct sp_sampler_view *sp_sview,
const float t[TGSI_QUAD_SIZE],
const float p[TGSI_QUAD_SIZE]);
+struct img_filter_args {
+ float s;
+ float t;
+ float p;
+ unsigned level;
+ unsigned face_id;
+ const int8_t *offset;
+ bool gather_only;
+ int gather_comp;
+};
+
typedef void (*img_filter_func)(struct sp_sampler_view *sp_sview,
struct sp_sampler *sp_samp,
- float s,
- float t,
- float p,
- unsigned level,
- unsigned face_id,
+ const struct img_filter_args *args,
float *rgba);
+struct filter_args {
+ enum tgsi_sampler_control control;
+ const int8_t *offset;
+};
+
typedef void (*mip_filter_func)(struct sp_sampler_view *sp_sview,
struct sp_sampler *sp_samp,
img_filter_func min_filter,
@@ -69,7 +83,7 @@ typedef void (*mip_filter_func)(struct sp_sampler_view *sp_sview,
const float p[TGSI_QUAD_SIZE],
const float c0[TGSI_QUAD_SIZE],
const float lod[TGSI_QUAD_SIZE],
- enum tgsi_sampler_control control,
+ const struct filter_args *args,
float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]);
@@ -80,7 +94,7 @@ typedef void (*filter_func)(struct sp_sampler_view *sp_sview,
const float p[TGSI_QUAD_SIZE],
const float c0[TGSI_QUAD_SIZE],
const float lod[TGSI_QUAD_SIZE],
- enum tgsi_sampler_control control,
+ const struct filter_args *args,
float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]);
diff --git a/src/gallium/drivers/softpipe/sp_tex_tile_cache.c b/src/gallium/drivers/softpipe/sp_tex_tile_cache.c
index ab8ba60849a..4a421a8f882 100644
--- a/src/gallium/drivers/softpipe/sp_tex_tile_cache.c
+++ b/src/gallium/drivers/softpipe/sp_tex_tile_cache.c
@@ -151,7 +151,7 @@ sp_tex_tile_cache_set_sampler_view(struct softpipe_tex_tile_cache *tc,
tc->entries[i].addr.bits.invalid = 1;
}
- tc->tex_face = -1; /* any invalid value here */
+ tc->tex_z = -1; /* any invalid value here */
}
}
@@ -172,7 +172,7 @@ sp_flush_tex_tile_cache(struct softpipe_tex_tile_cache *tc)
for (pos = 0; pos < Elements(tc->entries); pos++) {
tc->entries[pos].addr.bits.invalid = 1;
}
- tc->tex_face = -1;
+ tc->tex_z = -1;
}
}
@@ -190,8 +190,7 @@ tex_cache_pos( union tex_tile_address addr )
{
uint entry = (addr.bits.x +
addr.bits.y * 9 +
- addr.bits.z * 3 +
- addr.bits.face +
+ addr.bits.z +
addr.bits.level * 7);
return entry % NUM_TEX_TILE_ENTRIES;
@@ -226,7 +225,6 @@ sp_find_cached_tile_tex(struct softpipe_tex_tile_cache *tc,
/* check if we need to get a new transfer */
if (!tc->tex_trans ||
- tc->tex_face != addr.bits.face ||
tc->tex_level != addr.bits.level ||
tc->tex_z != addr.bits.z) {
/* get new transfer (view into texture) */
@@ -245,7 +243,7 @@ sp_find_cached_tile_tex(struct softpipe_tex_tile_cache *tc,
}
else {
height = u_minify(tc->texture->height0, addr.bits.level);
- layer = addr.bits.face + addr.bits.z;
+ layer = addr.bits.z;
}
tc->tex_trans_map =
@@ -255,7 +253,6 @@ sp_find_cached_tile_tex(struct softpipe_tex_tile_cache *tc,
PIPE_TRANSFER_READ | PIPE_TRANSFER_UNSYNCHRONIZED,
0, 0, width, height, &tc->tex_trans);
- tc->tex_face = addr.bits.face;
tc->tex_level = addr.bits.level;
tc->tex_z = addr.bits.z;
}
diff --git a/src/gallium/drivers/softpipe/sp_tex_tile_cache.h b/src/gallium/drivers/softpipe/sp_tex_tile_cache.h
index 4eb42460552..2233effc439 100644
--- a/src/gallium/drivers/softpipe/sp_tex_tile_cache.h
+++ b/src/gallium/drivers/softpipe/sp_tex_tile_cache.h
@@ -55,7 +55,6 @@ union tex_tile_address {
unsigned x:TEX_ADDR_BITS; /* 16K / TILE_SIZE */
unsigned y:TEX_ADDR_BITS; /* 16K / TILE_SIZE */
unsigned z:TEX_Z_BITS; /* 16K -- z not tiled */
- unsigned face:3;
unsigned level:4;
unsigned invalid:1;
} bits;
@@ -94,7 +93,7 @@ struct softpipe_tex_tile_cache
struct pipe_transfer *tex_trans;
void *tex_trans_map;
- int tex_face, tex_level, tex_z;
+ int tex_level, tex_z;
unsigned swizzle_r;
unsigned swizzle_g;
@@ -141,7 +140,6 @@ tex_tile_address( unsigned x,
addr.bits.x = x / TEX_TILE_SIZE;
addr.bits.y = y / TEX_TILE_SIZE;
addr.bits.z = z;
- addr.bits.face = face;
addr.bits.level = level;
return addr;
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index b75f0386449..56e486786df 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -308,6 +308,7 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
return 1;
case PIPE_CAP_UMA:
case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
+ case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
return 0;
}
@@ -376,6 +377,7 @@ static int svga_get_shader_param(struct pipe_screen *screen, unsigned shader, en
case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+ case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
return 0;
}
/* If we get here, we failed to handle a cap above */
@@ -433,6 +435,7 @@ static int svga_get_shader_param(struct pipe_screen *screen, unsigned shader, en
case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+ case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
return 0;
}
/* If we get here, we failed to handle a cap above */
diff --git a/src/gallium/drivers/svga/svga_tgsi_insn.c b/src/gallium/drivers/svga/svga_tgsi_insn.c
index 7a12b52e2dd..bac956066a5 100644
--- a/src/gallium/drivers/svga/svga_tgsi_insn.c
+++ b/src/gallium/drivers/svga/svga_tgsi_insn.c
@@ -1900,7 +1900,7 @@ emit_tex(struct svga_shader_emitter *emit,
emit->key.fkey.tex[unit].swizzle_b != PIPE_SWIZZLE_BLUE ||
emit->key.fkey.tex[unit].swizzle_a != PIPE_SWIZZLE_ALPHA);
- boolean saturate = insn->Instruction.Saturate != TGSI_SAT_NONE;
+ boolean saturate = insn->Instruction.Saturate;
/* If doing compare processing or tex swizzle or saturation, we need to put
* the fetched color into a temporary so it can be used as a source later on.
diff --git a/src/gallium/drivers/trace/tr_context.c b/src/gallium/drivers/trace/tr_context.c
index 0b56517e696..0013c963e7a 100644
--- a/src/gallium/drivers/trace/tr_context.c
+++ b/src/gallium/drivers/trace/tr_context.c
@@ -553,6 +553,8 @@ trace_context_delete_depth_stencil_alpha_state(struct pipe_context *_pipe,
TRACE_SHADER_STATE(fs)
TRACE_SHADER_STATE(vs)
TRACE_SHADER_STATE(gs)
+TRACE_SHADER_STATE(tcs)
+TRACE_SHADER_STATE(tes)
#undef TRACE_SHADER_STATE
@@ -1508,6 +1510,23 @@ static void trace_context_memory_barrier(struct pipe_context *_context,
}
+static void trace_context_set_tess_state(struct pipe_context *_context,
+ const float default_outer_level[4],
+ const float default_inner_level[2])
+{
+ struct trace_context *tr_context = trace_context(_context);
+ struct pipe_context *context = tr_context->pipe;
+
+ trace_dump_call_begin("pipe_context", "set_tess_state");
+ trace_dump_arg(ptr, context);
+ trace_dump_arg_array(float, default_outer_level, 4);
+ trace_dump_arg_array(float, default_inner_level, 2);
+ trace_dump_call_end();
+
+ context->set_tess_state(context, default_outer_level, default_inner_level);
+}
+
+
static const struct debug_named_value rbug_blocker_flags[] = {
{"before", 1, NULL},
{"after", 2, NULL},
@@ -1566,6 +1585,12 @@ trace_context_create(struct trace_screen *tr_scr,
TR_CTX_INIT(create_gs_state);
TR_CTX_INIT(bind_gs_state);
TR_CTX_INIT(delete_gs_state);
+ TR_CTX_INIT(create_tcs_state);
+ TR_CTX_INIT(bind_tcs_state);
+ TR_CTX_INIT(delete_tcs_state);
+ TR_CTX_INIT(create_tes_state);
+ TR_CTX_INIT(bind_tes_state);
+ TR_CTX_INIT(delete_tes_state);
TR_CTX_INIT(create_vertex_elements_state);
TR_CTX_INIT(bind_vertex_elements_state);
TR_CTX_INIT(delete_vertex_elements_state);
@@ -1597,6 +1622,7 @@ trace_context_create(struct trace_screen *tr_scr,
TR_CTX_INIT(flush);
TR_CTX_INIT(texture_barrier);
TR_CTX_INIT(memory_barrier);
+ TR_CTX_INIT(set_tess_state);
TR_CTX_INIT(transfer_map);
TR_CTX_INIT(transfer_unmap);
diff --git a/src/gallium/drivers/trace/tr_dump_state.c b/src/gallium/drivers/trace/tr_dump_state.c
index 71273380434..9bf4a722d80 100644
--- a/src/gallium/drivers/trace/tr_dump_state.c
+++ b/src/gallium/drivers/trace/tr_dump_state.c
@@ -709,6 +709,8 @@ void trace_dump_draw_info(const struct pipe_draw_info *state)
trace_dump_member(uint, state, start_instance);
trace_dump_member(uint, state, instance_count);
+ trace_dump_member(uint, state, vertices_per_patch);
+
trace_dump_member(int, state, index_bias);
trace_dump_member(uint, state, min_index);
trace_dump_member(uint, state, max_index);
diff --git a/src/gallium/drivers/trace/tr_public.h b/src/gallium/drivers/trace/tr_public.h
index aee4937dd4f..b03133f8d97 100644
--- a/src/gallium/drivers/trace/tr_public.h
+++ b/src/gallium/drivers/trace/tr_public.h
@@ -28,6 +28,8 @@
#ifndef TR_PUBLIC_H
#define TR_PUBLIC_H
+#include "pipe/p_compiler.h"
+
#ifdef __cplusplus
extern "C" {
#endif
diff --git a/src/gallium/drivers/vc4/kernel/Makefile.am b/src/gallium/drivers/vc4/Android.mk
index 1ae5f1c2e83..f42a152aa8c 100644
--- a/src/gallium/drivers/vc4/kernel/Makefile.am
+++ b/src/gallium/drivers/vc4/Android.mk
@@ -1,4 +1,4 @@
-# Copyright © 2014 Broadcom
+# Copyright (C) 2014 Emil Velikov <[email protected]>
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
@@ -7,34 +7,31 @@
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
-# The above copyright notice and this permission notice (including the next
-# paragraph) shall be included in all copies or substantial portions of the
-# Software.
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-# IN THE SOFTWARE.
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
-include Makefile.sources
-include $(top_srcdir)/src/gallium/Automake.inc
+LOCAL_PATH := $(call my-dir)
-if USE_VC4_SIMULATOR
-SIM_CFLAGS = -DUSE_VC4_SIMULATOR=1
-endif
+# get C_SOURCES
+include $(LOCAL_PATH)/Makefile.sources
-AM_CFLAGS = \
- $(LIBDRM_CFLAGS) \
- $(GALLIUM_DRIVER_CFLAGS) \
- $(SIM_CFLAGS) \
- -I$(top_srcdir)/src/mesa/ \
- -I$(srcdir)/../ \
- $()
+include $(CLEAR_VARS)
-noinst_LTLIBRARIES = libvc4_kernel.la
+LOCAL_SRC_FILES := \
+ $(C_SOURCES)
-libvc4_kernel_la_SOURCES = $(C_SOURCES)
-libvc4_kernel_la_LDFLAGS = $(SIM_LDFLAGS)
+LOCAL_SHARED_LIBRARIES := libdrm
+# We need libmesa_glsl to get NIR's generated include directories.
+LOCAL_STATIC_LIBRARIES := libmesa_glsl
+LOCAL_MODULE := libmesa_pipe_vc4
+
+include $(GALLIUM_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
diff --git a/src/gallium/drivers/vc4/Makefile.am b/src/gallium/drivers/vc4/Makefile.am
index 3fc591f10c1..3f62ce21a9f 100644
--- a/src/gallium/drivers/vc4/Makefile.am
+++ b/src/gallium/drivers/vc4/Makefile.am
@@ -19,7 +19,7 @@
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
-SUBDIRS = kernel
+AUTOMAKE_OPTIONS = subdir-objects
include Makefile.sources
include $(top_srcdir)/src/gallium/Automake.inc
@@ -39,5 +39,5 @@ AM_CFLAGS = \
noinst_LTLIBRARIES = libvc4.la
libvc4_la_SOURCES = $(C_SOURCES)
-libvc4_la_LIBADD = $(SIM_LIB) kernel/libvc4_kernel.la
+libvc4_la_LIBADD = $(SIM_LIB)
libvc4_la_LDFLAGS = $(SIM_LDFLAGS)
diff --git a/src/gallium/drivers/vc4/Makefile.sources b/src/gallium/drivers/vc4/Makefile.sources
index 49474df3548..1eb029e67e7 100644
--- a/src/gallium/drivers/vc4/Makefile.sources
+++ b/src/gallium/drivers/vc4/Makefile.sources
@@ -1,4 +1,10 @@
C_SOURCES := \
+ kernel/vc4_drv.h \
+ kernel/vc4_gem.c \
+ kernel/vc4_packet.h \
+ kernel/vc4_render_cl.c \
+ kernel/vc4_validate.c \
+ kernel/vc4_validate_shaders.c \
vc4_blit.c \
vc4_bufmgr.c \
vc4_bufmgr.h \
@@ -20,7 +26,6 @@ C_SOURCES := \
vc4_opt_dead_code.c \
vc4_opt_small_immediates.c \
vc4_opt_vpm_writes.c \
- vc4_packet.h \
vc4_program.c \
vc4_qir.c \
vc4_qir_lower_uniforms.c \
diff --git a/src/gallium/drivers/vc4/kernel/Makefile.sources b/src/gallium/drivers/vc4/kernel/Makefile.sources
deleted file mode 100644
index 7d17a898ebf..00000000000
--- a/src/gallium/drivers/vc4/kernel/Makefile.sources
+++ /dev/null
@@ -1,6 +0,0 @@
-C_SOURCES := \
- vc4_drv.h \
- vc4_gem.c \
- vc4_validate.c \
- vc4_validate_shaders.c \
- $()
diff --git a/src/gallium/drivers/vc4/kernel/vc4_drv.h b/src/gallium/drivers/vc4/kernel/vc4_drv.h
index 325f944bf25..1fd8aa9fb28 100644
--- a/src/gallium/drivers/vc4/kernel/vc4_drv.h
+++ b/src/gallium/drivers/vc4/kernel/vc4_drv.h
@@ -28,8 +28,6 @@
enum vc4_bo_mode {
VC4_MODE_UNDECIDED,
- VC4_MODE_TILE_ALLOC,
- VC4_MODE_TSDA,
VC4_MODE_RENDER,
VC4_MODE_SHADER,
};
@@ -52,6 +50,11 @@ struct vc4_exec_info {
struct vc4_bo_exec_state *bo;
uint32_t bo_count;
+ /* List of other BOs used in the job that need to be released
+ * once the job is complete.
+ */
+ struct list_head unref_list;
+
/* Current unvalidated indices into @bo loaded by the non-hardware
* VC4_PACKET_GEM_HANDLES.
*/
@@ -83,14 +86,11 @@ struct vc4_exec_info {
uint32_t shader_state_count;
bool found_tile_binning_mode_config_packet;
- bool found_tile_rendering_mode_config_packet;
bool found_start_tile_binning_packet;
bool found_increment_semaphore_packet;
- bool found_wait_on_semaphore_packet;
uint8_t bin_tiles_x, bin_tiles_y;
- uint32_t fb_width, fb_height;
- uint32_t tile_alloc_init_block_size;
- struct drm_gem_cma_object *tile_alloc_bo;
+ struct drm_gem_cma_object *tile_bo;
+ uint32_t tile_alloc_offset;
/**
* Computed addresses pointing into exec_bo where we start the
@@ -157,13 +157,10 @@ struct vc4_validated_shader_info
/* vc4_validate.c */
int
-vc4_validate_cl(struct drm_device *dev,
- void *validated,
- void *unvalidated,
- uint32_t len,
- bool is_bin,
- bool has_bin,
- struct vc4_exec_info *exec);
+vc4_validate_bin_cl(struct drm_device *dev,
+ void *validated,
+ void *unvalidated,
+ struct vc4_exec_info *exec);
int
vc4_validate_shader_recs(struct drm_device *dev, struct vc4_exec_info *exec);
@@ -171,4 +168,16 @@ vc4_validate_shader_recs(struct drm_device *dev, struct vc4_exec_info *exec);
struct vc4_validated_shader_info *
vc4_validate_shader(struct drm_gem_cma_object *shader_obj);
+bool vc4_use_bo(struct vc4_exec_info *exec,
+ uint32_t hindex,
+ enum vc4_bo_mode mode,
+ struct drm_gem_cma_object **obj);
+
+int vc4_get_rcl(struct drm_device *dev, struct vc4_exec_info *exec);
+
+bool vc4_check_tex_size(struct vc4_exec_info *exec,
+ struct drm_gem_cma_object *fbo,
+ uint32_t offset, uint8_t tiling_format,
+ uint32_t width, uint32_t height, uint8_t cpp);
+
#endif /* VC4_DRV_H */
diff --git a/src/gallium/drivers/vc4/kernel/vc4_gem.c b/src/gallium/drivers/vc4/kernel/vc4_gem.c
index ac29ab35dbc..e4b7fea5968 100644
--- a/src/gallium/drivers/vc4/kernel/vc4_gem.c
+++ b/src/gallium/drivers/vc4/kernel/vc4_gem.c
@@ -25,24 +25,26 @@
#include "vc4_drv.h"
-int
-vc4_cl_validate(struct drm_device *dev, struct vc4_exec_info *exec)
+/*
+ * Copies in the user's binning command list and generates the validated bin
+ * CL, along with associated data (shader records, uniforms).
+ */
+static int
+vc4_get_bcl(struct drm_device *dev, struct vc4_exec_info *exec)
{
struct drm_vc4_submit_cl *args = exec->args;
void *temp = NULL;
- void *bin, *render;
+ void *bin;
int ret = 0;
uint32_t bin_offset = 0;
- uint32_t render_offset = bin_offset + args->bin_cl_size;
- uint32_t shader_rec_offset = roundup(render_offset +
- args->render_cl_size, 16);
+ uint32_t shader_rec_offset = roundup(bin_offset + args->bin_cl_size,
+ 16);
uint32_t uniforms_offset = shader_rec_offset + args->shader_rec_size;
uint32_t exec_size = uniforms_offset + args->uniforms_size;
uint32_t temp_size = exec_size + (sizeof(struct vc4_shader_state) *
args->shader_rec_count);
- if (shader_rec_offset < render_offset ||
- uniforms_offset < shader_rec_offset ||
+ if (uniforms_offset < shader_rec_offset ||
exec_size < uniforms_offset ||
args->shader_rec_count >= (UINT_MAX /
sizeof(struct vc4_shader_state)) ||
@@ -66,7 +68,6 @@ vc4_cl_validate(struct drm_device *dev, struct vc4_exec_info *exec)
goto fail;
}
bin = temp + bin_offset;
- render = temp + render_offset;
exec->shader_rec_u = temp + shader_rec_offset;
exec->uniforms_u = temp + uniforms_offset;
exec->shader_state = temp + exec_size;
@@ -80,14 +81,6 @@ vc4_cl_validate(struct drm_device *dev, struct vc4_exec_info *exec)
goto fail;
}
- ret = copy_from_user(render,
- (void __user *)(uintptr_t)args->render_cl,
- args->render_cl_size);
- if (ret) {
- DRM_ERROR("Failed to copy in render cl\n");
- goto fail;
- }
-
ret = copy_from_user(exec->shader_rec_u,
(void __user *)(uintptr_t)args->shader_rec,
args->shader_rec_size);
@@ -114,8 +107,10 @@ vc4_cl_validate(struct drm_device *dev, struct vc4_exec_info *exec)
}
#endif
+ list_addtail(&to_vc4_bo(&exec->exec_bo->base)->unref_head,
+ &exec->unref_list);
+
exec->ct0ca = exec->exec_bo->paddr + bin_offset;
- exec->ct1ca = exec->exec_bo->paddr + render_offset;
exec->shader_rec_v = exec->exec_bo->vaddr + shader_rec_offset;
exec->shader_rec_p = exec->exec_bo->paddr + shader_rec_offset;
@@ -125,23 +120,10 @@ vc4_cl_validate(struct drm_device *dev, struct vc4_exec_info *exec)
exec->uniforms_p = exec->exec_bo->paddr + uniforms_offset;
exec->uniforms_size = args->uniforms_size;
- ret = vc4_validate_cl(dev,
- exec->exec_bo->vaddr + bin_offset,
- bin,
- args->bin_cl_size,
- true,
- args->bin_cl_size != 0,
- exec);
- if (ret)
- goto fail;
-
- ret = vc4_validate_cl(dev,
- exec->exec_bo->vaddr + render_offset,
- render,
- args->render_cl_size,
- false,
- args->bin_cl_size != 0,
- exec);
+ ret = vc4_validate_bin_cl(dev,
+ exec->exec_bo->vaddr + bin_offset,
+ bin,
+ exec);
if (ret)
goto fail;
@@ -152,4 +134,25 @@ fail:
return ret;
}
+int
+vc4_cl_validate(struct drm_device *dev, struct vc4_exec_info *exec)
+{
+ int ret = 0;
+
+ if (exec->args->bin_cl_size != 0) {
+ ret = vc4_get_bcl(dev, exec);
+ if (ret)
+ goto fail;
+ } else {
+ exec->ct0ca = exec->ct0ea = 0;
+ }
+
+ ret = vc4_get_rcl(dev, exec);
+ if (ret)
+ goto fail;
+
+fail:
+ return ret;
+}
+
#endif /* USE_VC4_SIMULATOR */
diff --git a/src/gallium/drivers/vc4/vc4_packet.h b/src/gallium/drivers/vc4/kernel/vc4_packet.h
index 181f2e01dc9..88cfc0fa9f0 100644
--- a/src/gallium/drivers/vc4/vc4_packet.h
+++ b/src/gallium/drivers/vc4/kernel/vc4_packet.h
@@ -81,6 +81,38 @@ enum vc4_packet {
VC4_PACKET_GEM_HANDLES = 254,
} __attribute__ ((__packed__));
+#define VC4_PACKET_HALT_SIZE 1
+#define VC4_PACKET_NOP_SIZE 1
+#define VC4_PACKET_FLUSH_SIZE 1
+#define VC4_PACKET_FLUSH_ALL_SIZE 1
+#define VC4_PACKET_START_TILE_BINNING_SIZE 1
+#define VC4_PACKET_INCREMENT_SEMAPHORE_SIZE 1
+#define VC4_PACKET_WAIT_ON_SEMAPHORE_SIZE 1
+#define VC4_PACKET_BRANCH_TO_SUB_LIST_SIZE 5
+#define VC4_PACKET_STORE_MS_TILE_BUFFER_SIZE 1
+#define VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF_SIZE 1
+#define VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE 7
+#define VC4_PACKET_LOAD_TILE_BUFFER_GENERAL_SIZE 7
+#define VC4_PACKET_GL_INDEXED_PRIMITIVE_SIZE 14
+#define VC4_PACKET_GL_ARRAY_PRIMITIVE_SIZE 10
+#define VC4_PACKET_PRIMITIVE_LIST_FORMAT_SIZE 2
+#define VC4_PACKET_GL_SHADER_STATE_SIZE 5
+#define VC4_PACKET_NV_SHADER_STATE_SIZE 5
+#define VC4_PACKET_CONFIGURATION_BITS_SIZE 4
+#define VC4_PACKET_FLAT_SHADE_FLAGS_SIZE 5
+#define VC4_PACKET_POINT_SIZE_SIZE 5
+#define VC4_PACKET_LINE_WIDTH_SIZE 5
+#define VC4_PACKET_RHT_X_BOUNDARY_SIZE 3
+#define VC4_PACKET_DEPTH_OFFSET_SIZE 5
+#define VC4_PACKET_CLIP_WINDOW_SIZE 9
+#define VC4_PACKET_VIEWPORT_OFFSET_SIZE 5
+#define VC4_PACKET_CLIPPER_XY_SCALING_SIZE 9
+#define VC4_PACKET_CLIPPER_Z_SCALING_SIZE 9
+#define VC4_PACKET_TILE_BINNING_MODE_CONFIG_SIZE 16
+#define VC4_PACKET_TILE_RENDERING_MODE_CONFIG_SIZE 11
+#define VC4_PACKET_CLEAR_COLORS_SIZE 14
+#define VC4_PACKET_TILE_COORDINATES_SIZE 3
+#define VC4_PACKET_GEM_HANDLES_SIZE 9
#define VC4_MASK(high, low) (((1 << ((high) - (low) + 1)) - 1) << (low))
/* Using the GNU statement expression extension */
@@ -117,18 +149,19 @@ enum vc4_packet {
/** @{
*
- * byte 1 of VC4_PACKET_STORE_TILE_BUFFER_GENERAL and
+ * byte 0-1 of VC4_PACKET_STORE_TILE_BUFFER_GENERAL and
* VC4_PACKET_LOAD_TILE_BUFFER_GENERAL
*/
-#define VC4_STORE_TILE_BUFFER_DISABLE_VG_MASK_CLEAR (1 << 7)
-#define VC4_STORE_TILE_BUFFER_DISABLE_ZS_CLEAR (1 << 6)
-#define VC4_STORE_TILE_BUFFER_DISABLE_COLOR_CLEAR (1 << 5)
-#define VC4_STORE_TILE_BUFFER_DISABLE_SWAP (1 << 4)
-
-#define VC4_LOADSTORE_TILE_BUFFER_RGBA8888 (0 << 0)
-#define VC4_LOADSTORE_TILE_BUFFER_BGR565_DITHER (1 << 0)
-#define VC4_LOADSTORE_TILE_BUFFER_BGR565 (2 << 0)
-#define VC4_LOADSTORE_TILE_BUFFER_MASK (3 << 0)
+#define VC4_STORE_TILE_BUFFER_DISABLE_VG_MASK_CLEAR (1 << 15)
+#define VC4_STORE_TILE_BUFFER_DISABLE_ZS_CLEAR (1 << 14)
+#define VC4_STORE_TILE_BUFFER_DISABLE_COLOR_CLEAR (1 << 13)
+#define VC4_STORE_TILE_BUFFER_DISABLE_SWAP (1 << 12)
+
+#define VC4_LOADSTORE_TILE_BUFFER_FORMAT_MASK VC4_MASK(9, 8)
+#define VC4_LOADSTORE_TILE_BUFFER_FORMAT_SHIFT 8
+#define VC4_LOADSTORE_TILE_BUFFER_RGBA8888 0
+#define VC4_LOADSTORE_TILE_BUFFER_BGR565_DITHER 1
+#define VC4_LOADSTORE_TILE_BUFFER_BGR565 2
/** @} */
/** @{
@@ -136,21 +169,24 @@ enum vc4_packet {
* byte 0 of VC4_PACKET_STORE_TILE_BUFFER_GENERAL and
* VC4_PACKET_LOAD_TILE_BUFFER_GENERAL
*/
+#define VC4_STORE_TILE_BUFFER_MODE_MASK VC4_MASK(7, 6)
+#define VC4_STORE_TILE_BUFFER_MODE_SHIFT 6
#define VC4_STORE_TILE_BUFFER_MODE_SAMPLE0 (0 << 6)
#define VC4_STORE_TILE_BUFFER_MODE_DECIMATE_X4 (1 << 6)
#define VC4_STORE_TILE_BUFFER_MODE_DECIMATE_X16 (2 << 6)
/** The values of the field are VC4_TILING_FORMAT_* */
-#define VC4_LOADSTORE_TILE_BUFFER_FORMAT_MASK (3 << 4)
-#define VC4_LOADSTORE_TILE_BUFFER_FORMAT_SHIFT 4
-
-
-#define VC4_LOADSTORE_TILE_BUFFER_NONE (0 << 0)
-#define VC4_LOADSTORE_TILE_BUFFER_COLOR (1 << 0)
-#define VC4_LOADSTORE_TILE_BUFFER_ZS (2 << 0)
-#define VC4_LOADSTORE_TILE_BUFFER_Z (3 << 0)
-#define VC4_LOADSTORE_TILE_BUFFER_VG_MASK (4 << 0)
-#define VC4_LOADSTORE_TILE_BUFFER_FULL (5 << 0)
+#define VC4_LOADSTORE_TILE_BUFFER_TILING_MASK VC4_MASK(5, 4)
+#define VC4_LOADSTORE_TILE_BUFFER_TILING_SHIFT 4
+
+#define VC4_LOADSTORE_TILE_BUFFER_BUFFER_MASK VC4_MASK(2, 0)
+#define VC4_LOADSTORE_TILE_BUFFER_BUFFER_SHIFT 0
+#define VC4_LOADSTORE_TILE_BUFFER_NONE 0
+#define VC4_LOADSTORE_TILE_BUFFER_COLOR 1
+#define VC4_LOADSTORE_TILE_BUFFER_ZS 2
+#define VC4_LOADSTORE_TILE_BUFFER_Z 3
+#define VC4_LOADSTORE_TILE_BUFFER_VG_MASK 4
+#define VC4_LOADSTORE_TILE_BUFFER_FULL 5
/** @} */
#define VC4_INDEX_BUFFER_U8 (0 << 4)
@@ -196,15 +232,19 @@ enum vc4_packet {
/** @{ bits in the last u8 of VC4_PACKET_TILE_BINNING_MODE_CONFIG */
#define VC4_BIN_CONFIG_DB_NON_MS (1 << 7)
-#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_32 (0 << 5)
-#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_64 (1 << 5)
-#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_128 (2 << 5)
-#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_256 (3 << 5)
+#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_MASK VC4_MASK(6, 5)
+#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_SHIFT 5
+#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_32 0
+#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_64 1
+#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_128 2
+#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_256 3
-#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_32 (0 << 3)
-#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_64 (1 << 3)
-#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_128 (2 << 3)
-#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_256 (3 << 3)
+#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_MASK VC4_MASK(4, 3)
+#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_SHIFT 3
+#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_32 0
+#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_64 1
+#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_128 2
+#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_256 3
#define VC4_BIN_CONFIG_AUTO_INIT_TSDA (1 << 2)
#define VC4_BIN_CONFIG_TILE_BUFFER_64BIT (1 << 1)
@@ -219,17 +259,18 @@ enum vc4_packet {
#define VC4_RENDER_CONFIG_ENABLE_VG_MASK (1 << 8)
/** The values of the field are VC4_TILING_FORMAT_* */
-#define VC4_RENDER_CONFIG_MEMORY_FORMAT_MASK (3 << 6)
+#define VC4_RENDER_CONFIG_MEMORY_FORMAT_MASK VC4_MASK(7, 6)
#define VC4_RENDER_CONFIG_MEMORY_FORMAT_SHIFT 6
#define VC4_RENDER_CONFIG_DECIMATE_MODE_1X (0 << 4)
#define VC4_RENDER_CONFIG_DECIMATE_MODE_4X (1 << 4)
#define VC4_RENDER_CONFIG_DECIMATE_MODE_16X (2 << 4)
-#define VC4_RENDER_CONFIG_FORMAT_BGR565_DITHERED (0 << 2)
-#define VC4_RENDER_CONFIG_FORMAT_RGBA8888 (1 << 2)
-#define VC4_RENDER_CONFIG_FORMAT_BGR565 (2 << 2)
-#define VC4_RENDER_CONFIG_FORMAT_MASK (3 << 2)
+#define VC4_RENDER_CONFIG_FORMAT_MASK VC4_MASK(3, 2)
+#define VC4_RENDER_CONFIG_FORMAT_SHIFT 2
+#define VC4_RENDER_CONFIG_FORMAT_BGR565_DITHERED 0
+#define VC4_RENDER_CONFIG_FORMAT_RGBA8888 1
+#define VC4_RENDER_CONFIG_FORMAT_BGR565 2
#define VC4_RENDER_CONFIG_TILE_BUFFER_64BIT (1 << 1)
#define VC4_RENDER_CONFIG_MS_MODE_4X (1 << 0)
diff --git a/src/gallium/drivers/vc4/kernel/vc4_render_cl.c b/src/gallium/drivers/vc4/kernel/vc4_render_cl.c
new file mode 100644
index 00000000000..e2d907ad91f
--- /dev/null
+++ b/src/gallium/drivers/vc4/kernel/vc4_render_cl.c
@@ -0,0 +1,447 @@
+/*
+ * Copyright © 2014-2015 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * DOC: Render command list generation
+ *
+ * In the VC4 driver, render command list generation is performed by the
+ * kernel instead of userspace. We do this because validating a
+ * user-submitted command list is hard to get right and has high CPU overhead,
+ * while the number of valid configurations for render command lists is
+ * actually fairly low.
+ */
+
+#include "vc4_drv.h"
+#include "vc4_packet.h"
+
+struct vc4_rcl_setup {
+ struct drm_gem_cma_object *color_read;
+ struct drm_gem_cma_object *color_ms_write;
+ struct drm_gem_cma_object *zs_read;
+ struct drm_gem_cma_object *zs_write;
+
+ struct drm_gem_cma_object *rcl;
+ u32 next_offset;
+};
+
+static inline void rcl_u8(struct vc4_rcl_setup *setup, u8 val)
+{
+ *(u8 *)(setup->rcl->vaddr + setup->next_offset) = val;
+ setup->next_offset += 1;
+}
+
+static inline void rcl_u16(struct vc4_rcl_setup *setup, u16 val)
+{
+ *(u16 *)(setup->rcl->vaddr + setup->next_offset) = val;
+ setup->next_offset += 2;
+}
+
+static inline void rcl_u32(struct vc4_rcl_setup *setup, u32 val)
+{
+ *(u32 *)(setup->rcl->vaddr + setup->next_offset) = val;
+ setup->next_offset += 4;
+}
+
+
+/*
+ * Emits a no-op STORE_TILE_BUFFER_GENERAL.
+ *
+ * If we emit a PACKET_TILE_COORDINATES, it must be followed by a store of
+ * some sort before another load is triggered.
+ */
+static void vc4_store_before_load(struct vc4_rcl_setup *setup)
+{
+ rcl_u8(setup, VC4_PACKET_STORE_TILE_BUFFER_GENERAL);
+ rcl_u16(setup,
+ VC4_SET_FIELD(VC4_LOADSTORE_TILE_BUFFER_NONE,
+ VC4_LOADSTORE_TILE_BUFFER_BUFFER) |
+ VC4_STORE_TILE_BUFFER_DISABLE_COLOR_CLEAR |
+ VC4_STORE_TILE_BUFFER_DISABLE_ZS_CLEAR |
+ VC4_STORE_TILE_BUFFER_DISABLE_VG_MASK_CLEAR);
+ rcl_u32(setup, 0); /* no address, since we're in None mode */
+}
+
+/*
+ * Emits a PACKET_TILE_COORDINATES if one isn't already pending.
+ *
+ * The tile coordinates packet triggers a pending load if there is one, is
+ * used for clipping during rendering, and determines where loads/stores
+ * happen relative to their base address.
+ */
+static void vc4_tile_coordinates(struct vc4_rcl_setup *setup,
+ uint32_t x, uint32_t y)
+{
+ rcl_u8(setup, VC4_PACKET_TILE_COORDINATES);
+ rcl_u8(setup, x);
+ rcl_u8(setup, y);
+}
+
+static void emit_tile(struct vc4_exec_info *exec,
+ struct vc4_rcl_setup *setup,
+ uint8_t x, uint8_t y, bool first, bool last)
+{
+ bool has_bin = exec->args->bin_cl_size != 0;
+
+ /* Note that the load doesn't actually occur until the
+ * tile coords packet is processed, and only one load
+ * may be outstanding at a time.
+ */
+ if (setup->color_read) {
+ rcl_u8(setup, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL);
+ rcl_u16(setup, exec->args->color_read.bits);
+ rcl_u32(setup,
+ setup->color_read->paddr +
+ exec->args->color_read.offset);
+ }
+
+ if (setup->zs_read) {
+ if (setup->color_read) {
+ /* Exec previous load. */
+ vc4_tile_coordinates(setup, x, y);
+ vc4_store_before_load(setup);
+ }
+
+ rcl_u8(setup, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL);
+ rcl_u16(setup, exec->args->zs_read.bits);
+ rcl_u32(setup,
+ setup->zs_read->paddr + exec->args->zs_read.offset);
+ }
+
+ /* Clipping depends on tile coordinates having been
+ * emitted, so we always need one here.
+ */
+ vc4_tile_coordinates(setup, x, y);
+
+ /* Wait for the binner before jumping to the first
+ * tile's lists.
+ */
+ if (first && has_bin)
+ rcl_u8(setup, VC4_PACKET_WAIT_ON_SEMAPHORE);
+
+ if (has_bin) {
+ rcl_u8(setup, VC4_PACKET_BRANCH_TO_SUB_LIST);
+ rcl_u32(setup, (exec->tile_bo->paddr +
+ exec->tile_alloc_offset +
+ (y * exec->bin_tiles_x + x) * 32));
+ }
+
+ if (setup->zs_write) {
+ rcl_u8(setup, VC4_PACKET_STORE_TILE_BUFFER_GENERAL);
+ rcl_u16(setup, exec->args->zs_write.bits |
+ (setup->color_ms_write ?
+ VC4_STORE_TILE_BUFFER_DISABLE_COLOR_CLEAR : 0));
+ rcl_u32(setup,
+ (setup->zs_write->paddr + exec->args->zs_write.offset) |
+ ((last && !setup->color_ms_write) ?
+ VC4_LOADSTORE_TILE_BUFFER_EOF : 0));
+ }
+
+ if (setup->color_ms_write) {
+ if (setup->zs_write) {
+ /* Reset after previous store */
+ vc4_tile_coordinates(setup, x, y);
+ }
+
+ if (last)
+ rcl_u8(setup, VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF);
+ else
+ rcl_u8(setup, VC4_PACKET_STORE_MS_TILE_BUFFER);
+ }
+}
+
+static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec,
+ struct vc4_rcl_setup *setup)
+{
+ bool has_bin = exec->args->bin_cl_size != 0;
+ uint8_t min_x_tile = exec->args->min_x_tile;
+ uint8_t min_y_tile = exec->args->min_y_tile;
+ uint8_t max_x_tile = exec->args->max_x_tile;
+ uint8_t max_y_tile = exec->args->max_y_tile;
+ uint8_t xtiles = max_x_tile - min_x_tile + 1;
+ uint8_t ytiles = max_y_tile - min_y_tile + 1;
+ uint8_t x, y;
+ uint32_t size, loop_body_size;
+
+ size = VC4_PACKET_TILE_RENDERING_MODE_CONFIG_SIZE;
+ loop_body_size = VC4_PACKET_TILE_COORDINATES_SIZE;
+
+ if (exec->args->flags & VC4_SUBMIT_CL_USE_CLEAR_COLOR) {
+ size += VC4_PACKET_CLEAR_COLORS_SIZE +
+ VC4_PACKET_TILE_COORDINATES_SIZE +
+ VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE;
+ }
+
+ if (setup->color_read) {
+ loop_body_size += (VC4_PACKET_LOAD_TILE_BUFFER_GENERAL_SIZE);
+ }
+ if (setup->zs_read) {
+ if (setup->color_read) {
+ loop_body_size += VC4_PACKET_TILE_COORDINATES_SIZE;
+ loop_body_size += VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE;
+ }
+ loop_body_size += VC4_PACKET_LOAD_TILE_BUFFER_GENERAL_SIZE;
+ }
+
+ if (has_bin) {
+ size += VC4_PACKET_WAIT_ON_SEMAPHORE_SIZE;
+ loop_body_size += VC4_PACKET_BRANCH_TO_SUB_LIST_SIZE;
+ }
+
+ if (setup->zs_write)
+ loop_body_size += VC4_PACKET_LOAD_TILE_BUFFER_GENERAL_SIZE;
+ if (setup->color_ms_write) {
+ if (setup->zs_write)
+ loop_body_size += VC4_PACKET_TILE_COORDINATES_SIZE;
+ loop_body_size += VC4_PACKET_STORE_MS_TILE_BUFFER_SIZE;
+ }
+ size += xtiles * ytiles * loop_body_size;
+
+ setup->rcl = drm_gem_cma_create(dev, size);
+ if (!setup->rcl)
+ return -ENOMEM;
+ list_addtail(&to_vc4_bo(&setup->rcl->base)->unref_head,
+ &exec->unref_list);
+
+ rcl_u8(setup, VC4_PACKET_TILE_RENDERING_MODE_CONFIG);
+ rcl_u32(setup,
+ (setup->color_ms_write ?
+ (setup->color_ms_write->paddr +
+ exec->args->color_ms_write.offset) :
+ 0));
+ rcl_u16(setup, exec->args->width);
+ rcl_u16(setup, exec->args->height);
+ rcl_u16(setup, exec->args->color_ms_write.bits);
+
+ /* The tile buffer gets cleared when the previous tile is stored. If
+ * the clear values changed between frames, then the tile buffer has
+ * stale clear values in it, so we have to do a store in None mode (no
+ * writes) so that we trigger the tile buffer clear.
+ */
+ if (exec->args->flags & VC4_SUBMIT_CL_USE_CLEAR_COLOR) {
+ rcl_u8(setup, VC4_PACKET_CLEAR_COLORS);
+ rcl_u32(setup, exec->args->clear_color[0]);
+ rcl_u32(setup, exec->args->clear_color[1]);
+ rcl_u32(setup, exec->args->clear_z);
+ rcl_u8(setup, exec->args->clear_s);
+
+ vc4_tile_coordinates(setup, 0, 0);
+
+ rcl_u8(setup, VC4_PACKET_STORE_TILE_BUFFER_GENERAL);
+ rcl_u16(setup, VC4_LOADSTORE_TILE_BUFFER_NONE);
+ rcl_u32(setup, 0); /* no address, since we're in None mode */
+ }
+
+ for (y = min_y_tile; y <= max_y_tile; y++) {
+ for (x = min_x_tile; x <= max_x_tile; x++) {
+ bool first = (x == min_x_tile && y == min_y_tile);
+ bool last = (x == max_x_tile && y == max_y_tile);
+ emit_tile(exec, setup, x, y, first, last);
+ }
+ }
+
+ BUG_ON(setup->next_offset != size);
+ exec->ct1ca = setup->rcl->paddr;
+ exec->ct1ea = setup->rcl->paddr + setup->next_offset;
+
+ return 0;
+}
+
+static int vc4_rcl_surface_setup(struct vc4_exec_info *exec,
+ struct drm_gem_cma_object **obj,
+ struct drm_vc4_submit_rcl_surface *surf)
+{
+ uint8_t tiling = VC4_GET_FIELD(surf->bits,
+ VC4_LOADSTORE_TILE_BUFFER_TILING);
+ uint8_t buffer = VC4_GET_FIELD(surf->bits,
+ VC4_LOADSTORE_TILE_BUFFER_BUFFER);
+ uint8_t format = VC4_GET_FIELD(surf->bits,
+ VC4_LOADSTORE_TILE_BUFFER_FORMAT);
+ int cpp;
+
+ if (surf->pad != 0) {
+ DRM_ERROR("Padding unset\n");
+ return -EINVAL;
+ }
+
+ if (surf->hindex == ~0)
+ return 0;
+
+ if (!vc4_use_bo(exec, surf->hindex, VC4_MODE_RENDER, obj))
+ return -EINVAL;
+
+ if (surf->bits & ~(VC4_LOADSTORE_TILE_BUFFER_TILING_MASK |
+ VC4_LOADSTORE_TILE_BUFFER_BUFFER_MASK |
+ VC4_LOADSTORE_TILE_BUFFER_FORMAT_MASK)) {
+ DRM_ERROR("Unknown bits in load/store: 0x%04x\n",
+ surf->bits);
+ return -EINVAL;
+ }
+
+ if (tiling > VC4_TILING_FORMAT_LT) {
+ DRM_ERROR("Bad tiling format\n");
+ return -EINVAL;
+ }
+
+ if (buffer == VC4_LOADSTORE_TILE_BUFFER_ZS) {
+ if (format != 0) {
+ DRM_ERROR("No color format should be set for ZS\n");
+ return -EINVAL;
+ }
+ cpp = 4;
+ } else if (buffer == VC4_LOADSTORE_TILE_BUFFER_COLOR) {
+ switch (format) {
+ case VC4_LOADSTORE_TILE_BUFFER_BGR565:
+ case VC4_LOADSTORE_TILE_BUFFER_BGR565_DITHER:
+ cpp = 2;
+ break;
+ case VC4_LOADSTORE_TILE_BUFFER_RGBA8888:
+ cpp = 4;
+ break;
+ default:
+ DRM_ERROR("Bad tile buffer format\n");
+ return -EINVAL;
+ }
+ } else {
+ DRM_ERROR("Bad load/store buffer %d.\n", buffer);
+ return -EINVAL;
+ }
+
+ if (surf->offset & 0xf) {
+ DRM_ERROR("load/store buffer must be 16b aligned.\n");
+ return -EINVAL;
+ }
+
+ if (!vc4_check_tex_size(exec, *obj, surf->offset, tiling,
+ exec->args->width, exec->args->height, cpp)) {
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int
+vc4_rcl_ms_surface_setup(struct vc4_exec_info *exec,
+ struct drm_gem_cma_object **obj,
+ struct drm_vc4_submit_rcl_surface *surf)
+{
+ uint8_t tiling = VC4_GET_FIELD(surf->bits,
+ VC4_RENDER_CONFIG_MEMORY_FORMAT);
+ uint8_t format = VC4_GET_FIELD(surf->bits,
+ VC4_RENDER_CONFIG_FORMAT);
+ int cpp;
+
+ if (surf->pad != 0) {
+ DRM_ERROR("Padding unset\n");
+ return -EINVAL;
+ }
+
+ if (surf->bits & ~(VC4_RENDER_CONFIG_MEMORY_FORMAT_MASK |
+ VC4_RENDER_CONFIG_FORMAT_MASK)) {
+ DRM_ERROR("Unknown bits in render config: 0x%04x\n",
+ surf->bits);
+ return -EINVAL;
+ }
+
+ if (surf->hindex == ~0)
+ return 0;
+
+ if (!vc4_use_bo(exec, surf->hindex, VC4_MODE_RENDER, obj))
+ return -EINVAL;
+
+ if (tiling > VC4_TILING_FORMAT_LT) {
+ DRM_ERROR("Bad tiling format\n");
+ return -EINVAL;
+ }
+
+ switch (format) {
+ case VC4_RENDER_CONFIG_FORMAT_BGR565_DITHERED:
+ case VC4_RENDER_CONFIG_FORMAT_BGR565:
+ cpp = 2;
+ break;
+ case VC4_RENDER_CONFIG_FORMAT_RGBA8888:
+ cpp = 4;
+ break;
+ default:
+ DRM_ERROR("Bad tile buffer format\n");
+ return -EINVAL;
+ }
+
+ if (!vc4_check_tex_size(exec, *obj, surf->offset, tiling,
+ exec->args->width, exec->args->height, cpp)) {
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int vc4_get_rcl(struct drm_device *dev, struct vc4_exec_info *exec)
+{
+ struct vc4_rcl_setup setup = {0};
+ struct drm_vc4_submit_cl *args = exec->args;
+ bool has_bin = args->bin_cl_size != 0;
+ int ret;
+
+ if (args->min_x_tile > args->max_x_tile ||
+ args->min_y_tile > args->max_y_tile) {
+ DRM_ERROR("Bad render tile set (%d,%d)-(%d,%d)\n",
+ args->min_x_tile, args->min_y_tile,
+ args->max_x_tile, args->max_y_tile);
+ return -EINVAL;
+ }
+
+ if (has_bin &&
+ (args->max_x_tile > exec->bin_tiles_x ||
+ args->max_y_tile > exec->bin_tiles_y)) {
+ DRM_ERROR("Render tiles (%d,%d) outside of bin config (%d,%d)\n",
+ args->max_x_tile, args->max_y_tile,
+ exec->bin_tiles_x, exec->bin_tiles_y);
+ return -EINVAL;
+ }
+
+ ret = vc4_rcl_surface_setup(exec, &setup.color_read, &args->color_read);
+ if (ret)
+ return ret;
+
+ ret = vc4_rcl_ms_surface_setup(exec, &setup.color_ms_write,
+ &args->color_ms_write);
+ if (ret)
+ return ret;
+
+ ret = vc4_rcl_surface_setup(exec, &setup.zs_read, &args->zs_read);
+ if (ret)
+ return ret;
+
+ ret = vc4_rcl_surface_setup(exec, &setup.zs_write, &args->zs_write);
+ if (ret)
+ return ret;
+
+ /* We shouldn't even have the job submitted to us if there's no
+ * surface to write out.
+ */
+ if (!setup.color_ms_write && !setup.zs_write) {
+ DRM_ERROR("RCL requires color or Z/S write\n");
+ return -EINVAL;
+ }
+
+ return vc4_create_rcl_bo(dev, exec, &setup);
+}
diff --git a/src/gallium/drivers/vc4/kernel/vc4_validate.c b/src/gallium/drivers/vc4/kernel/vc4_validate.c
index 2d04a4a7b9a..a0b67a7e50b 100644
--- a/src/gallium/drivers/vc4/kernel/vc4_validate.c
+++ b/src/gallium/drivers/vc4/kernel/vc4_validate.c
@@ -94,7 +94,7 @@ size_is_lt(uint32_t width, uint32_t height, int cpp)
height <= 4 * utile_height(cpp));
}
-static bool
+bool
vc4_use_bo(struct vc4_exec_info *exec,
uint32_t hindex,
enum vc4_bo_mode mode,
@@ -147,33 +147,39 @@ gl_shader_rec_size(uint32_t pointer_bits)
return 36 + attribute_count * 8;
}
-static bool
-check_tex_size(struct vc4_exec_info *exec, struct drm_gem_cma_object *fbo,
- uint32_t offset, uint8_t tiling_format,
- uint32_t width, uint32_t height, uint8_t cpp)
+bool
+vc4_check_tex_size(struct vc4_exec_info *exec, struct drm_gem_cma_object *fbo,
+ uint32_t offset, uint8_t tiling_format,
+ uint32_t width, uint32_t height, uint8_t cpp)
{
uint32_t aligned_width, aligned_height, stride, size;
uint32_t utile_w = utile_width(cpp);
uint32_t utile_h = utile_height(cpp);
- /* The values are limited by the packet/texture parameter bitfields,
- * so we don't need to worry as much about integer overflow.
+ /* The shaded vertex format stores signed 12.4 fixed point
+ * (-2048,2047) offsets from the viewport center, so we should
+ * never have a render target larger than 4096. The texture
+ * unit can only sample from 2048x2048, so it's even more
+ * restricted. This lets us avoid worrying about overflow in
+ * our math.
*/
- BUG_ON(width > 65535);
- BUG_ON(height > 65535);
+ if (width > 4096 || height > 4096) {
+ DRM_ERROR("Surface dimesions (%d,%d) too large", width, height);
+ return false;
+ }
switch (tiling_format) {
case VC4_TILING_FORMAT_LINEAR:
- aligned_width = roundup(width, utile_w);
+ aligned_width = round_up(width, utile_w);
aligned_height = height;
break;
case VC4_TILING_FORMAT_T:
- aligned_width = roundup(width, utile_w * 8);
- aligned_height = roundup(height, utile_h * 8);
+ aligned_width = round_up(width, utile_w * 8);
+ aligned_height = round_up(height, utile_h * 8);
break;
case VC4_TILING_FORMAT_LT:
- aligned_width = roundup(width, utile_w);
- aligned_height = roundup(height, utile_h);
+ aligned_width = round_up(width, utile_w);
+ aligned_height = round_up(height, utile_h);
break;
default:
DRM_ERROR("buffer tiling %d unsupported\n", tiling_format);
@@ -181,13 +187,6 @@ check_tex_size(struct vc4_exec_info *exec, struct drm_gem_cma_object *fbo,
}
stride = aligned_width * cpp;
-
- if (INT_MAX / stride < aligned_height) {
- DRM_ERROR("Overflow in fbo size (%dx%d -> %dx%d)\n",
- width, height,
- aligned_width, aligned_height);
- return false;
- }
size = stride * aligned_height;
if (size + offset < size ||
@@ -249,122 +248,6 @@ validate_increment_semaphore(VALIDATE_ARGS)
}
static int
-validate_wait_on_semaphore(VALIDATE_ARGS)
-{
- if (exec->found_wait_on_semaphore_packet) {
- DRM_ERROR("Duplicate VC4_PACKET_WAIT_ON_SEMAPHORE\n");
- return -EINVAL;
- }
- exec->found_wait_on_semaphore_packet = true;
-
- if (!exec->found_increment_semaphore_packet) {
- DRM_ERROR("VC4_PACKET_WAIT_ON_SEMAPHORE without "
- "VC4_PACKET_INCREMENT_SEMAPHORE\n");
- return -EINVAL;
- }
-
- return 0;
-}
-
-static int
-validate_branch_to_sublist(VALIDATE_ARGS)
-{
- struct drm_gem_cma_object *target;
- uint32_t offset;
-
- if (!vc4_use_handle(exec, 0, VC4_MODE_TILE_ALLOC, &target))
- return -EINVAL;
-
- if (target != exec->tile_alloc_bo) {
- DRM_ERROR("Jumping to BOs other than tile alloc unsupported\n");
- return -EINVAL;
- }
-
- if (!exec->found_wait_on_semaphore_packet) {
- DRM_ERROR("Jumping to tile alloc before binning finished.\n");
- return -EINVAL;
- }
-
- offset = *(uint32_t *)(untrusted + 0);
- if (offset % exec->tile_alloc_init_block_size ||
- offset / exec->tile_alloc_init_block_size >=
- exec->bin_tiles_x * exec->bin_tiles_y) {
- DRM_ERROR("VC4_PACKET_BRANCH_TO_SUB_LIST must jump to initial "
- "tile allocation space.\n");
- return -EINVAL;
- }
-
- *(uint32_t *)(validated + 0) = target->paddr + offset;
-
- return 0;
-}
-
-/**
- * validate_loadstore_tile_buffer_general() - Validation for
- * VC4_PACKET_LOAD_TILE_BUFFER_GENERAL and
- * VC4_PACKET_STORE_TILE_BUFFER_GENERAL.
- *
- * The two packets are nearly the same, except for the TLB-clearing management
- * bits not being present for loads. Additionally, while stores are executed
- * immediately (using the current tile coordinates), loads are queued to be
- * executed when the tile coordinates packet occurs.
- *
- * Note that coordinates packets are validated to be within the declared
- * bin_x/y, which themselves are verified to match the rendering-configuration
- * FB width and height (which the hardware uses to clip loads and stores).
- */
-static int
-validate_loadstore_tile_buffer_general(VALIDATE_ARGS)
-{
- uint32_t packet_b0 = *(uint8_t *)(untrusted + 0);
- uint32_t packet_b1 = *(uint8_t *)(untrusted + 1);
- struct drm_gem_cma_object *fbo;
- uint32_t buffer_type = packet_b0 & 0xf;
- uint32_t untrusted_address, offset, cpp;
-
- switch (buffer_type) {
- case VC4_LOADSTORE_TILE_BUFFER_NONE:
- return 0;
- case VC4_LOADSTORE_TILE_BUFFER_COLOR:
- if ((packet_b1 & VC4_LOADSTORE_TILE_BUFFER_MASK) ==
- VC4_LOADSTORE_TILE_BUFFER_RGBA8888) {
- cpp = 4;
- } else {
- cpp = 2;
- }
- break;
-
- case VC4_LOADSTORE_TILE_BUFFER_Z:
- case VC4_LOADSTORE_TILE_BUFFER_ZS:
- cpp = 4;
- break;
-
- default:
- DRM_ERROR("Load/store type %d unsupported\n", buffer_type);
- return -EINVAL;
- }
-
- if (!vc4_use_handle(exec, 0, VC4_MODE_RENDER, &fbo))
- return -EINVAL;
-
- untrusted_address = *(uint32_t *)(untrusted + 2);
- offset = untrusted_address & ~0xf;
-
- if (!check_tex_size(exec, fbo, offset,
- ((packet_b0 &
- VC4_LOADSTORE_TILE_BUFFER_FORMAT_MASK) >>
- VC4_LOADSTORE_TILE_BUFFER_FORMAT_SHIFT),
- exec->fb_width, exec->fb_height, cpp)) {
- return -EINVAL;
- }
-
- *(uint32_t *)(validated + 2) = (offset + fbo->paddr +
- (untrusted_address & 0xf));
-
- return 0;
-}
-
-static int
validate_indexed_prim_list(VALIDATE_ARGS)
{
struct drm_gem_cma_object *ib;
@@ -492,14 +375,10 @@ validate_nv_shader_state(VALIDATE_ARGS)
static int
validate_tile_binning_config(VALIDATE_ARGS)
{
- struct drm_gem_cma_object *tile_allocation;
- struct drm_gem_cma_object *tile_state_data_array;
+ struct drm_device *dev = exec->exec_bo->base.dev;
uint8_t flags;
- uint32_t tile_allocation_size;
-
- if (!vc4_use_handle(exec, 0, VC4_MODE_TILE_ALLOC, &tile_allocation) ||
- !vc4_use_handle(exec, 1, VC4_MODE_TSDA, &tile_state_data_array))
- return -EINVAL;
+ uint32_t tile_state_size, tile_alloc_size;
+ uint32_t tile_count;
if (exec->found_tile_binning_mode_config_packet) {
DRM_ERROR("Duplicate VC4_PACKET_TILE_BINNING_MODE_CONFIG\n");
@@ -509,6 +388,7 @@ validate_tile_binning_config(VALIDATE_ARGS)
exec->bin_tiles_x = *(uint8_t *)(untrusted + 12);
exec->bin_tiles_y = *(uint8_t *)(untrusted + 13);
+ tile_count = exec->bin_tiles_x * exec->bin_tiles_y;
flags = *(uint8_t *)(untrusted + 14);
if (exec->bin_tiles_x == 0 ||
@@ -518,15 +398,6 @@ validate_tile_binning_config(VALIDATE_ARGS)
return -EINVAL;
}
- /* Our validation relies on the user not getting to set up their own
- * tile state/tile allocation BO contents.
- */
- if (!(flags & VC4_BIN_CONFIG_AUTO_INIT_TSDA)) {
- DRM_ERROR("binning config missing "
- "VC4_BIN_CONFIG_AUTO_INIT_TSDA\n");
- return -EINVAL;
- }
-
if (flags & (VC4_BIN_CONFIG_DB_NON_MS |
VC4_BIN_CONFIG_TILE_BUFFER_64BIT |
VC4_BIN_CONFIG_MS_MODE_4X)) {
@@ -534,94 +405,52 @@ validate_tile_binning_config(VALIDATE_ARGS)
return -EINVAL;
}
- if (*(uint32_t *)(untrusted + 0) != 0) {
- DRM_ERROR("tile allocation offset != 0 unsupported\n");
- return -EINVAL;
- }
- tile_allocation_size = *(uint32_t *)(untrusted + 4);
- if (tile_allocation_size > tile_allocation->base.size) {
- DRM_ERROR("tile allocation size %d > BO size %d\n",
- tile_allocation_size, tile_allocation->base.size);
- return -EINVAL;
- }
- *(uint32_t *)validated = tile_allocation->paddr;
- exec->tile_alloc_bo = tile_allocation;
-
- exec->tile_alloc_init_block_size = 1 << (5 + ((flags >> 5) & 3));
- if (exec->bin_tiles_x * exec->bin_tiles_y *
- exec->tile_alloc_init_block_size > tile_allocation_size) {
- DRM_ERROR("tile init exceeds tile alloc size (%d vs %d)\n",
- exec->bin_tiles_x * exec->bin_tiles_y *
- exec->tile_alloc_init_block_size,
- tile_allocation_size);
- return -EINVAL;
- }
- if (*(uint32_t *)(untrusted + 8) != 0) {
- DRM_ERROR("TSDA offset != 0 unsupported\n");
- return -EINVAL;
- }
- if (exec->bin_tiles_x * exec->bin_tiles_y * 48 >
- tile_state_data_array->base.size) {
- DRM_ERROR("TSDA of %db too small for %dx%d bin config\n",
- tile_state_data_array->base.size,
- exec->bin_tiles_x, exec->bin_tiles_y);
- }
- *(uint32_t *)(validated + 8) = tile_state_data_array->paddr;
-
- return 0;
-}
-
-static int
-validate_tile_rendering_mode_config(VALIDATE_ARGS)
-{
- struct drm_gem_cma_object *fbo;
- uint32_t flags, offset, cpp;
-
- if (exec->found_tile_rendering_mode_config_packet) {
- DRM_ERROR("Duplicate VC4_PACKET_TILE_RENDERING_MODE_CONFIG\n");
- return -EINVAL;
- }
- exec->found_tile_rendering_mode_config_packet = true;
-
- if (!vc4_use_handle(exec, 0, VC4_MODE_RENDER, &fbo))
- return -EINVAL;
-
- exec->fb_width = *(uint16_t *)(untrusted + 4);
- exec->fb_height = *(uint16_t *)(untrusted + 6);
-
- flags = *(uint16_t *)(untrusted + 8);
- if ((flags & VC4_RENDER_CONFIG_FORMAT_MASK) ==
- VC4_RENDER_CONFIG_FORMAT_RGBA8888) {
- cpp = 4;
- } else {
- cpp = 2;
- }
-
- offset = *(uint32_t *)untrusted;
- if (!check_tex_size(exec, fbo, offset,
- ((flags &
- VC4_RENDER_CONFIG_MEMORY_FORMAT_MASK) >>
- VC4_RENDER_CONFIG_MEMORY_FORMAT_SHIFT),
- exec->fb_width, exec->fb_height, cpp)) {
- return -EINVAL;
- }
-
- *(uint32_t *)validated = fbo->paddr + offset;
-
- return 0;
-}
-
-static int
-validate_tile_coordinates(VALIDATE_ARGS)
-{
- uint8_t tile_x = *(uint8_t *)(untrusted + 0);
- uint8_t tile_y = *(uint8_t *)(untrusted + 1);
+ /* The tile state data array is 48 bytes per tile, and we put it at
+ * the start of a BO containing both it and the tile alloc.
+ */
+ tile_state_size = 48 * tile_count;
+
+ /* Since the tile alloc array will follow us, align. */
+ exec->tile_alloc_offset = roundup(tile_state_size, 4096);
+
+ *(uint8_t *)(validated + 14) =
+ ((flags & ~(VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_MASK |
+ VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_MASK)) |
+ VC4_BIN_CONFIG_AUTO_INIT_TSDA |
+ VC4_SET_FIELD(VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_32,
+ VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE) |
+ VC4_SET_FIELD(VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_128,
+ VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE));
+
+ /* Initial block size. */
+ tile_alloc_size = 32 * tile_count;
+
+ /*
+ * The initial allocation gets rounded to the next 256 bytes before
+ * the hardware starts fulfilling further allocations.
+ */
+ tile_alloc_size = roundup(tile_alloc_size, 256);
- if (tile_x * 64 >= exec->fb_width || tile_y * 64 >= exec->fb_height) {
- DRM_ERROR("Tile coordinates %d,%d > render config %dx%d\n",
- tile_x, tile_y, exec->fb_width, exec->fb_height);
- return -EINVAL;
- }
+ /* Add space for the extra allocations. This is what gets used first,
+ * before overflow memory. It must have at least 4096 bytes, but we
+ * want to avoid overflow memory usage if possible.
+ */
+ tile_alloc_size += 1024 * 1024;
+
+ exec->tile_bo = drm_gem_cma_create(dev, exec->tile_alloc_offset +
+ tile_alloc_size);
+ if (!exec->tile_bo)
+ return -ENOMEM;
+ list_addtail(&to_vc4_bo(&exec->tile_bo->base)->unref_head,
+ &exec->unref_list);
+
+ /* tile alloc address. */
+ *(uint32_t *)(validated + 0) = (exec->tile_bo->paddr +
+ exec->tile_alloc_offset);
+ /* tile alloc size. */
+ *(uint32_t *)(validated + 4) = tile_alloc_size;
+ /* tile state address. */
+ *(uint32_t *)(validated + 8) = exec->tile_bo->paddr;
return 0;
}
@@ -633,78 +462,60 @@ validate_gem_handles(VALIDATE_ARGS)
return 0;
}
+#define VC4_DEFINE_PACKET(packet, name, func) \
+ [packet] = { packet ## _SIZE, name, func }
+
static const struct cmd_info {
- bool bin;
- bool render;
uint16_t len;
const char *name;
int (*func)(struct vc4_exec_info *exec, void *validated,
void *untrusted);
} cmd_info[] = {
- [VC4_PACKET_HALT] = { 1, 1, 1, "halt", NULL },
- [VC4_PACKET_NOP] = { 1, 1, 1, "nop", NULL },
- [VC4_PACKET_FLUSH] = { 1, 1, 1, "flush", NULL },
- [VC4_PACKET_FLUSH_ALL] = { 1, 0, 1, "flush all state", validate_flush_all },
- [VC4_PACKET_START_TILE_BINNING] = { 1, 0, 1, "start tile binning", validate_start_tile_binning },
- [VC4_PACKET_INCREMENT_SEMAPHORE] = { 1, 0, 1, "increment semaphore", validate_increment_semaphore },
- [VC4_PACKET_WAIT_ON_SEMAPHORE] = { 0, 1, 1, "wait on semaphore", validate_wait_on_semaphore },
- /* BRANCH_TO_SUB_LIST is actually supported in the binner as well, but
- * we only use it from the render CL in order to jump into the tile
- * allocation BO.
- */
- [VC4_PACKET_BRANCH_TO_SUB_LIST] = { 0, 1, 5, "branch to sublist", validate_branch_to_sublist },
- [VC4_PACKET_STORE_MS_TILE_BUFFER] = { 0, 1, 1, "store MS resolved tile color buffer", NULL },
- [VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF] = { 0, 1, 1, "store MS resolved tile color buffer and EOF", NULL },
+ VC4_DEFINE_PACKET(VC4_PACKET_HALT, "halt", NULL),
+ VC4_DEFINE_PACKET(VC4_PACKET_NOP, "nop", NULL),
+ VC4_DEFINE_PACKET(VC4_PACKET_FLUSH, "flush", NULL),
+ VC4_DEFINE_PACKET(VC4_PACKET_FLUSH_ALL, "flush all state", validate_flush_all),
+ VC4_DEFINE_PACKET(VC4_PACKET_START_TILE_BINNING, "start tile binning", validate_start_tile_binning),
+ VC4_DEFINE_PACKET(VC4_PACKET_INCREMENT_SEMAPHORE, "increment semaphore", validate_increment_semaphore),
- [VC4_PACKET_STORE_TILE_BUFFER_GENERAL] = { 0, 1, 7, "Store Tile Buffer General", validate_loadstore_tile_buffer_general },
- [VC4_PACKET_LOAD_TILE_BUFFER_GENERAL] = { 0, 1, 7, "Load Tile Buffer General", validate_loadstore_tile_buffer_general },
+ VC4_DEFINE_PACKET(VC4_PACKET_GL_INDEXED_PRIMITIVE, "Indexed Primitive List", validate_indexed_prim_list),
- [VC4_PACKET_GL_INDEXED_PRIMITIVE] = { 1, 1, 14, "Indexed Primitive List", validate_indexed_prim_list },
-
- [VC4_PACKET_GL_ARRAY_PRIMITIVE] = { 1, 1, 10, "Vertex Array Primitives", validate_gl_array_primitive },
+ VC4_DEFINE_PACKET(VC4_PACKET_GL_ARRAY_PRIMITIVE, "Vertex Array Primitives", validate_gl_array_primitive),
/* This is only used by clipped primitives (packets 48 and 49), which
* we don't support parsing yet.
*/
- [VC4_PACKET_PRIMITIVE_LIST_FORMAT] = { 1, 1, 2, "primitive list format", NULL },
-
- [VC4_PACKET_GL_SHADER_STATE] = { 1, 1, 5, "GL Shader State", validate_gl_shader_state },
- [VC4_PACKET_NV_SHADER_STATE] = { 1, 1, 5, "NV Shader State", validate_nv_shader_state },
-
- [VC4_PACKET_CONFIGURATION_BITS] = { 1, 1, 4, "configuration bits", NULL },
- [VC4_PACKET_FLAT_SHADE_FLAGS] = { 1, 1, 5, "flat shade flags", NULL },
- [VC4_PACKET_POINT_SIZE] = { 1, 1, 5, "point size", NULL },
- [VC4_PACKET_LINE_WIDTH] = { 1, 1, 5, "line width", NULL },
- [VC4_PACKET_RHT_X_BOUNDARY] = { 1, 1, 3, "RHT X boundary", NULL },
- [VC4_PACKET_DEPTH_OFFSET] = { 1, 1, 5, "Depth Offset", NULL },
- [VC4_PACKET_CLIP_WINDOW] = { 1, 1, 9, "Clip Window", NULL },
- [VC4_PACKET_VIEWPORT_OFFSET] = { 1, 1, 5, "Viewport Offset", NULL },
- [VC4_PACKET_CLIPPER_XY_SCALING] = { 1, 1, 9, "Clipper XY Scaling", NULL },
+ VC4_DEFINE_PACKET(VC4_PACKET_PRIMITIVE_LIST_FORMAT, "primitive list format", NULL),
+
+ VC4_DEFINE_PACKET(VC4_PACKET_GL_SHADER_STATE, "GL Shader State", validate_gl_shader_state),
+ VC4_DEFINE_PACKET(VC4_PACKET_NV_SHADER_STATE, "NV Shader State", validate_nv_shader_state),
+
+ VC4_DEFINE_PACKET(VC4_PACKET_CONFIGURATION_BITS, "configuration bits", NULL),
+ VC4_DEFINE_PACKET(VC4_PACKET_FLAT_SHADE_FLAGS, "flat shade flags", NULL),
+ VC4_DEFINE_PACKET(VC4_PACKET_POINT_SIZE, "point size", NULL),
+ VC4_DEFINE_PACKET(VC4_PACKET_LINE_WIDTH, "line width", NULL),
+ VC4_DEFINE_PACKET(VC4_PACKET_RHT_X_BOUNDARY, "RHT X boundary", NULL),
+ VC4_DEFINE_PACKET(VC4_PACKET_DEPTH_OFFSET, "Depth Offset", NULL),
+ VC4_DEFINE_PACKET(VC4_PACKET_CLIP_WINDOW, "Clip Window", NULL),
+ VC4_DEFINE_PACKET(VC4_PACKET_VIEWPORT_OFFSET, "Viewport Offset", NULL),
+ VC4_DEFINE_PACKET(VC4_PACKET_CLIPPER_XY_SCALING, "Clipper XY Scaling", NULL),
/* Note: The docs say this was also 105, but it was 106 in the
* initial userland code drop.
*/
- [VC4_PACKET_CLIPPER_Z_SCALING] = { 1, 1, 9, "Clipper Z Scale and Offset", NULL },
-
- [VC4_PACKET_TILE_BINNING_MODE_CONFIG] = { 1, 0, 16, "tile binning configuration", validate_tile_binning_config },
+ VC4_DEFINE_PACKET(VC4_PACKET_CLIPPER_Z_SCALING, "Clipper Z Scale and Offset", NULL),
- [VC4_PACKET_TILE_RENDERING_MODE_CONFIG] = { 0, 1, 11, "tile rendering mode configuration", validate_tile_rendering_mode_config},
+ VC4_DEFINE_PACKET(VC4_PACKET_TILE_BINNING_MODE_CONFIG, "tile binning configuration", validate_tile_binning_config),
- [VC4_PACKET_CLEAR_COLORS] = { 0, 1, 14, "Clear Colors", NULL },
-
- [VC4_PACKET_TILE_COORDINATES] = { 0, 1, 3, "Tile Coordinates", validate_tile_coordinates },
-
- [VC4_PACKET_GEM_HANDLES] = { 1, 1, 9, "GEM handles", validate_gem_handles },
+ VC4_DEFINE_PACKET(VC4_PACKET_GEM_HANDLES, "GEM handles", validate_gem_handles),
};
int
-vc4_validate_cl(struct drm_device *dev,
- void *validated,
- void *unvalidated,
- uint32_t len,
- bool is_bin,
- bool has_bin,
- struct vc4_exec_info *exec)
+vc4_validate_bin_cl(struct drm_device *dev,
+ void *validated,
+ void *unvalidated,
+ struct vc4_exec_info *exec)
{
+ uint32_t len = exec->args->bin_cl_size;
uint32_t dst_offset = 0;
uint32_t src_offset = 0;
@@ -732,14 +543,6 @@ vc4_validate_cl(struct drm_device *dev,
src_offset, cmd, info->name, info->len);
#endif
- if ((is_bin && !info->bin) ||
- (!is_bin && !info->render)) {
- DRM_ERROR("0x%08x: packet %d (%s) invalid for %s\n",
- src_offset, cmd, info->name,
- is_bin ? "binner" : "render");
- return -EINVAL;
- }
-
if (src_offset + info->len > len) {
DRM_ERROR("0x%08x: packet %d (%s) length 0x%08x "
"exceeds bounds (0x%08x)\n",
@@ -770,30 +573,16 @@ vc4_validate_cl(struct drm_device *dev,
break;
}
- if (is_bin) {
- exec->ct0ea = exec->ct0ca + dst_offset;
+ exec->ct0ea = exec->ct0ca + dst_offset;
- if (has_bin && !exec->found_start_tile_binning_packet) {
- DRM_ERROR("Bin CL missing VC4_PACKET_START_TILE_BINNING\n");
- return -EINVAL;
- }
- } else {
- if (!exec->found_tile_rendering_mode_config_packet) {
- DRM_ERROR("Render CL missing VC4_PACKET_TILE_RENDERING_MODE_CONFIG\n");
- return -EINVAL;
- }
+ if (!exec->found_start_tile_binning_packet) {
+ DRM_ERROR("Bin CL missing VC4_PACKET_START_TILE_BINNING\n");
+ return -EINVAL;
+ }
- /* Make sure that they actually consumed the semaphore
- * increment from the bin CL. Otherwise a later submit would
- * have render execute immediately.
- */
- if (exec->found_wait_on_semaphore_packet != has_bin) {
- DRM_ERROR("Render CL %s VC4_PACKET_WAIT_ON_SEMAPHORE\n",
- exec->found_wait_on_semaphore_packet ?
- "has" : "missing");
- return -EINVAL;
- }
- exec->ct1ea = exec->ct1ca + dst_offset;
+ if (!exec->found_increment_semaphore_packet) {
+ DRM_ERROR("Bin CL missing VC4_PACKET_INCREMENT_SEMAPHORE\n");
+ return -EINVAL;
}
return 0;
@@ -814,10 +603,10 @@ reloc_tex(struct vc4_exec_info *exec,
uint32_t p3 = (sample->p_offset[3] != ~0 ?
*(uint32_t *)(uniform_data_u + sample->p_offset[3]) : 0);
uint32_t *validated_p0 = exec->uniforms_v + sample->p_offset[0];
- uint32_t offset = p0 & ~0xfff;
- uint32_t miplevels = (p0 & 15);
- uint32_t width = (p1 >> 8) & 2047;
- uint32_t height = (p1 >> 20) & 2047;
+ uint32_t offset = p0 & VC4_TEX_P0_OFFSET_MASK;
+ uint32_t miplevels = VC4_GET_FIELD(p0, VC4_TEX_P0_MIPLVLS);
+ uint32_t width = VC4_GET_FIELD(p1, VC4_TEX_P1_WIDTH);
+ uint32_t height = VC4_GET_FIELD(p1, VC4_TEX_P1_HEIGHT);
uint32_t cpp, tiling_format, utile_w, utile_h;
uint32_t i;
uint32_t cube_map_stride = 0;
@@ -845,16 +634,18 @@ reloc_tex(struct vc4_exec_info *exec,
if (height == 0)
height = 2048;
- if (p0 & (1 << 9)) {
- if ((p2 & (3 << 30)) == (1 << 30))
- cube_map_stride = p2 & 0x3ffff000;
- if ((p3 & (3 << 30)) == (1 << 30)) {
+ if (p0 & VC4_TEX_P0_CMMODE_MASK) {
+ if (VC4_GET_FIELD(p2, VC4_TEX_P2_PTYPE) ==
+ VC4_TEX_P2_PTYPE_CUBE_MAP_STRIDE)
+ cube_map_stride = p2 & VC4_TEX_P2_CMST_MASK;
+ if (VC4_GET_FIELD(p3, VC4_TEX_P2_PTYPE) ==
+ VC4_TEX_P2_PTYPE_CUBE_MAP_STRIDE) {
if (cube_map_stride) {
DRM_ERROR("Cube map stride set twice\n");
return false;
}
- cube_map_stride = p3 & 0x3ffff000;
+ cube_map_stride = p3 & VC4_TEX_P2_CMST_MASK;
}
if (!cube_map_stride) {
DRM_ERROR("Cube map stride not set\n");
@@ -862,7 +653,8 @@ reloc_tex(struct vc4_exec_info *exec,
}
}
- type = ((p0 >> 4) & 15) | ((p1 >> 31) << 4);
+ type = (VC4_GET_FIELD(p0, VC4_TEX_P0_TYPE) |
+ (VC4_GET_FIELD(p1, VC4_TEX_P1_TYPE4) << 4));
switch (type) {
case VC4_TEXTURE_TYPE_RGBA8888:
@@ -905,8 +697,8 @@ reloc_tex(struct vc4_exec_info *exec,
tiling_format = VC4_TILING_FORMAT_T;
}
- if (!check_tex_size(exec, tex, offset + cube_map_stride * 5,
- tiling_format, width, height, cpp)) {
+ if (!vc4_check_tex_size(exec, tex, offset + cube_map_stride * 5,
+ tiling_format, width, height, cpp)) {
return false;
}
@@ -927,15 +719,15 @@ reloc_tex(struct vc4_exec_info *exec,
switch (tiling_format) {
case VC4_TILING_FORMAT_T:
- aligned_width = roundup(level_width, utile_w * 8);
- aligned_height = roundup(level_height, utile_h * 8);
+ aligned_width = round_up(level_width, utile_w * 8);
+ aligned_height = round_up(level_height, utile_h * 8);
break;
case VC4_TILING_FORMAT_LT:
- aligned_width = roundup(level_width, utile_w);
- aligned_height = roundup(level_height, utile_h);
+ aligned_width = round_up(level_width, utile_w);
+ aligned_height = round_up(level_height, utile_h);
break;
default:
- aligned_width = roundup(level_width, utile_w);
+ aligned_width = round_up(level_width, utile_w);
aligned_height = level_height;
break;
}
diff --git a/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c b/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c
index e5a75c5f8c2..ab9a6512e82 100644
--- a/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c
+++ b/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c
@@ -58,7 +58,8 @@ struct vc4_shader_validation_state {
*
* This is used for the validation of direct address memory reads.
*/
- uint32_t live_clamp_offsets[32 + 32 + 4];
+ uint32_t live_min_clamp_offsets[32 + 32 + 4];
+ bool live_max_clamp_regs[32 + 32 + 4];
};
static uint32_t
@@ -77,6 +78,25 @@ waddr_to_live_reg_index(uint32_t waddr, bool is_b)
}
}
+static uint32_t
+raddr_add_a_to_live_reg_index(uint64_t inst)
+{
+ uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
+ uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
+ uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
+ uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
+
+ if (add_a == QPU_MUX_A) {
+ return raddr_a;
+ } else if (add_a == QPU_MUX_B && sig != QPU_SIG_SMALL_IMM) {
+ return 32 + raddr_b;
+ } else if (add_a <= QPU_MUX_R3) {
+ return 64 + add_a;
+ } else {
+ return ~0;
+ }
+}
+
static bool
is_tmu_submit(uint32_t waddr)
{
@@ -136,9 +156,8 @@ check_tmu_write(uint64_t inst,
uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
if (is_direct) {
- uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
- uint32_t clamp_offset = ~0;
+ uint32_t clamp_reg, clamp_offset;
if (sig == QPU_SIG_SMALL_IMM) {
DRM_ERROR("direct TMU read used small immediate\n");
@@ -159,14 +178,13 @@ check_tmu_write(uint64_t inst,
* This is arbitrary, but simpler than supporting flipping the
* two either way.
*/
- if (add_a == QPU_MUX_A) {
- clamp_offset = validation_state->live_clamp_offsets[raddr_a];
- } else if (add_a == QPU_MUX_B) {
- clamp_offset = validation_state->live_clamp_offsets[32 + raddr_b];
- } else if (add_a <= QPU_MUX_R4) {
- clamp_offset = validation_state->live_clamp_offsets[64 + add_a];
+ clamp_reg = raddr_add_a_to_live_reg_index(inst);
+ if (clamp_reg == ~0) {
+ DRM_ERROR("direct TMU load wasn't clamped\n");
+ return false;
}
+ clamp_offset = validation_state->live_min_clamp_offsets[clamp_reg];
if (clamp_offset == ~0) {
DRM_ERROR("direct TMU load wasn't clamped\n");
return false;
@@ -229,8 +247,6 @@ check_register_write(uint64_t inst,
uint32_t waddr = (is_mul ?
QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
QPU_GET_FIELD(inst, QPU_WADDR_ADD));
- bool is_b = is_mul != ((inst & QPU_WS) != 0);
- uint32_t live_reg_index;
switch (waddr) {
case QPU_W_UNIFORMS_ADDRESS:
@@ -285,14 +301,6 @@ check_register_write(uint64_t inst,
return true;
}
- /* Clear out the live offset clamp tracking for the written register.
- * If this particular instruction is setting up an offset clamp, it'll
- * get tracked immediately after we return.
- */
- live_reg_index = waddr_to_live_reg_index(waddr, is_b);
- if (live_reg_index != ~0)
- validation_state->live_clamp_offsets[live_reg_index] = ~0;
-
return true;
}
@@ -301,26 +309,72 @@ track_live_clamps(uint64_t inst,
struct vc4_validated_shader_info *validated_shader,
struct vc4_shader_validation_state *validation_state)
{
+ uint32_t op_add = QPU_GET_FIELD(inst, QPU_OP_ADD);
uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
+ uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
+ uint32_t cond_add = QPU_GET_FIELD(inst, QPU_COND_ADD);
+ uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
- bool is_b = inst & QPU_WS;
- uint32_t live_reg_index;
+ bool ws = inst & QPU_WS;
+ uint32_t lri_add_a, lri_add, lri_mul;
+ bool add_a_is_min_0;
- if (QPU_GET_FIELD(inst, QPU_OP_ADD) != QPU_A_MIN)
+ /* Check whether OP_ADD's A argumennt comes from a live MAX(x, 0),
+ * before we clear previous live state.
+ */
+ lri_add_a = raddr_add_a_to_live_reg_index(inst);
+ add_a_is_min_0 = (lri_add_a != ~0 &&
+ validation_state->live_max_clamp_regs[lri_add_a]);
+
+ /* Clear live state for registers written by our instruction. */
+ lri_add = waddr_to_live_reg_index(waddr_add, ws);
+ lri_mul = waddr_to_live_reg_index(waddr_mul, !ws);
+ if (lri_mul != ~0) {
+ validation_state->live_max_clamp_regs[lri_mul] = false;
+ validation_state->live_min_clamp_offsets[lri_mul] = ~0;
+ }
+ if (lri_add != ~0) {
+ validation_state->live_max_clamp_regs[lri_add] = false;
+ validation_state->live_min_clamp_offsets[lri_add] = ~0;
+ } else {
+ /* Nothing further to do for live tracking, since only ADDs
+ * generate new live clamp registers.
+ */
return;
+ }
+
+ /* Now, handle remaining live clamp tracking for the ADD operation. */
- if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
- !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF &&
- sig != QPU_SIG_SMALL_IMM)) {
+ if (cond_add != QPU_COND_ALWAYS)
return;
- }
- live_reg_index = waddr_to_live_reg_index(waddr_add, is_b);
- if (live_reg_index != ~0) {
- validation_state->live_clamp_offsets[live_reg_index] =
+ if (op_add == QPU_A_MAX) {
+ /* Track live clamps of a value to a minimum of 0 (in either
+ * arg).
+ */
+ if (sig != QPU_SIG_SMALL_IMM || raddr_b != 0 ||
+ (add_a != QPU_MUX_B && add_b != QPU_MUX_B)) {
+ return;
+ }
+
+ validation_state->live_max_clamp_regs[lri_add] = true;
+ } if (op_add == QPU_A_MIN) {
+ /* Track live clamps of a value clamped to a minimum of 0 and
+ * a maximum of some uniform's offset.
+ */
+ if (!add_a_is_min_0)
+ return;
+
+ if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
+ !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF &&
+ sig != QPU_SIG_SMALL_IMM)) {
+ return;
+ }
+
+ validation_state->live_min_clamp_offsets[lri_add] =
validated_shader->uniforms_size;
}
}
@@ -382,8 +436,8 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
for (i = 0; i < 8; i++)
validation_state.tmu_setup[i / 4].p_offset[i % 4] = ~0;
- for (i = 0; i < ARRAY_SIZE(validation_state.live_clamp_offsets); i++)
- validation_state.live_clamp_offsets[i] = ~0;
+ for (i = 0; i < ARRAY_SIZE(validation_state.live_min_clamp_offsets); i++)
+ validation_state.live_min_clamp_offsets[i] = ~0;
shader = shader_obj->vaddr;
max_ip = shader_obj->base.size / sizeof(uint64_t);
diff --git a/src/gallium/drivers/vc4/vc4_blit.c b/src/gallium/drivers/vc4/vc4_blit.c
index 2d524c40b4d..d29e2c9c318 100644
--- a/src/gallium/drivers/vc4/vc4_blit.c
+++ b/src/gallium/drivers/vc4/vc4_blit.c
@@ -26,86 +26,7 @@
#include "util/u_blitter.h"
#include "vc4_context.h"
-static void
-vc4_tile_blit_color_rcl(struct vc4_context *vc4,
- struct vc4_surface *dst_surf,
- struct vc4_surface *src_surf)
-{
- struct vc4_resource *src = vc4_resource(src_surf->base.texture);
- struct vc4_resource *dst = vc4_resource(dst_surf->base.texture);
-
- uint32_t min_x_tile = 0;
- uint32_t min_y_tile = 0;
- uint32_t max_x_tile = (dst_surf->base.width - 1) / 64;
- uint32_t max_y_tile = (dst_surf->base.height - 1) / 64;
- uint32_t xtiles = max_x_tile - min_x_tile + 1;
- uint32_t ytiles = max_y_tile - min_y_tile + 1;
- uint32_t reloc_size = 9;
- uint32_t config_size = 11 + reloc_size;
- uint32_t loadstore_size = 7 + reloc_size;
- uint32_t tilecoords_size = 3;
- cl_ensure_space(&vc4->rcl,
- config_size +
- xtiles * ytiles * (loadstore_size * 2 +
- tilecoords_size * 1));
- cl_ensure_space(&vc4->bo_handles, 2 * sizeof(uint32_t));
- cl_ensure_space(&vc4->bo_pointers, 2 * sizeof(struct vc4_bo *));
-
- cl_start_reloc(&vc4->rcl, 1);
- cl_u8(&vc4->rcl, VC4_PACKET_TILE_RENDERING_MODE_CONFIG);
- cl_reloc(vc4, &vc4->rcl, dst->bo, dst_surf->offset);
- cl_u16(&vc4->rcl, dst_surf->base.width);
- cl_u16(&vc4->rcl, dst_surf->base.height);
- cl_u16(&vc4->rcl, ((dst_surf->tiling <<
- VC4_RENDER_CONFIG_MEMORY_FORMAT_SHIFT) |
- (vc4_rt_format_is_565(dst_surf->base.format) ?
- VC4_RENDER_CONFIG_FORMAT_BGR565 :
- VC4_RENDER_CONFIG_FORMAT_RGBA8888)));
-
- uint32_t src_hindex = vc4_gem_hindex(vc4, src->bo);
-
- for (int y = min_y_tile; y <= max_y_tile; y++) {
- for (int x = min_x_tile; x <= max_x_tile; x++) {
- bool end_of_frame = (x == max_x_tile &&
- y == max_y_tile);
-
- cl_start_reloc(&vc4->rcl, 1);
- cl_u8(&vc4->rcl, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL);
- cl_u8(&vc4->rcl,
- VC4_LOADSTORE_TILE_BUFFER_COLOR |
- (src_surf->tiling <<
- VC4_LOADSTORE_TILE_BUFFER_FORMAT_SHIFT));
- cl_u8(&vc4->rcl,
- vc4_rt_format_is_565(src_surf->base.format) ?
- VC4_LOADSTORE_TILE_BUFFER_BGR565 :
- VC4_LOADSTORE_TILE_BUFFER_RGBA8888);
- cl_reloc_hindex(&vc4->rcl, src_hindex,
- src_surf->offset);
-
- cl_u8(&vc4->rcl, VC4_PACKET_TILE_COORDINATES);
- cl_u8(&vc4->rcl, x);
- cl_u8(&vc4->rcl, y);
-
- if (end_of_frame) {
- cl_u8(&vc4->rcl,
- VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF);
- } else {
- cl_u8(&vc4->rcl,
- VC4_PACKET_STORE_MS_TILE_BUFFER);
- }
- }
- }
-
- vc4->draw_min_x = 0;
- vc4->draw_min_y = 0;
- vc4->draw_max_x = dst_surf->base.width;
- vc4->draw_max_y = dst_surf->base.height;
-
- dst->writes++;
- vc4->needs_flush = true;
-}
-
-static struct vc4_surface *
+static struct pipe_surface *
vc4_get_blit_surface(struct pipe_context *pctx,
struct pipe_resource *prsc, unsigned level)
{
@@ -117,7 +38,7 @@ vc4_get_blit_surface(struct pipe_context *pctx,
tmpl.u.tex.first_layer = 0;
tmpl.u.tex.last_layer = 0;
- return vc4_surface(pctx->create_surface(pctx, prsc, &tmpl));
+ return pctx->create_surface(pctx, prsc, &tmpl);
}
static bool
@@ -141,17 +62,28 @@ vc4_tile_blit(struct pipe_context *pctx, const struct pipe_blit_info *info)
if (info->dst.resource->format != info->src.resource->format)
return false;
- struct vc4_surface *dst_surf =
+ vc4_flush(pctx);
+
+ struct pipe_surface *dst_surf =
vc4_get_blit_surface(pctx, info->dst.resource, info->dst.level);
- struct vc4_surface *src_surf =
+ struct pipe_surface *src_surf =
vc4_get_blit_surface(pctx, info->src.resource, info->src.level);
- vc4_flush(pctx);
- vc4_tile_blit_color_rcl(vc4, dst_surf, src_surf);
+ pipe_surface_reference(&vc4->color_read, src_surf);
+ pipe_surface_reference(&vc4->color_write, dst_surf);
+ pipe_surface_reference(&vc4->zs_read, NULL);
+ pipe_surface_reference(&vc4->zs_write, NULL);
+ vc4->draw_min_x = 0;
+ vc4->draw_min_y = 0;
+ vc4->draw_max_x = dst_surf->width;
+ vc4->draw_max_y = dst_surf->height;
+ vc4->draw_width = dst_surf->width;
+ vc4->draw_height = dst_surf->height;
+ vc4->needs_flush = true;
vc4_job_submit(vc4);
- pctx->surface_destroy(pctx, &dst_surf->base);
- pctx->surface_destroy(pctx, &src_surf->base);
+ pipe_surface_reference(&dst_surf, NULL);
+ pipe_surface_reference(&src_surf, NULL);
return true;
}
diff --git a/src/gallium/drivers/vc4/vc4_bufmgr.c b/src/gallium/drivers/vc4/vc4_bufmgr.c
index 4bb2c711e16..cbdb9e89cf6 100644
--- a/src/gallium/drivers/vc4/vc4_bufmgr.c
+++ b/src/gallium/drivers/vc4/vc4_bufmgr.c
@@ -34,8 +34,46 @@
#include "vc4_context.h"
#include "vc4_screen.h"
-#define container_of(ptr, type, field) \
- (type*)((char*)ptr - offsetof(type, field))
+static bool dump_stats = false;
+
+static void
+vc4_bo_dump_stats(struct vc4_screen *screen)
+{
+ struct vc4_bo_cache *cache = &screen->bo_cache;
+
+ fprintf(stderr, " BOs allocated: %d\n", screen->bo_count);
+ fprintf(stderr, " BOs size: %dkb\n", screen->bo_size / 102);
+ fprintf(stderr, " BOs cached: %d\n", cache->bo_count);
+ fprintf(stderr, " BOs cached size: %dkb\n", cache->bo_size / 102);
+
+ if (!list_empty(&cache->time_list)) {
+ struct vc4_bo *first = LIST_ENTRY(struct vc4_bo,
+ cache->time_list.next,
+ time_list);
+ struct vc4_bo *last = LIST_ENTRY(struct vc4_bo,
+ cache->time_list.prev,
+ time_list);
+
+ fprintf(stderr, " oldest cache time: %ld\n",
+ (long)first->free_time);
+ fprintf(stderr, " newest cache time: %ld\n",
+ (long)last->free_time);
+
+ struct timespec time;
+ clock_gettime(CLOCK_MONOTONIC, &time);
+ fprintf(stderr, " now: %ld\n",
+ time.tv_sec);
+ }
+}
+
+static void
+vc4_bo_remove_from_cache(struct vc4_bo_cache *cache, struct vc4_bo *bo)
+{
+ list_del(&bo->time_list);
+ list_del(&bo->size_list);
+ cache->bo_count--;
+ cache->bo_size -= bo->size;
+}
static struct vc4_bo *
vc4_bo_from_cache(struct vc4_screen *screen, uint32_t size, const char *name)
@@ -48,12 +86,21 @@ vc4_bo_from_cache(struct vc4_screen *screen, uint32_t size, const char *name)
struct vc4_bo *bo = NULL;
pipe_mutex_lock(cache->lock);
- if (!is_empty_list(&cache->size_list[page_index])) {
- struct simple_node *node = last_elem(&cache->size_list[page_index]);
- bo = container_of(node, struct vc4_bo, size_list);
+ if (!list_empty(&cache->size_list[page_index])) {
+ bo = LIST_ENTRY(struct vc4_bo, cache->size_list[page_index].next,
+ size_list);
+
+ /* Check that the BO has gone idle. If not, then we want to
+ * allocate something new instead, since we assume that the
+ * user will proceed to CPU map it and fill it with stuff.
+ */
+ if (!vc4_bo_wait(bo, 0)) {
+ pipe_mutex_unlock(cache->lock);
+ return NULL;
+ }
+
pipe_reference_init(&bo->reference, 1);
- remove_from_list(&bo->time_list);
- remove_from_list(&bo->size_list);
+ vc4_bo_remove_from_cache(cache, bo);
bo->name = name;
}
@@ -70,8 +117,14 @@ vc4_bo_alloc(struct vc4_screen *screen, uint32_t size, const char *name)
size = align(size, 4096);
bo = vc4_bo_from_cache(screen, size, name);
- if (bo)
+ if (bo) {
+ if (dump_stats) {
+ fprintf(stderr, "Allocated %s %dkb from cache:\n",
+ name, size / 1024);
+ vc4_bo_dump_stats(screen);
+ }
return bo;
+ }
bo = CALLOC_STRUCT(vc4_bo);
if (!bo)
@@ -108,6 +161,13 @@ vc4_bo_alloc(struct vc4_screen *screen, uint32_t size, const char *name)
abort();
}
+ screen->bo_count++;
+ screen->bo_size += bo->size;
+ if (dump_stats) {
+ fprintf(stderr, "Allocated %s %dkb:\n", name, size / 1024);
+ vc4_bo_dump_stats(screen);
+ }
+
return bo;
}
@@ -145,26 +205,47 @@ vc4_bo_free(struct vc4_bo *bo)
if (ret != 0)
fprintf(stderr, "close object %d: %s\n", bo->handle, strerror(errno));
+ screen->bo_count--;
+ screen->bo_size -= bo->size;
+
+ if (dump_stats) {
+ fprintf(stderr, "Freed %s%s%dkb:\n",
+ bo->name ? bo->name : "",
+ bo->name ? " " : "",
+ bo->size / 1024);
+ vc4_bo_dump_stats(screen);
+ }
+
free(bo);
}
static void
free_stale_bos(struct vc4_screen *screen, time_t time)
{
- while (!is_empty_list(&screen->bo_cache.time_list)) {
- struct simple_node *node =
- first_elem(&screen->bo_cache.time_list);
- struct vc4_bo *bo = container_of(node, struct vc4_bo, time_list);
+ struct vc4_bo_cache *cache = &screen->bo_cache;
+ bool freed_any = false;
+
+ list_for_each_entry_safe(struct vc4_bo, bo, &cache->time_list,
+ time_list) {
+ if (dump_stats && !freed_any) {
+ fprintf(stderr, "Freeing stale BOs:\n");
+ vc4_bo_dump_stats(screen);
+ freed_any = true;
+ }
/* If it's more than a second old, free it. */
if (time - bo->free_time > 2) {
- remove_from_list(&bo->time_list);
- remove_from_list(&bo->size_list);
+ vc4_bo_remove_from_cache(cache, bo);
vc4_bo_free(bo);
} else {
break;
}
}
+
+ if (dump_stats && freed_any) {
+ fprintf(stderr, "Freed stale BOs:\n");
+ vc4_bo_dump_stats(screen);
+ }
}
void
@@ -180,16 +261,16 @@ vc4_bo_last_unreference_locked_timed(struct vc4_bo *bo, time_t time)
}
if (cache->size_list_size <= page_index) {
- struct simple_node *new_list =
- ralloc_array(screen, struct simple_node, page_index + 1);
+ struct list_head *new_list =
+ ralloc_array(screen, struct list_head, page_index + 1);
/* Move old list contents over (since the array has moved, and
- * therefore the pointers to the list heads have to change.
+ * therefore the pointers to the list heads have to change).
*/
for (int i = 0; i < cache->size_list_size; i++) {
- struct simple_node *old_head = &cache->size_list[i];
- if (is_empty_list(old_head))
- make_empty_list(&new_list[i]);
+ struct list_head *old_head = &cache->size_list[i];
+ if (list_empty(old_head))
+ list_inithead(&new_list[i]);
else {
new_list[i].next = old_head->next;
new_list[i].prev = old_head->prev;
@@ -198,15 +279,23 @@ vc4_bo_last_unreference_locked_timed(struct vc4_bo *bo, time_t time)
}
}
for (int i = cache->size_list_size; i < page_index + 1; i++)
- make_empty_list(&new_list[i]);
+ list_inithead(&new_list[i]);
cache->size_list = new_list;
cache->size_list_size = page_index + 1;
}
bo->free_time = time;
- insert_at_tail(&cache->size_list[page_index], &bo->size_list);
- insert_at_tail(&cache->time_list, &bo->time_list);
+ list_addtail(&bo->size_list, &cache->size_list[page_index]);
+ list_addtail(&bo->time_list, &cache->time_list);
+ cache->bo_count++;
+ cache->bo_size += bo->size;
+ if (dump_stats) {
+ fprintf(stderr, "Freed %s %dkb to cache:\n",
+ bo->name, bo->size / 1024);
+ vc4_bo_dump_stats(screen);
+ }
+ bo->name = NULL;
free_stale_bos(screen, time);
}
@@ -286,6 +375,7 @@ vc4_bo_get_dmabuf(struct vc4_bo *bo)
bo->handle);
return -1;
}
+ bo->private = false;
return fd;
}
@@ -342,15 +432,17 @@ vc4_wait_seqno(struct vc4_screen *screen, uint64_t seqno, uint64_t timeout_ns)
ret = 0;
}
- if (ret == -ETIME) {
- return false;
- } else if (ret != 0) {
- fprintf(stderr, "wait failed\n");
- abort();
- } else {
+ if (ret == 0) {
screen->finished_seqno = wait.seqno;
return true;
}
+
+ if (errno != ETIME) {
+ fprintf(stderr, "wait failed: %d\n", ret);
+ abort();
+ }
+
+ return false;
}
bool
@@ -369,14 +461,15 @@ vc4_bo_wait(struct vc4_bo *bo, uint64_t timeout_ns)
else
ret = 0;
- if (ret == -ETIME) {
- return false;
- } else if (ret != 0) {
- fprintf(stderr, "wait failed\n");
- abort();
- } else {
+ if (ret == 0)
return true;
+
+ if (errno != ETIME) {
+ fprintf(stderr, "wait failed: %d\n", ret);
+ abort();
}
+
+ return false;
}
void *
@@ -437,12 +530,14 @@ vc4_bufmgr_destroy(struct pipe_screen *pscreen)
struct vc4_screen *screen = vc4_screen(pscreen);
struct vc4_bo_cache *cache = &screen->bo_cache;
- while (!is_empty_list(&cache->time_list)) {
- struct simple_node *node = first_elem(&cache->time_list);
- struct vc4_bo *bo = container_of(node, struct vc4_bo, time_list);
-
- remove_from_list(&bo->time_list);
- remove_from_list(&bo->size_list);
+ list_for_each_entry_safe(struct vc4_bo, bo, &cache->time_list,
+ time_list) {
+ vc4_bo_remove_from_cache(cache, bo);
vc4_bo_free(bo);
}
+
+ if (dump_stats) {
+ fprintf(stderr, "BO stats after screen destroy:\n");
+ vc4_bo_dump_stats(screen);
+ }
}
diff --git a/src/gallium/drivers/vc4/vc4_bufmgr.h b/src/gallium/drivers/vc4/vc4_bufmgr.h
index f9559e999a1..7320695ca8e 100644
--- a/src/gallium/drivers/vc4/vc4_bufmgr.h
+++ b/src/gallium/drivers/vc4/vc4_bufmgr.h
@@ -44,9 +44,9 @@ struct vc4_bo {
#endif
/** Entry in the linked list of buffers freed, by age. */
- struct simple_node time_list;
+ struct list_head time_list;
/** Entry in the per-page-count linked list of buffers freed (by age). */
- struct simple_node size_list;
+ struct list_head size_list;
/** Approximate second when the bo was freed. */
time_t free_time;
/**
diff --git a/src/gallium/drivers/vc4/vc4_cl.h b/src/gallium/drivers/vc4/vc4_cl.h
index 32a2e717379..4a50e790942 100644
--- a/src/gallium/drivers/vc4/vc4_cl.h
+++ b/src/gallium/drivers/vc4/vc4_cl.h
@@ -29,7 +29,7 @@
#include "util/u_math.h"
#include "util/macros.h"
-#include "vc4_packet.h"
+#include "kernel/vc4_packet.h"
struct vc4_bo;
diff --git a/src/gallium/drivers/vc4/vc4_cl_dump.c b/src/gallium/drivers/vc4/vc4_cl_dump.c
index 14239840d32..69055081daa 100644
--- a/src/gallium/drivers/vc4/vc4_cl_dump.c
+++ b/src/gallium/drivers/vc4/vc4_cl_dump.c
@@ -174,6 +174,37 @@ dump_VC4_PACKET_CLIPPER_Z_SCALING(void *cl, uint32_t offset, uint32_t hw_offset)
}
static void
+dump_VC4_PACKET_TILE_BINNING_MODE_CONFIG(void *cl, uint32_t offset, uint32_t hw_offset)
+{
+ uint32_t *tile_alloc_addr = cl + offset;
+ uint32_t *tile_alloc_size = cl + offset + 4;
+ uint32_t *tile_state_addr = cl + offset + 8;
+ uint8_t *bin_x = cl + offset + 12;
+ uint8_t *bin_y = cl + offset + 13;
+ uint8_t *flags = cl + offset + 14;
+
+ fprintf(stderr, "0x%08x 0x%08x: tile alloc addr 0x%08x\n",
+ offset, hw_offset,
+ *tile_alloc_addr);
+
+ fprintf(stderr, "0x%08x 0x%08x: tile alloc size %db\n",
+ offset + 4, hw_offset + 4,
+ *tile_alloc_size);
+
+ fprintf(stderr, "0x%08x 0x%08x: tile state addr 0x%08x\n",
+ offset + 8, hw_offset + 8,
+ *tile_state_addr);
+
+ fprintf(stderr, "0x%08x 0x%08x: tiles (%d, %d)\n",
+ offset + 12, hw_offset + 12,
+ *bin_x, *bin_y);
+
+ fprintf(stderr, "0x%08x 0x%08x: flags 0x%02x\n",
+ offset + 14, hw_offset + 14,
+ *flags);
+}
+
+static void
dump_VC4_PACKET_TILE_RENDERING_MODE_CONFIG(void *cl, uint32_t offset, uint32_t hw_offset)
{
uint32_t *render_offset = cl + offset;
@@ -311,7 +342,7 @@ static const struct packet_info {
PACKET_DUMP(VC4_PACKET_CLIPPER_XY_SCALING, 9),
PACKET_DUMP(VC4_PACKET_CLIPPER_Z_SCALING, 9),
- PACKET(VC4_PACKET_TILE_BINNING_MODE_CONFIG, 16),
+ PACKET_DUMP(VC4_PACKET_TILE_BINNING_MODE_CONFIG, 16),
PACKET_DUMP(VC4_PACKET_TILE_RENDERING_MODE_CONFIG, 11),
PACKET(VC4_PACKET_CLEAR_COLORS, 14),
PACKET_DUMP(VC4_PACKET_TILE_COORDINATES, 3),
diff --git a/src/gallium/drivers/vc4/vc4_context.c b/src/gallium/drivers/vc4/vc4_context.c
index b394c186efb..630f8e68896 100644
--- a/src/gallium/drivers/vc4/vc4_context.c
+++ b/src/gallium/drivers/vc4/vc4_context.c
@@ -29,6 +29,7 @@
#include "util/u_inlines.h"
#include "util/u_memory.h"
#include "util/u_blitter.h"
+#include "util/u_upload_mgr.h"
#include "indices/u_primconvert.h"
#include "pipe/p_screen.h"
@@ -36,270 +37,12 @@
#include "vc4_context.h"
#include "vc4_resource.h"
-/**
- * Emits a no-op STORE_TILE_BUFFER_GENERAL.
- *
- * If we emit a PACKET_TILE_COORDINATES, it must be followed by a store of
- * some sort before another load is triggered.
- */
-static void
-vc4_store_before_load(struct vc4_context *vc4, bool *coords_emitted)
-{
- if (!*coords_emitted)
- return;
-
- cl_u8(&vc4->rcl, VC4_PACKET_STORE_TILE_BUFFER_GENERAL);
- cl_u8(&vc4->rcl, VC4_LOADSTORE_TILE_BUFFER_NONE);
- cl_u8(&vc4->rcl, (VC4_STORE_TILE_BUFFER_DISABLE_COLOR_CLEAR |
- VC4_STORE_TILE_BUFFER_DISABLE_ZS_CLEAR |
- VC4_STORE_TILE_BUFFER_DISABLE_VG_MASK_CLEAR));
- cl_u32(&vc4->rcl, 0); /* no address, since we're in None mode */
-
- *coords_emitted = false;
-}
-
-/**
- * Emits a PACKET_TILE_COORDINATES if one isn't already pending.
- *
- * The tile coordinates packet triggers a pending load if there is one, are
- * used for clipping during rendering, and determine where loads/stores happen
- * relative to their base address.
- */
-static void
-vc4_tile_coordinates(struct vc4_context *vc4, uint32_t x, uint32_t y,
- bool *coords_emitted)
-{
- if (*coords_emitted)
- return;
-
- cl_u8(&vc4->rcl, VC4_PACKET_TILE_COORDINATES);
- cl_u8(&vc4->rcl, x);
- cl_u8(&vc4->rcl, y);
-
- *coords_emitted = true;
-}
-
-static void
-vc4_setup_rcl(struct vc4_context *vc4)
-{
- struct vc4_surface *csurf = vc4_surface(vc4->framebuffer.cbufs[0]);
- struct vc4_resource *ctex = csurf ? vc4_resource(csurf->base.texture) : NULL;
- struct vc4_surface *zsurf = vc4_surface(vc4->framebuffer.zsbuf);
- struct vc4_resource *ztex = zsurf ? vc4_resource(zsurf->base.texture) : NULL;
-
- if (!csurf)
- vc4->resolve &= ~PIPE_CLEAR_COLOR0;
- if (!zsurf)
- vc4->resolve &= ~(PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL);
- uint32_t resolve_uncleared = vc4->resolve & ~vc4->cleared;
- uint32_t width = vc4->framebuffer.width;
- uint32_t height = vc4->framebuffer.height;
- uint32_t stride_in_tiles = align(width, 64) / 64;
-
- assert(vc4->draw_min_x != ~0 && vc4->draw_min_y != ~0);
- uint32_t min_x_tile = vc4->draw_min_x / 64;
- uint32_t min_y_tile = vc4->draw_min_y / 64;
- uint32_t max_x_tile = (vc4->draw_max_x - 1) / 64;
- uint32_t max_y_tile = (vc4->draw_max_y - 1) / 64;
- uint32_t xtiles = max_x_tile - min_x_tile + 1;
- uint32_t ytiles = max_y_tile - min_y_tile + 1;
-
-#if 0
- fprintf(stderr, "RCL: resolve 0x%x clear 0x%x resolve uncleared 0x%x\n",
- vc4->resolve,
- vc4->cleared,
- resolve_uncleared);
-#endif
-
- uint32_t reloc_size = 9;
- uint32_t clear_size = 14;
- uint32_t config_size = 11 + reloc_size;
- uint32_t loadstore_size = 7 + reloc_size;
- uint32_t tilecoords_size = 3;
- uint32_t branch_size = 5 + reloc_size;
- uint32_t color_store_size = 1;
- uint32_t semaphore_size = 1;
- cl_ensure_space(&vc4->rcl,
- clear_size +
- config_size +
- loadstore_size +
- semaphore_size +
- xtiles * ytiles * (loadstore_size * 4 +
- tilecoords_size * 3 +
- branch_size +
- color_store_size));
-
- if (vc4->cleared) {
- cl_u8(&vc4->rcl, VC4_PACKET_CLEAR_COLORS);
- cl_u32(&vc4->rcl, vc4->clear_color[0]);
- cl_u32(&vc4->rcl, vc4->clear_color[1]);
- cl_u32(&vc4->rcl, vc4->clear_depth);
- cl_u8(&vc4->rcl, vc4->clear_stencil);
- }
-
- /* The rendering mode config determines the pointer that's used for
- * VC4_PACKET_STORE_MS_TILE_BUFFER address computations. The kernel
- * could handle a no-relocation rendering mode config and deny those
- * packets, but instead we just tell the kernel we're doing our color
- * rendering to the Z buffer, and just don't emit any of those
- * packets.
- */
- struct vc4_surface *render_surf = csurf ? csurf : zsurf;
- struct vc4_resource *render_tex = vc4_resource(render_surf->base.texture);
- cl_start_reloc(&vc4->rcl, 1);
- cl_u8(&vc4->rcl, VC4_PACKET_TILE_RENDERING_MODE_CONFIG);
- cl_reloc(vc4, &vc4->rcl, render_tex->bo, render_surf->offset);
- cl_u16(&vc4->rcl, width);
- cl_u16(&vc4->rcl, height);
- cl_u16(&vc4->rcl, ((render_surf->tiling <<
- VC4_RENDER_CONFIG_MEMORY_FORMAT_SHIFT) |
- (vc4_rt_format_is_565(render_surf->base.format) ?
- VC4_RENDER_CONFIG_FORMAT_BGR565 :
- VC4_RENDER_CONFIG_FORMAT_RGBA8888)));
-
- /* The tile buffer normally gets cleared when the previous tile is
- * stored. If the clear values changed between frames, then the tile
- * buffer has stale clear values in it, so we have to do a store in
- * None mode (no writes) so that we trigger the tile buffer clear.
- *
- * Excess clearing is only a performance cost, since per-tile contents
- * will be loaded/stored in the loop below.
- */
- if (vc4->cleared & (PIPE_CLEAR_COLOR0 |
- PIPE_CLEAR_DEPTH |
- PIPE_CLEAR_STENCIL)) {
- cl_u8(&vc4->rcl, VC4_PACKET_TILE_COORDINATES);
- cl_u8(&vc4->rcl, 0);
- cl_u8(&vc4->rcl, 0);
-
- cl_u8(&vc4->rcl, VC4_PACKET_STORE_TILE_BUFFER_GENERAL);
- cl_u16(&vc4->rcl, VC4_LOADSTORE_TILE_BUFFER_NONE);
- cl_u32(&vc4->rcl, 0); /* no address, since we're in None mode */
- }
-
- uint32_t color_hindex = ctex ? vc4_gem_hindex(vc4, ctex->bo) : 0;
- uint32_t depth_hindex = ztex ? vc4_gem_hindex(vc4, ztex->bo) : 0;
- uint32_t tile_alloc_hindex = vc4_gem_hindex(vc4, vc4->tile_alloc);
-
- for (int y = min_y_tile; y <= max_y_tile; y++) {
- for (int x = min_x_tile; x <= max_x_tile; x++) {
- bool end_of_frame = (x == max_x_tile &&
- y == max_y_tile);
- bool coords_emitted = false;
-
- /* Note that the load doesn't actually occur until the
- * tile coords packet is processed, and only one load
- * may be outstanding at a time.
- */
- if (resolve_uncleared & PIPE_CLEAR_COLOR) {
- vc4_store_before_load(vc4, &coords_emitted);
-
- cl_start_reloc(&vc4->rcl, 1);
- cl_u8(&vc4->rcl, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL);
- cl_u8(&vc4->rcl,
- VC4_LOADSTORE_TILE_BUFFER_COLOR |
- (csurf->tiling <<
- VC4_LOADSTORE_TILE_BUFFER_FORMAT_SHIFT));
- cl_u8(&vc4->rcl,
- vc4_rt_format_is_565(csurf->base.format) ?
- VC4_LOADSTORE_TILE_BUFFER_BGR565 :
- VC4_LOADSTORE_TILE_BUFFER_RGBA8888);
- cl_reloc_hindex(&vc4->rcl, color_hindex,
- csurf->offset);
-
- vc4_tile_coordinates(vc4, x, y, &coords_emitted);
- }
-
- if (resolve_uncleared & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) {
- vc4_store_before_load(vc4, &coords_emitted);
-
- cl_start_reloc(&vc4->rcl, 1);
- cl_u8(&vc4->rcl, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL);
- cl_u8(&vc4->rcl,
- VC4_LOADSTORE_TILE_BUFFER_ZS |
- (zsurf->tiling <<
- VC4_LOADSTORE_TILE_BUFFER_FORMAT_SHIFT));
- cl_u8(&vc4->rcl, 0);
- cl_reloc_hindex(&vc4->rcl, depth_hindex,
- zsurf->offset);
-
- vc4_tile_coordinates(vc4, x, y, &coords_emitted);
- }
-
- /* Clipping depends on tile coordinates having been
- * emitted, so make sure it's happened even if
- * everything was cleared to start.
- */
- vc4_tile_coordinates(vc4, x, y, &coords_emitted);
-
- /* Wait for the binner before jumping to the first
- * tile's lists.
- */
- if (x == min_x_tile && y == min_y_tile)
- cl_u8(&vc4->rcl, VC4_PACKET_WAIT_ON_SEMAPHORE);
-
- cl_start_reloc(&vc4->rcl, 1);
- cl_u8(&vc4->rcl, VC4_PACKET_BRANCH_TO_SUB_LIST);
- cl_reloc_hindex(&vc4->rcl, tile_alloc_hindex,
- (y * stride_in_tiles + x) * 32);
-
- if (vc4->resolve & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) {
- vc4_tile_coordinates(vc4, x, y, &coords_emitted);
-
- cl_start_reloc(&vc4->rcl, 1);
- cl_u8(&vc4->rcl, VC4_PACKET_STORE_TILE_BUFFER_GENERAL);
- cl_u8(&vc4->rcl,
- VC4_LOADSTORE_TILE_BUFFER_ZS |
- (zsurf->tiling <<
- VC4_LOADSTORE_TILE_BUFFER_FORMAT_SHIFT));
- cl_u8(&vc4->rcl,
- VC4_STORE_TILE_BUFFER_DISABLE_COLOR_CLEAR);
- cl_reloc_hindex(&vc4->rcl, depth_hindex,
- zsurf->offset |
- ((end_of_frame &&
- !(vc4->resolve & PIPE_CLEAR_COLOR0)) ?
- VC4_LOADSTORE_TILE_BUFFER_EOF : 0));
-
- coords_emitted = false;
- }
-
- if (vc4->resolve & PIPE_CLEAR_COLOR0) {
- vc4_tile_coordinates(vc4, x, y, &coords_emitted);
- if (end_of_frame) {
- cl_u8(&vc4->rcl,
- VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF);
- } else {
- cl_u8(&vc4->rcl,
- VC4_PACKET_STORE_MS_TILE_BUFFER);
- }
-
- coords_emitted = false;
- }
-
- /* One of the bits needs to have been set that would
- * have triggered an EOF.
- */
- assert(vc4->resolve & (PIPE_CLEAR_COLOR0 |
- PIPE_CLEAR_DEPTH |
- PIPE_CLEAR_STENCIL));
- /* Any coords emitted must also have been consumed by
- * a store.
- */
- assert(!coords_emitted);
- }
- }
-
- if (vc4->resolve & PIPE_CLEAR_COLOR0)
- ctex->writes++;
-
- if (vc4->resolve & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL))
- ztex->writes++;
-}
-
void
vc4_flush(struct pipe_context *pctx)
{
struct vc4_context *vc4 = vc4_context(pctx);
+ struct pipe_surface *cbuf = vc4->framebuffer.cbufs[0];
+ struct pipe_surface *zsbuf = vc4->framebuffer.zsbuf;
if (!vc4->needs_flush)
return;
@@ -322,7 +65,31 @@ vc4_flush(struct pipe_context *pctx)
/* The FLUSH caps all of our bin lists with a VC4_PACKET_RETURN. */
cl_u8(&vc4->bcl, VC4_PACKET_FLUSH);
- vc4_setup_rcl(vc4);
+ if (cbuf && (vc4->resolve & PIPE_CLEAR_COLOR0)) {
+ pipe_surface_reference(&vc4->color_write, cbuf);
+ if (!(vc4->cleared & PIPE_CLEAR_COLOR0)) {
+ pipe_surface_reference(&vc4->color_read, cbuf);
+ } else {
+ pipe_surface_reference(&vc4->color_read, NULL);
+ }
+
+ } else {
+ pipe_surface_reference(&vc4->color_write, NULL);
+ pipe_surface_reference(&vc4->color_read, NULL);
+ }
+
+ if (vc4->framebuffer.zsbuf &&
+ (vc4->resolve & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL))) {
+ pipe_surface_reference(&vc4->zs_write, zsbuf);
+ if (!(vc4->cleared & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL))) {
+ pipe_surface_reference(&vc4->zs_read, zsbuf);
+ } else {
+ pipe_surface_reference(&vc4->zs_read, NULL);
+ }
+ } else {
+ pipe_surface_reference(&vc4->zs_write, NULL);
+ pipe_surface_reference(&vc4->zs_read, NULL);
+ }
vc4_job_submit(vc4);
}
@@ -410,12 +177,13 @@ vc4_context_destroy(struct pipe_context *pctx)
if (vc4->primconvert)
util_primconvert_destroy(vc4->primconvert);
+ if (vc4->uploader)
+ u_upload_destroy(vc4->uploader);
+
util_slab_destroy(&vc4->transfer_pool);
pipe_surface_reference(&vc4->framebuffer.cbufs[0], NULL);
pipe_surface_reference(&vc4->framebuffer.zsbuf, NULL);
- vc4_bo_unreference(&vc4->tile_alloc);
- vc4_bo_unreference(&vc4->tile_state);
vc4_program_fini(pctx);
@@ -466,6 +234,9 @@ vc4_context_create(struct pipe_screen *pscreen, void *priv)
if (!vc4->primconvert)
goto fail;
+ vc4->uploader = u_upload_create(pctx, 16 * 1024, 4,
+ PIPE_BIND_INDEX_BUFFER);
+
vc4_debug |= saved_shaderdb_flag;
return &vc4->base;
diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h
index d89f1974e12..d5d6be16f6e 100644
--- a/src/gallium/drivers/vc4/vc4_context.h
+++ b/src/gallium/drivers/vc4/vc4_context.h
@@ -178,12 +178,18 @@ struct vc4_context {
struct vc4_screen *screen;
struct vc4_cl bcl;
- struct vc4_cl rcl;
struct vc4_cl shader_rec;
struct vc4_cl uniforms;
struct vc4_cl bo_handles;
struct vc4_cl bo_pointers;
uint32_t shader_rec_count;
+
+ /** @{ Surfaces to submit rendering for. */
+ struct pipe_surface *color_read;
+ struct pipe_surface *color_write;
+ struct pipe_surface *zs_read;
+ struct pipe_surface *zs_write;
+ /** @} */
/** @{
* Bounding box of the scissor across all queued drawing.
*
@@ -194,9 +200,13 @@ struct vc4_context {
uint32_t draw_max_x;
uint32_t draw_max_y;
/** @} */
-
- struct vc4_bo *tile_alloc;
- struct vc4_bo *tile_state;
+ /** @{
+ * Width/height of the color framebuffer being rendered to,
+ * for VC4_TILE_RENDERING_MODE_CONFIG.
+ */
+ uint32_t draw_width;
+ uint32_t draw_height;
+ /** @} */
struct util_slab_mempool transfer_pool;
struct blitter_context *blitter;
@@ -243,6 +253,8 @@ struct vc4_context {
/** Seqno of the last CL flush's job. */
uint64_t last_emit_seqno;
+ struct u_upload_mgr *uploader;
+
/** @{ Current pipeline state objects */
struct pipe_scissor_state scissor;
struct pipe_blend_state *blend;
diff --git a/src/gallium/drivers/vc4/vc4_draw.c b/src/gallium/drivers/vc4/vc4_draw.c
index 16418bf12da..5e6d70d6f33 100644
--- a/src/gallium/drivers/vc4/vc4_draw.c
+++ b/src/gallium/drivers/vc4/vc4_draw.c
@@ -72,44 +72,15 @@ vc4_start_draw(struct vc4_context *vc4)
uint32_t tilew = align(width, 64) / 64;
uint32_t tileh = align(height, 64) / 64;
- /* Tile alloc memory setup: We use an initial alloc size of 32b. The
- * hardware then aligns that to 256b (we use 4096, because all of our
- * BO allocations align to that anyway), then for some reason the
- * simulator wants an extra page available, even if you have overflow
- * memory set up.
- *
- * XXX: The binner only does 28-bit addressing math, so the tile alloc
- * and tile state should be in the same BO and that BO needs to not
- * cross a 256MB boundary, somehow.
- */
- uint32_t tile_alloc_size = 32 * tilew * tileh;
- tile_alloc_size = align(tile_alloc_size, 4096);
- tile_alloc_size += 4096;
- uint32_t tile_state_size = 48 * tilew * tileh;
- if (!vc4->tile_alloc || vc4->tile_alloc->size < tile_alloc_size) {
- vc4_bo_unreference(&vc4->tile_alloc);
- vc4->tile_alloc = vc4_bo_alloc(vc4->screen, tile_alloc_size,
- "tile_alloc");
- }
- if (!vc4->tile_state || vc4->tile_state->size < tile_state_size) {
- vc4_bo_unreference(&vc4->tile_state);
- vc4->tile_state = vc4_bo_alloc(vc4->screen, tile_state_size,
- "tile_state");
- }
-
// Tile state data is 48 bytes per tile, I think it can be thrown away
// as soon as binning is finished.
- cl_start_reloc(&vc4->bcl, 2);
cl_u8(&vc4->bcl, VC4_PACKET_TILE_BINNING_MODE_CONFIG);
- cl_reloc(vc4, &vc4->bcl, vc4->tile_alloc, 0);
- cl_u32(&vc4->bcl, vc4->tile_alloc->size);
- cl_reloc(vc4, &vc4->bcl, vc4->tile_state, 0);
+ cl_u32(&vc4->bcl, 0); /* tile alloc addr, filled by kernel */
+ cl_u32(&vc4->bcl, 0); /* tile alloc size, filled by kernel */
+ cl_u32(&vc4->bcl, 0); /* tile state addr, filled by kernel */
cl_u8(&vc4->bcl, tilew);
cl_u8(&vc4->bcl, tileh);
- cl_u8(&vc4->bcl,
- VC4_BIN_CONFIG_AUTO_INIT_TSDA |
- VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_32 |
- VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_32);
+ cl_u8(&vc4->bcl, 0); /* flags, filled by kernel. */
/* START_TILE_BINNING resets the statechange counters in the hardware,
* which are what is used when a primitive is binned to a tile to
@@ -129,6 +100,8 @@ vc4_start_draw(struct vc4_context *vc4)
vc4->needs_flush = true;
vc4->draw_call_queued = true;
+ vc4->draw_width = width;
+ vc4->draw_height = height;
}
static void
@@ -266,13 +239,17 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
* definitions, up to but not including QUADS.
*/
if (info->indexed) {
- struct vc4_resource *rsc = vc4_resource(vc4->indexbuf.buffer);
uint32_t offset = vc4->indexbuf.offset;
uint32_t index_size = vc4->indexbuf.index_size;
- if (rsc->shadow_parent) {
- vc4_update_shadow_index_buffer(pctx, &vc4->indexbuf);
- offset = 0;
+ struct pipe_resource *prsc;
+ if (vc4->indexbuf.index_size == 4) {
+ prsc = vc4_get_shadow_index_buffer(pctx, &vc4->indexbuf,
+ info->count, &offset);
+ index_size = 2;
+ } else {
+ prsc = vc4->indexbuf.buffer;
}
+ struct vc4_resource *rsc = vc4_resource(prsc);
cl_start_reloc(&vc4->bcl, 1);
cl_u8(&vc4->bcl, VC4_PACKET_GL_INDEXED_PRIMITIVE);
@@ -284,6 +261,9 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
cl_u32(&vc4->bcl, info->count);
cl_reloc(vc4, &vc4->bcl, rsc->bo, offset);
cl_u32(&vc4->bcl, max_index);
+
+ if (vc4->indexbuf.index_size == 4)
+ pipe_resource_reference(&prsc, NULL);
} else {
cl_u8(&vc4->bcl, VC4_PACKET_GL_ARRAY_PRIMITIVE);
cl_u8(&vc4->bcl, info->mode);
diff --git a/src/gallium/drivers/vc4/vc4_drm.h b/src/gallium/drivers/vc4/vc4_drm.h
index 062fd3b687e..5f1ee4fa125 100644
--- a/src/gallium/drivers/vc4/vc4_drm.h
+++ b/src/gallium/drivers/vc4/vc4_drm.h
@@ -38,6 +38,15 @@
#define DRM_IOCTL_VC4_CREATE_BO DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_CREATE_BO, struct drm_vc4_create_bo)
#define DRM_IOCTL_VC4_MMAP_BO DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_MMAP_BO, struct drm_vc4_mmap_bo)
+struct drm_vc4_submit_rcl_surface {
+ uint32_t hindex; /* Handle index, or ~0 if not present. */
+ uint32_t offset; /* Offset to start of buffer. */
+ /*
+ * Bits for either render config (color_ms_write) or load/store packet.
+ */
+ uint16_t bits;
+ uint16_t pad;
+};
/**
* struct drm_vc4_submit_cl - ioctl argument for submitting commands to the 3D
@@ -62,16 +71,6 @@ struct drm_vc4_submit_cl {
*/
uint64_t bin_cl;
- /* Pointer to the render command list.
- *
- * The render command list contains a set of packets to load the
- * current tile's state (reading from memory, or just clearing it)
- * into the GPU, then call into the tile allocation BO to run the
- * stored rendering for that tile, then store the tile's state back to
- * memory.
- */
- uint64_t render_cl;
-
/* Pointer to the shader records.
*
* Shader records are the structures read by the hardware that contain
@@ -102,8 +101,6 @@ struct drm_vc4_submit_cl {
/* Size in bytes of the binner command list. */
uint32_t bin_cl_size;
- /* Size in bytes of the render command list */
- uint32_t render_cl_size;
/* Size in bytes of the set of shader records. */
uint32_t shader_rec_size;
/* Number of shader records.
@@ -119,8 +116,25 @@ struct drm_vc4_submit_cl {
/* Number of BO handles passed in (size is that times 4). */
uint32_t bo_handle_count;
+ /* RCL setup: */
+ uint16_t width;
+ uint16_t height;
+ uint8_t min_x_tile;
+ uint8_t min_y_tile;
+ uint8_t max_x_tile;
+ uint8_t max_y_tile;
+ struct drm_vc4_submit_rcl_surface color_read;
+ struct drm_vc4_submit_rcl_surface color_ms_write;
+ struct drm_vc4_submit_rcl_surface zs_read;
+ struct drm_vc4_submit_rcl_surface zs_write;
+ uint32_t clear_color[2];
+ uint32_t clear_z;
+ uint8_t clear_s;
+
+ uint32_t pad:24;
+
+#define VC4_SUBMIT_CL_USE_CLEAR_COLOR (1 << 0)
uint32_t flags;
- uint32_t pad;
/* Returned value of the seqno of this render job (for the
* wait ioctl).
diff --git a/src/gallium/drivers/vc4/vc4_job.c b/src/gallium/drivers/vc4/vc4_job.c
index 76037162102..dcade15443a 100644
--- a/src/gallium/drivers/vc4/vc4_job.c
+++ b/src/gallium/drivers/vc4/vc4_job.c
@@ -33,7 +33,6 @@ void
vc4_job_init(struct vc4_context *vc4)
{
vc4_init_cl(vc4, &vc4->bcl);
- vc4_init_cl(vc4, &vc4->rcl);
vc4_init_cl(vc4, &vc4->shader_rec);
vc4_init_cl(vc4, &vc4->uniforms);
vc4_init_cl(vc4, &vc4->bo_handles);
@@ -50,7 +49,6 @@ vc4_job_reset(struct vc4_context *vc4)
vc4_bo_unreference(&referenced_bos[i]);
}
vc4_reset_cl(&vc4->bcl);
- vc4_reset_cl(&vc4->rcl);
vc4_reset_cl(&vc4->shader_rec);
vc4_reset_cl(&vc4->uniforms);
vc4_reset_cl(&vc4->bo_handles);
@@ -75,6 +73,70 @@ vc4_job_reset(struct vc4_context *vc4)
vc4->draw_max_y = 0;
}
+static void
+vc4_submit_setup_rcl_surface(struct vc4_context *vc4,
+ struct drm_vc4_submit_rcl_surface *submit_surf,
+ struct pipe_surface *psurf,
+ bool is_depth, bool is_write)
+{
+ struct vc4_surface *surf = vc4_surface(psurf);
+
+ if (!surf) {
+ submit_surf->hindex = ~0;
+ return;
+ }
+
+ struct vc4_resource *rsc = vc4_resource(psurf->texture);
+ submit_surf->hindex = vc4_gem_hindex(vc4, rsc->bo);
+ submit_surf->offset = surf->offset;
+
+ if (is_depth) {
+ submit_surf->bits =
+ VC4_SET_FIELD(VC4_LOADSTORE_TILE_BUFFER_ZS,
+ VC4_LOADSTORE_TILE_BUFFER_BUFFER);
+
+ } else {
+ submit_surf->bits =
+ VC4_SET_FIELD(VC4_LOADSTORE_TILE_BUFFER_COLOR,
+ VC4_LOADSTORE_TILE_BUFFER_BUFFER) |
+ VC4_SET_FIELD(vc4_rt_format_is_565(psurf->format) ?
+ VC4_LOADSTORE_TILE_BUFFER_BGR565 :
+ VC4_LOADSTORE_TILE_BUFFER_RGBA8888,
+ VC4_LOADSTORE_TILE_BUFFER_FORMAT);
+ }
+ submit_surf->bits |=
+ VC4_SET_FIELD(surf->tiling, VC4_LOADSTORE_TILE_BUFFER_TILING);
+
+ if (is_write)
+ rsc->writes++;
+}
+
+static void
+vc4_submit_setup_ms_rcl_surface(struct vc4_context *vc4,
+ struct drm_vc4_submit_rcl_surface *submit_surf,
+ struct pipe_surface *psurf)
+{
+ struct vc4_surface *surf = vc4_surface(psurf);
+
+ if (!surf) {
+ submit_surf->hindex = ~0;
+ return;
+ }
+
+ struct vc4_resource *rsc = vc4_resource(psurf->texture);
+ submit_surf->hindex = vc4_gem_hindex(vc4, rsc->bo);
+ submit_surf->offset = surf->offset;
+
+ submit_surf->bits =
+ VC4_SET_FIELD(vc4_rt_format_is_565(surf->base.format) ?
+ VC4_RENDER_CONFIG_FORMAT_BGR565 :
+ VC4_RENDER_CONFIG_FORMAT_RGBA8888,
+ VC4_RENDER_CONFIG_FORMAT) |
+ VC4_SET_FIELD(surf->tiling, VC4_RENDER_CONFIG_MEMORY_FORMAT);
+
+ rsc->writes++;
+}
+
/**
* Submits the job to the kernel and then reinitializes it.
*/
@@ -84,26 +146,49 @@ vc4_job_submit(struct vc4_context *vc4)
if (vc4_debug & VC4_DEBUG_CL) {
fprintf(stderr, "BCL:\n");
vc4_dump_cl(vc4->bcl.base, vc4->bcl.next - vc4->bcl.base, false);
- fprintf(stderr, "RCL:\n");
- vc4_dump_cl(vc4->rcl.base, vc4->rcl.next - vc4->rcl.base, true);
}
struct drm_vc4_submit_cl submit;
memset(&submit, 0, sizeof(submit));
+ cl_ensure_space(&vc4->bo_handles, 4 * sizeof(uint32_t));
+ cl_ensure_space(&vc4->bo_pointers, 4 * sizeof(struct vc4_bo *));
+
+ vc4_submit_setup_rcl_surface(vc4, &submit.color_read,
+ vc4->color_read, false, false);
+ vc4_submit_setup_ms_rcl_surface(vc4, &submit.color_ms_write,
+ vc4->color_write);
+ vc4_submit_setup_rcl_surface(vc4, &submit.zs_read,
+ vc4->zs_read, true, false);
+ vc4_submit_setup_rcl_surface(vc4, &submit.zs_write,
+ vc4->zs_write, true, true);
+
submit.bo_handles = (uintptr_t)vc4->bo_handles.base;
submit.bo_handle_count = (vc4->bo_handles.next -
vc4->bo_handles.base) / 4;
submit.bin_cl = (uintptr_t)vc4->bcl.base;
submit.bin_cl_size = vc4->bcl.next - vc4->bcl.base;
- submit.render_cl = (uintptr_t)vc4->rcl.base;
- submit.render_cl_size = vc4->rcl.next - vc4->rcl.base;
submit.shader_rec = (uintptr_t)vc4->shader_rec.base;
submit.shader_rec_size = vc4->shader_rec.next - vc4->shader_rec.base;
submit.shader_rec_count = vc4->shader_rec_count;
submit.uniforms = (uintptr_t)vc4->uniforms.base;
submit.uniforms_size = vc4->uniforms.next - vc4->uniforms.base;
+ assert(vc4->draw_min_x != ~0 && vc4->draw_min_y != ~0);
+ submit.min_x_tile = vc4->draw_min_x / 64;
+ submit.min_y_tile = vc4->draw_min_y / 64;
+ submit.max_x_tile = (vc4->draw_max_x - 1) / 64;
+ submit.max_y_tile = (vc4->draw_max_y - 1) / 64;
+ submit.width = vc4->draw_width;
+ submit.height = vc4->draw_height;
+ if (vc4->cleared) {
+ submit.flags |= VC4_SUBMIT_CL_USE_CLEAR_COLOR;
+ submit.clear_color[0] = vc4->clear_color[0];
+ submit.clear_color[1] = vc4->clear_color[1];
+ submit.clear_z = vc4->clear_depth;
+ submit.clear_s = vc4->clear_stencil;
+ }
+
if (!(vc4_debug & VC4_DEBUG_NORAST)) {
int ret;
diff --git a/src/gallium/drivers/vc4/vc4_opt_algebraic.c b/src/gallium/drivers/vc4/vc4_opt_algebraic.c
index e40e0f3b71b..7978ea1829f 100644
--- a/src/gallium/drivers/vc4/vc4_opt_algebraic.c
+++ b/src/gallium/drivers/vc4/vc4_opt_algebraic.c
@@ -136,11 +136,8 @@ bool
qir_opt_algebraic(struct vc4_compile *c)
{
bool progress = false;
- struct simple_node *node;
-
- foreach(node, &c->instructions) {
- struct qinst *inst = (struct qinst *)node;
+ list_for_each_entry(struct qinst, inst, &c->instructions, link) {
switch (inst->op) {
case QOP_SEL_X_Y_ZS:
case QOP_SEL_X_Y_ZC:
diff --git a/src/gallium/drivers/vc4/vc4_opt_constant_folding.c b/src/gallium/drivers/vc4/vc4_opt_constant_folding.c
index ac9be5c9642..15ec9f07260 100644
--- a/src/gallium/drivers/vc4/vc4_opt_constant_folding.c
+++ b/src/gallium/drivers/vc4/vc4_opt_constant_folding.c
@@ -98,10 +98,8 @@ bool
qir_opt_constant_folding(struct vc4_compile *c)
{
bool progress = false;
- struct simple_node *node;
- foreach(node, &c->instructions) {
- struct qinst *inst = (struct qinst *)node;
+ list_for_each_entry(struct qinst, inst, &c->instructions, link) {
if (constant_fold(c, inst))
progress = true;
}
diff --git a/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c b/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c
index 5189a401248..d6d2fbf257f 100644
--- a/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c
+++ b/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c
@@ -38,13 +38,10 @@ bool
qir_opt_copy_propagation(struct vc4_compile *c)
{
bool progress = false;
- struct simple_node *node;
bool debug = false;
struct qreg *movs = calloc(c->num_temps, sizeof(struct qreg));
- foreach(node, &c->instructions) {
- struct qinst *inst = (struct qinst *)node;
-
+ list_for_each_entry(struct qinst, inst, &c->instructions, link) {
for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
int index = inst->src[i].index;
if (inst->src[i].file == QFILE_TEMP &&
diff --git a/src/gallium/drivers/vc4/vc4_opt_cse.c b/src/gallium/drivers/vc4/vc4_opt_cse.c
index 71794f7d1cf..92c8260eb59 100644
--- a/src/gallium/drivers/vc4/vc4_opt_cse.c
+++ b/src/gallium/drivers/vc4/vc4_opt_cse.c
@@ -121,7 +121,6 @@ bool
qir_opt_cse(struct vc4_compile *c)
{
bool progress = false;
- struct simple_node *node, *t;
uint32_t sf_count = 0, r4_count = 0;
struct hash_table *ht = _mesa_hash_table_create(NULL, NULL,
@@ -129,9 +128,7 @@ qir_opt_cse(struct vc4_compile *c)
if (!ht)
return false;
- foreach_s(node, t, &c->instructions) {
- struct qinst *inst = (struct qinst *)node;
-
+ list_for_each_entry(struct qinst, inst, &c->instructions, link) {
if (qir_has_side_effects(c, inst) ||
qir_has_side_effect_reads(c, inst)) {
continue;
diff --git a/src/gallium/drivers/vc4/vc4_opt_dead_code.c b/src/gallium/drivers/vc4/vc4_opt_dead_code.c
index e4ead46c9c2..ffd42422de8 100644
--- a/src/gallium/drivers/vc4/vc4_opt_dead_code.c
+++ b/src/gallium/drivers/vc4/vc4_opt_dead_code.c
@@ -86,7 +86,7 @@ qir_opt_dead_code(struct vc4_compile *c)
/* Whether we're eliminating texture setup currently. */
bool dce_tex = false;
- struct simple_node *node, *t;
+ struct list_head *node, *t;
for (node = c->instructions.prev, t = node->prev;
&c->instructions != node;
node = t, t = t->prev) {
diff --git a/src/gallium/drivers/vc4/vc4_opt_small_immediates.c b/src/gallium/drivers/vc4/vc4_opt_small_immediates.c
index a329ac69d11..d6e98f0aebf 100644
--- a/src/gallium/drivers/vc4/vc4_opt_small_immediates.c
+++ b/src/gallium/drivers/vc4/vc4_opt_small_immediates.c
@@ -37,11 +37,8 @@ bool
qir_opt_small_immediates(struct vc4_compile *c)
{
bool progress = false;
- struct simple_node *node;
-
- foreach(node, &c->instructions) {
- struct qinst *inst = (struct qinst *)node;
+ list_for_each_entry(struct qinst, inst, &c->instructions, link) {
/* The small immediate value sits in the raddr B field, so we
* can't have 2 small immediates in one instruction (unless
* they're the same value, but that should be optimized away
diff --git a/src/gallium/drivers/vc4/vc4_opt_vpm_writes.c b/src/gallium/drivers/vc4/vc4_opt_vpm_writes.c
index e9711f222cd..e04f02859d5 100644
--- a/src/gallium/drivers/vc4/vc4_opt_vpm_writes.c
+++ b/src/gallium/drivers/vc4/vc4_opt_vpm_writes.c
@@ -37,15 +37,12 @@ qir_opt_vpm_writes(struct vc4_compile *c)
return false;
bool progress = false;
- struct simple_node *node;
struct qinst *vpm_writes[64] = { 0 };
uint32_t use_count[c->num_temps];
uint32_t vpm_write_count = 0;
memset(&use_count, 0, sizeof(use_count));
- foreach(node, &c->instructions) {
- struct qinst *inst = (struct qinst *)node;
-
+ list_for_each_entry(struct qinst, inst, &c->instructions, link) {
switch (inst->dst.file) {
case QFILE_VPM:
vpm_writes[vpm_write_count++] = inst;
@@ -102,7 +99,8 @@ qir_opt_vpm_writes(struct vc4_compile *c)
* to maintain the order of the VPM writes.
*/
assert(!vpm_writes[i]->sf);
- move_to_tail(&vpm_writes[i]->link, &inst->link);
+ list_del(&inst->link);
+ list_addtail(&inst->link, &vpm_writes[i]->link);
qir_remove_instruction(c, vpm_writes[i]);
c->defs[inst->dst.index] = NULL;
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index bf156f9b42d..ba47c51d9bd 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -147,6 +147,9 @@ indirect_uniform_load(struct vc4_compile *c,
indirect_offset = qir_ADD(c, indirect_offset,
qir_uniform_ui(c, (range->dst_offset +
offset)));
+
+ /* Clamp to [0, array size). Note that MIN/MAX are signed. */
+ indirect_offset = qir_MAX(c, indirect_offset, qir_uniform_ui(c, 0));
indirect_offset = qir_MIN(c, indirect_offset,
qir_uniform_ui(c, (range->dst_offset +
range->size - 4)));
@@ -322,7 +325,9 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
switch (instr->src[i].src_type) {
case nir_tex_src_coord:
s = ntq_get_src(c, instr->src[i].src, 0);
- if (instr->sampler_dim != GLSL_SAMPLER_DIM_1D)
+ if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D)
+ t = qir_uniform_f(c, 0.5);
+ else
t = ntq_get_src(c, instr->src[i].src, 1);
if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
r = ntq_get_src(c, instr->src[i].src, 2);
@@ -1849,8 +1854,6 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
switch (instr->intrinsic) {
case nir_intrinsic_load_uniform:
- assert(instr->const_index[1] == 1);
-
for (int i = 0; i < instr->num_components; i++) {
dest[i] = qir_uniform(c, QUNIFORM_UNIFORM,
instr->const_index[0] * 4 + i);
@@ -1858,8 +1861,6 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
break;
case nir_intrinsic_load_uniform_indirect:
- assert(instr->const_index[1] == 1);
-
for (int i = 0; i < instr->num_components; i++) {
dest[i] = indirect_uniform_load(c,
ntq_get_src(c, instr->src[0], 0),
@@ -1870,8 +1871,6 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
break;
case nir_intrinsic_load_input:
- assert(instr->const_index[1] == 1);
-
for (int i = 0; i < instr->num_components; i++)
dest[i] = c->inputs[instr->const_index[0] * 4 + i];
@@ -2215,11 +2214,9 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage,
shader->program_id = vc4->next_compiled_program_id++;
if (stage == QSTAGE_FRAG) {
bool input_live[c->num_input_semantics];
- struct simple_node *node;
memset(input_live, 0, sizeof(input_live));
- foreach(node, &c->instructions) {
- struct qinst *inst = (struct qinst *)node;
+ list_for_each_entry(struct qinst, inst, &c->instructions, link) {
for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
if (inst->src[i].file == QFILE_VARY)
input_live[inst->src[i].index] = true;
diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c
index e2e6a5cdf16..1c96ef4795f 100644
--- a/src/gallium/drivers/vc4/vc4_qir.c
+++ b/src/gallium/drivers/vc4/vc4_qir.c
@@ -22,7 +22,6 @@
*/
#include "util/u_memory.h"
-#include "util/simple_list.h"
#include "util/ralloc.h"
#include "vc4_qir.h"
@@ -301,10 +300,7 @@ qir_dump_inst(struct vc4_compile *c, struct qinst *inst)
void
qir_dump(struct vc4_compile *c)
{
- struct simple_node *node;
-
- foreach(node, &c->instructions) {
- struct qinst *inst = (struct qinst *)node;
+ list_for_each_entry(struct qinst, inst, &c->instructions, link) {
qir_dump_inst(c, inst);
fprintf(stderr, "\n");
}
@@ -370,7 +366,7 @@ qir_emit(struct vc4_compile *c, struct qinst *inst)
if (inst->dst.file == QFILE_TEMP)
c->defs[inst->dst.index] = inst;
- insert_at_tail(&c->instructions, &inst->link);
+ list_addtail(&inst->link, &c->instructions);
}
bool
@@ -384,7 +380,7 @@ qir_compile_init(void)
{
struct vc4_compile *c = rzalloc(NULL, struct vc4_compile);
- make_empty_list(&c->instructions);
+ list_inithead(&c->instructions);
c->output_position_index = -1;
c->output_clipvertex_index = -1;
@@ -403,7 +399,7 @@ qir_remove_instruction(struct vc4_compile *c, struct qinst *qinst)
if (qinst->dst.file == QFILE_TEMP)
c->defs[qinst->dst.index] = NULL;
- remove_from_list(&qinst->link);
+ list_del(&qinst->link);
free(qinst->src);
free(qinst);
}
@@ -420,9 +416,9 @@ qir_follow_movs(struct vc4_compile *c, struct qreg reg)
void
qir_compile_destroy(struct vc4_compile *c)
{
- while (!is_empty_list(&c->instructions)) {
+ while (!list_empty(&c->instructions)) {
struct qinst *qinst =
- (struct qinst *)first_elem(&c->instructions);
+ (struct qinst *)c->instructions.next;
qir_remove_instruction(c, qinst);
}
@@ -478,7 +474,7 @@ void
qir_SF(struct vc4_compile *c, struct qreg src)
{
struct qinst *last_inst = NULL;
- if (!is_empty_list(&c->instructions))
+ if (!list_empty(&c->instructions))
last_inst = (struct qinst *)c->instructions.prev;
if (!last_inst ||
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index adc2c89d2c1..732cfd0b306 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -33,7 +33,7 @@
#include "util/macros.h"
#include "glsl/nir/nir.h"
-#include "util/simple_list.h"
+#include "util/list.h"
#include "util/u_math.h"
enum qfile {
@@ -162,12 +162,12 @@ enum qop {
};
struct queued_qpu_inst {
- struct simple_node link;
+ struct list_head link;
uint64_t inst;
};
struct qinst {
- struct simple_node link;
+ struct list_head link;
enum qop op;
struct qreg dst;
@@ -356,10 +356,10 @@ struct vc4_compile {
struct qreg undef;
enum qstage stage;
uint32_t num_temps;
- struct simple_node instructions;
+ struct list_head instructions;
uint32_t immediates[1024];
- struct simple_node qpu_inst_list;
+ struct list_head qpu_inst_list;
uint64_t *qpu_insts;
uint32_t qpu_inst_count;
uint32_t qpu_inst_size;
diff --git a/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c b/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c
index 63f5eb22858..910c89dca79 100644
--- a/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c
+++ b/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c
@@ -88,7 +88,6 @@ is_lowerable_uniform(struct qinst *inst, int i)
void
qir_lower_uniforms(struct vc4_compile *c)
{
- struct simple_node *node;
struct hash_table *ht =
_mesa_hash_table_create(c, index_hash, index_compare);
@@ -96,8 +95,7 @@ qir_lower_uniforms(struct vc4_compile *c)
* than one uniform referenced, and add those uniform values to the
* ht.
*/
- foreach(node, &c->instructions) {
- struct qinst *inst = (struct qinst *)node;
+ list_for_each_entry(struct qinst, inst, &c->instructions, link) {
uint32_t nsrc = qir_get_op_nsrc(inst->op);
uint32_t count = 0;
@@ -137,10 +135,9 @@ qir_lower_uniforms(struct vc4_compile *c)
struct qreg temp = qir_get_temp(c);
struct qreg unif = { QFILE_UNIF, max_index };
struct qinst *mov = qir_inst(QOP_MOV, temp, unif, c->undef);
- insert_at_head(&c->instructions, &mov->link);
+ list_add(&mov->link, &c->instructions);
c->defs[temp.index] = mov;
- foreach(node, &c->instructions) {
- struct qinst *inst = (struct qinst *)node;
+ list_for_each_entry(struct qinst, inst, &c->instructions, link) {
uint32_t nsrc = qir_get_op_nsrc(inst->op);
uint32_t count = 0;
diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c
index eeb8d3a21ff..99afe4b8798 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_emit.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c
@@ -47,14 +47,14 @@ queue(struct vc4_compile *c, uint64_t inst)
{
struct queued_qpu_inst *q = rzalloc(c, struct queued_qpu_inst);
q->inst = inst;
- insert_at_tail(&c->qpu_inst_list, &q->link);
+ list_addtail(&q->link, &c->qpu_inst_list);
}
static uint64_t *
last_inst(struct vc4_compile *c)
{
struct queued_qpu_inst *q =
- (struct queued_qpu_inst *)last_elem(&c->qpu_inst_list);
+ (struct queued_qpu_inst *)c->qpu_inst_list.prev;
return &q->inst;
}
@@ -117,11 +117,11 @@ fixup_raddr_conflict(struct vc4_compile *c,
return;
if (mux0 == QPU_MUX_A) {
- queue(c, qpu_a_MOV(qpu_rb(31), *src1));
- *src1 = qpu_rb(31);
+ queue(c, qpu_a_MOV(qpu_rb(31), *src0));
+ *src0 = qpu_rb(31);
} else {
- queue(c, qpu_a_MOV(qpu_ra(31), *src1));
- *src1 = qpu_ra(31);
+ queue(c, qpu_a_MOV(qpu_ra(31), *src0));
+ *src0 = qpu_ra(31);
}
}
@@ -144,7 +144,7 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
QPU_UNPACK_16B_TO_F32,
};
- make_empty_list(&c->qpu_inst_list);
+ list_inithead(&c->qpu_inst_list);
switch (c->stage) {
case QSTAGE_VERT:
@@ -170,10 +170,7 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
break;
}
- struct simple_node *node;
- foreach(node, &c->instructions) {
- struct qinst *qinst = (struct qinst *)node;
-
+ list_for_each_entry(struct qinst, qinst, &c->instructions, link) {
#if 0
fprintf(stderr, "translating qinst to qpu: ");
qir_dump_inst(qinst);
diff --git a/src/gallium/drivers/vc4/vc4_qpu_schedule.c b/src/gallium/drivers/vc4/vc4_qpu_schedule.c
index f523b4c6fb0..19cbf7bb98c 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_schedule.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_schedule.c
@@ -43,7 +43,7 @@ static bool debug;
struct schedule_node_child;
struct schedule_node {
- struct simple_node link;
+ struct list_head link;
struct queued_qpu_inst *inst;
struct schedule_node_child *children;
uint32_t child_count;
@@ -400,22 +400,21 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
}
static void
-calculate_forward_deps(struct vc4_compile *c, struct simple_node *schedule_list)
+calculate_forward_deps(struct vc4_compile *c, struct list_head *schedule_list)
{
- struct simple_node *node;
struct schedule_state state;
memset(&state, 0, sizeof(state));
state.dir = F;
- foreach(node, schedule_list)
- calculate_deps(&state, (struct schedule_node *)node);
+ list_for_each_entry(struct schedule_node, node, schedule_list, link)
+ calculate_deps(&state, node);
}
static void
-calculate_reverse_deps(struct vc4_compile *c, struct simple_node *schedule_list)
+calculate_reverse_deps(struct vc4_compile *c, struct list_head *schedule_list)
{
- struct simple_node *node;
+ struct list_head *node;
struct schedule_state state;
memset(&state, 0, sizeof(state));
@@ -507,15 +506,13 @@ get_instruction_priority(uint64_t inst)
static struct schedule_node *
choose_instruction_to_schedule(struct choose_scoreboard *scoreboard,
- struct simple_node *schedule_list,
+ struct list_head *schedule_list,
struct schedule_node *prev_inst)
{
struct schedule_node *chosen = NULL;
- struct simple_node *node;
int chosen_prio = 0;
- foreach(node, schedule_list) {
- struct schedule_node *n = (struct schedule_node *)node;
+ list_for_each_entry(struct schedule_node, n, schedule_list, link) {
uint64_t inst = n->inst->inst;
/* "An instruction must not read from a location in physical
@@ -596,14 +593,11 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
}
static void
-dump_state(struct simple_node *schedule_list)
+dump_state(struct list_head *schedule_list)
{
- struct simple_node *node;
-
uint32_t i = 0;
- foreach(node, schedule_list) {
- struct schedule_node *n = (struct schedule_node *)node;
+ list_for_each_entry(struct schedule_node, n, schedule_list, link) {
fprintf(stderr, "%3d: ", i++);
vc4_qpu_disasm(&n->inst->inst, 1);
fprintf(stderr, "\n");
@@ -639,7 +633,7 @@ compute_delay(struct schedule_node *n)
}
static void
-mark_instruction_scheduled(struct simple_node *schedule_list,
+mark_instruction_scheduled(struct list_head *schedule_list,
struct schedule_node *node,
bool war_only)
{
@@ -658,16 +652,15 @@ mark_instruction_scheduled(struct simple_node *schedule_list,
child->parent_count--;
if (child->parent_count == 0)
- insert_at_head(schedule_list, &child->link);
+ list_add(&child->link, schedule_list);
node->children[i].node = NULL;
}
}
static void
-schedule_instructions(struct vc4_compile *c, struct simple_node *schedule_list)
+schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list)
{
- struct simple_node *node, *t;
struct choose_scoreboard scoreboard;
/* We reorder the uniforms as we schedule instructions, so save the
@@ -693,14 +686,12 @@ schedule_instructions(struct vc4_compile *c, struct simple_node *schedule_list)
}
/* Remove non-DAG heads from the list. */
- foreach_s(node, t, schedule_list) {
- struct schedule_node *n = (struct schedule_node *)node;
-
+ list_for_each_entry_safe(struct schedule_node, n, schedule_list, link) {
if (n->parent_count != 0)
- remove_from_list(&n->link);
+ list_del(&n->link);
}
- while (!is_empty_list(schedule_list)) {
+ while (!list_empty(schedule_list)) {
struct schedule_node *chosen =
choose_instruction_to_schedule(&scoreboard,
schedule_list,
@@ -724,7 +715,7 @@ schedule_instructions(struct vc4_compile *c, struct simple_node *schedule_list)
* find an instruction to pair with it.
*/
if (chosen) {
- remove_from_list(&chosen->link);
+ list_del(&chosen->link);
mark_instruction_scheduled(schedule_list, chosen, true);
if (chosen->uniform != -1) {
c->uniform_data[next_uniform] =
@@ -738,7 +729,7 @@ schedule_instructions(struct vc4_compile *c, struct simple_node *schedule_list)
schedule_list,
chosen);
if (merge) {
- remove_from_list(&merge->link);
+ list_del(&merge->link);
inst = qpu_merge_inst(inst, merge->inst->inst);
assert(inst != 0);
if (merge->uniform != -1) {
@@ -813,16 +804,14 @@ void
qpu_schedule_instructions(struct vc4_compile *c)
{
void *mem_ctx = ralloc_context(NULL);
- struct simple_node schedule_list;
- struct simple_node *node;
+ struct list_head schedule_list;
- make_empty_list(&schedule_list);
+ list_inithead(&schedule_list);
if (debug) {
fprintf(stderr, "Pre-schedule instructions\n");
- foreach(node, &c->qpu_inst_list) {
- struct queued_qpu_inst *q =
- (struct queued_qpu_inst *)node;
+ list_for_each_entry(struct queued_qpu_inst, q,
+ &c->qpu_inst_list, link) {
vc4_qpu_disasm(&q->inst, 1);
fprintf(stderr, "\n");
}
@@ -831,7 +820,7 @@ qpu_schedule_instructions(struct vc4_compile *c)
/* Wrap each instruction in a scheduler structure. */
uint32_t next_uniform = 0;
- while (!is_empty_list(&c->qpu_inst_list)) {
+ while (!list_empty(&c->qpu_inst_list)) {
struct queued_qpu_inst *inst =
(struct queued_qpu_inst *)c->qpu_inst_list.next;
struct schedule_node *n = rzalloc(mem_ctx, struct schedule_node);
@@ -844,16 +833,15 @@ qpu_schedule_instructions(struct vc4_compile *c)
} else {
n->uniform = -1;
}
- remove_from_list(&inst->link);
- insert_at_tail(&schedule_list, &n->link);
+ list_del(&inst->link);
+ list_addtail(&n->link, &schedule_list);
}
assert(next_uniform == c->num_uniforms);
calculate_forward_deps(c, &schedule_list);
calculate_reverse_deps(c, &schedule_list);
- foreach(node, &schedule_list) {
- struct schedule_node *n = (struct schedule_node *)node;
+ list_for_each_entry(struct schedule_node, n, &schedule_list, link) {
compute_delay(n);
}
diff --git a/src/gallium/drivers/vc4/vc4_query.c b/src/gallium/drivers/vc4/vc4_query.c
index 1792becb08f..270832eae3a 100644
--- a/src/gallium/drivers/vc4/vc4_query.c
+++ b/src/gallium/drivers/vc4/vc4_query.c
@@ -50,9 +50,10 @@ vc4_destroy_query(struct pipe_context *ctx, struct pipe_query *query)
free(query);
}
-static void
+static boolean
vc4_begin_query(struct pipe_context *ctx, struct pipe_query *query)
{
+ return true;
}
static void
diff --git a/src/gallium/drivers/vc4/vc4_register_allocate.c b/src/gallium/drivers/vc4/vc4_register_allocate.c
index f40547b8154..3b0b890b66a 100644
--- a/src/gallium/drivers/vc4/vc4_register_allocate.c
+++ b/src/gallium/drivers/vc4/vc4_register_allocate.c
@@ -161,7 +161,6 @@ node_to_temp_priority(const void *in_a, const void *in_b)
struct qpu_reg *
vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
{
- struct simple_node *node;
struct node_to_temp_map map[c->num_temps];
uint32_t temp_to_node[c->num_temps];
uint32_t def[c->num_temps];
@@ -189,9 +188,7 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
/* Compute the live ranges so we can figure out interference.
*/
uint32_t ip = 0;
- foreach(node, &c->instructions) {
- struct qinst *inst = (struct qinst *)node;
-
+ list_for_each_entry(struct qinst, inst, &c->instructions, link) {
if (inst->dst.file == QFILE_TEMP) {
def[inst->dst.index] = ip;
use[inst->dst.index] = ip;
@@ -227,9 +224,7 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
}
/* Figure out our register classes and preallocated registers */
- foreach(node, &c->instructions) {
- struct qinst *inst = (struct qinst *)node;
-
+ list_for_each_entry(struct qinst, inst, &c->instructions, link) {
switch (inst->op) {
case QOP_FRAG_Z:
ra_set_node_reg(g, temp_to_node[inst->dst.index],
diff --git a/src/gallium/drivers/vc4/vc4_reorder_uniforms.c b/src/gallium/drivers/vc4/vc4_reorder_uniforms.c
index 109724369d5..7f11fba2340 100644
--- a/src/gallium/drivers/vc4/vc4_reorder_uniforms.c
+++ b/src/gallium/drivers/vc4/vc4_reorder_uniforms.c
@@ -42,10 +42,8 @@ qir_reorder_uniforms(struct vc4_compile *c)
uint32_t *uniform_index = NULL;
uint32_t uniform_index_size = 0;
uint32_t next_uniform = 0;
- struct simple_node *node;
- foreach(node, &c->instructions) {
- struct qinst *inst = (struct qinst *)node;
+ list_for_each_entry(struct qinst, inst, &c->instructions, link) {
for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
if (inst->src[i].file != QFILE_UNIF)
continue;
diff --git a/src/gallium/drivers/vc4/vc4_resource.c b/src/gallium/drivers/vc4/vc4_resource.c
index 3f180d5845d..cab76406055 100644
--- a/src/gallium/drivers/vc4/vc4_resource.c
+++ b/src/gallium/drivers/vc4/vc4_resource.c
@@ -26,6 +26,7 @@
#include "util/u_format.h"
#include "util/u_inlines.h"
#include "util/u_surface.h"
+#include "util/u_upload_mgr.h"
#include "vc4_screen.h"
#include "vc4_context.h"
@@ -161,6 +162,8 @@ vc4_resource_transfer_map(struct pipe_context *pctx,
/* We need to align the box to utile boundaries, since that's
* what load/store operate on.
*/
+ uint32_t orig_width = ptrans->box.width;
+ uint32_t orig_height = ptrans->box.height;
uint32_t box_start_x = ptrans->box.x & (utile_w - 1);
uint32_t box_start_y = ptrans->box.y & (utile_h - 1);
ptrans->box.width += box_start_x;
@@ -174,7 +177,9 @@ vc4_resource_transfer_map(struct pipe_context *pctx,
ptrans->layer_stride = ptrans->stride;
trans->map = malloc(ptrans->stride * ptrans->box.height);
- if (usage & PIPE_TRANSFER_READ) {
+ if (usage & PIPE_TRANSFER_READ ||
+ ptrans->box.width != orig_width ||
+ ptrans->box.height != orig_height) {
vc4_load_tiled_image(trans->map, ptrans->stride,
buf + slice->offset +
box->z * rsc->cube_map_stride,
@@ -638,41 +643,37 @@ vc4_update_shadow_baselevel_texture(struct pipe_context *pctx,
* was in user memory, it would be nice to not have uploaded it to a VBO
* before translating.
*/
-void
-vc4_update_shadow_index_buffer(struct pipe_context *pctx,
- const struct pipe_index_buffer *ib)
+struct pipe_resource *
+vc4_get_shadow_index_buffer(struct pipe_context *pctx,
+ const struct pipe_index_buffer *ib,
+ uint32_t count,
+ uint32_t *shadow_offset)
{
- struct vc4_resource *shadow = vc4_resource(ib->buffer);
- struct vc4_resource *orig = vc4_resource(shadow->shadow_parent);
- uint32_t count = shadow->base.b.width0 / 2;
-
- if (shadow->writes == orig->writes)
- return;
-
+ struct vc4_context *vc4 = vc4_context(pctx);
+ struct vc4_resource *orig = vc4_resource(ib->buffer);
perf_debug("Fallback conversion for %d uint indices\n", count);
+ void *data;
+ struct pipe_resource *shadow_rsc = NULL;
+ u_upload_alloc(vc4->uploader, 0, count * 2,
+ shadow_offset, &shadow_rsc, &data);
+ uint16_t *dst = data;
+
struct pipe_transfer *src_transfer;
uint32_t *src = pipe_buffer_map_range(pctx, &orig->base.b,
ib->offset,
count * 4,
PIPE_TRANSFER_READ, &src_transfer);
- struct pipe_transfer *dst_transfer;
- uint16_t *dst = pipe_buffer_map_range(pctx, &shadow->base.b,
- 0,
- count * 2,
- PIPE_TRANSFER_WRITE, &dst_transfer);
-
for (int i = 0; i < count; i++) {
uint32_t src_index = src[i];
assert(src_index <= 0xffff);
dst[i] = src_index;
}
- pctx->transfer_unmap(pctx, dst_transfer);
pctx->transfer_unmap(pctx, src_transfer);
- shadow->writes = orig->writes;
+ return shadow_rsc;
}
void
diff --git a/src/gallium/drivers/vc4/vc4_resource.h b/src/gallium/drivers/vc4/vc4_resource.h
index 2ed848bc7b9..ab8f5d3cd55 100644
--- a/src/gallium/drivers/vc4/vc4_resource.h
+++ b/src/gallium/drivers/vc4/vc4_resource.h
@@ -26,7 +26,7 @@
#define VC4_RESOURCE_H
#include "vc4_screen.h"
-#include "vc4_packet.h"
+#include "kernel/vc4_packet.h"
#include "util/u_transfer.h"
struct vc4_transfer {
@@ -45,7 +45,6 @@ struct vc4_resource_slice {
struct vc4_surface {
struct pipe_surface base;
uint32_t offset;
- uint32_t stride;
uint8_t tiling;
};
@@ -107,8 +106,10 @@ struct pipe_resource *vc4_resource_create(struct pipe_screen *pscreen,
const struct pipe_resource *tmpl);
void vc4_update_shadow_baselevel_texture(struct pipe_context *pctx,
struct pipe_sampler_view *view);
-void vc4_update_shadow_index_buffer(struct pipe_context *pctx,
- const struct pipe_index_buffer *ib);
+struct pipe_resource *vc4_get_shadow_index_buffer(struct pipe_context *pctx,
+ const struct pipe_index_buffer *ib,
+ uint32_t count,
+ uint32_t *offset);
void vc4_dump_surface(struct pipe_surface *psurf);
#endif /* VC4_RESOURCE_H */
diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c
index 84aae918326..f63bead0fbb 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -175,6 +175,7 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_POLYGON_OFFSET_CLAMP:
case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
+ case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
return 0;
/* Stream output. */
@@ -322,6 +323,7 @@ vc4_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+ case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
return 0;
case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
@@ -458,7 +460,7 @@ vc4_screen_create(int fd)
pscreen->is_format_supported = vc4_screen_is_format_supported;
screen->fd = fd;
- make_empty_list(&screen->bo_cache.time_list);
+ list_inithead(&screen->bo_cache.time_list);
vc4_fence_init(screen);
diff --git a/src/gallium/drivers/vc4/vc4_screen.h b/src/gallium/drivers/vc4/vc4_screen.h
index 60626285d4d..5992e371093 100644
--- a/src/gallium/drivers/vc4/vc4_screen.h
+++ b/src/gallium/drivers/vc4/vc4_screen.h
@@ -27,7 +27,7 @@
#include "pipe/p_screen.h"
#include "os/os_thread.h"
#include "state_tracker/drm_driver.h"
-#include "vc4_qir.h"
+#include "util/list.h"
struct vc4_bo;
@@ -61,13 +61,19 @@ struct vc4_screen {
struct vc4_bo_cache {
/** List of struct vc4_bo freed, by age. */
- struct simple_node time_list;
+ struct list_head time_list;
/** List of struct vc4_bo freed, per size, by age. */
- struct simple_node *size_list;
+ struct list_head *size_list;
uint32_t size_list_size;
pipe_mutex lock;
+
+ uint32_t bo_size;
+ uint32_t bo_count;
} bo_cache;
+
+ uint32_t bo_size;
+ uint32_t bo_count;
};
static inline struct vc4_screen *
diff --git a/src/gallium/drivers/vc4/vc4_simulator.c b/src/gallium/drivers/vc4/vc4_simulator.c
index 2f72e722fc5..b58013dd2ee 100644
--- a/src/gallium/drivers/vc4/vc4_simulator.c
+++ b/src/gallium/drivers/vc4/vc4_simulator.c
@@ -39,11 +39,13 @@ vc4_wrap_bo_with_cma(struct drm_device *dev, struct vc4_bo *bo)
{
struct vc4_context *vc4 = dev->vc4;
struct vc4_screen *screen = vc4->screen;
- struct drm_gem_cma_object *obj = CALLOC_STRUCT(drm_gem_cma_object);
+ struct drm_vc4_bo *drm_bo = CALLOC_STRUCT(drm_vc4_bo);
+ struct drm_gem_cma_object *obj = &drm_bo->base;
uint32_t size = align(bo->size, 4096);
- obj->bo = bo;
+ drm_bo->bo = bo;
obj->base.size = size;
+ obj->base.dev = dev;
obj->vaddr = screen->simulator_mem_base + dev->simulator_mem_next;
obj->paddr = simpenrose_hw_addr(obj->vaddr);
@@ -94,7 +96,7 @@ vc4_simulator_unpin_bos(struct vc4_exec_info *exec)
{
for (int i = 0; i < exec->bo_count; i++) {
struct drm_gem_cma_object *obj = exec->bo[i].bo;
- struct vc4_bo *bo = obj->bo;
+ struct vc4_bo *bo = to_vc4_bo(&obj->base)->bo;
memcpy(bo->map, obj->vaddr, bo->size);
@@ -124,6 +126,7 @@ vc4_simulator_flush(struct vc4_context *vc4, struct drm_vc4_submit_cl *args)
int ret;
memset(&exec, 0, sizeof(exec));
+ list_inithead(&exec.unref_list);
if (ctex && ctex->bo->simulator_winsys_map) {
#if 0
@@ -176,8 +179,12 @@ vc4_simulator_flush(struct vc4_context *vc4, struct drm_vc4_submit_cl *args)
if (ret)
return ret;
- vc4_bo_unreference(&exec.exec_bo->bo);
- free(exec.exec_bo);
+ list_for_each_entry_safe(struct drm_vc4_bo, bo, &exec.unref_list,
+ unref_head) {
+ list_del(&bo->unref_head);
+ vc4_bo_unreference(&bo->bo);
+ free(bo);
+ }
if (ctex && ctex->bo->simulator_winsys_map) {
for (int y = 0; y < ctex->base.b.height0; y++) {
diff --git a/src/gallium/drivers/vc4/vc4_simulator_validate.h b/src/gallium/drivers/vc4/vc4_simulator_validate.h
index 1f0c6b67c0f..2bb36b253bb 100644
--- a/src/gallium/drivers/vc4/vc4_simulator_validate.h
+++ b/src/gallium/drivers/vc4/vc4_simulator_validate.h
@@ -43,6 +43,7 @@ struct vc4_exec_info;
#define kfree(ptr) free(ptr)
#define krealloc(ptr, size, args) realloc(ptr, size)
#define roundup(x, y) align(x, y)
+#define round_up(x, y) align(x, y)
#define max(x, y) MAX2(x, y)
#define min(x, y) MIN2(x, y)
#define BUG_ON(condition) assert(!(condition))
@@ -63,16 +64,27 @@ struct drm_device {
uint32_t simulator_mem_next;
};
-struct drm_gem_cma_object {
- struct vc4_bo *bo;
+struct drm_gem_object {
+ uint32_t size;
+ struct drm_device *dev;
+};
- struct {
- uint32_t size;
- } base;
+struct drm_gem_cma_object {
+ struct drm_gem_object base;
uint32_t paddr;
void *vaddr;
};
+struct drm_vc4_bo {
+ struct drm_gem_cma_object base;
+ struct vc4_bo *bo;
+ struct list_head unref_head;
+};
+
+static inline struct drm_vc4_bo *to_vc4_bo(struct drm_gem_object *obj)
+{
+ return (struct drm_vc4_bo *)obj;
+}
struct drm_gem_cma_object *
drm_gem_cma_create(struct drm_device *dev, size_t size);
diff --git a/src/gallium/drivers/vc4/vc4_state.c b/src/gallium/drivers/vc4/vc4_state.c
index 80e963ea2ee..4a1d4c3a4d6 100644
--- a/src/gallium/drivers/vc4/vc4_state.c
+++ b/src/gallium/drivers/vc4/vc4_state.c
@@ -304,24 +304,8 @@ vc4_set_index_buffer(struct pipe_context *pctx,
if (ib) {
assert(!ib->user_buffer);
-
- if (ib->index_size == 4) {
- struct pipe_resource tmpl = *ib->buffer;
- assert(tmpl.format == PIPE_FORMAT_R8_UNORM);
- assert(tmpl.height0 == 1);
- tmpl.width0 = (tmpl.width0 - ib->offset) / 2;
- struct pipe_resource *pshadow =
- vc4_resource_create(&vc4->screen->base, &tmpl);
- struct vc4_resource *shadow = vc4_resource(pshadow);
- pipe_resource_reference(&shadow->shadow_parent, ib->buffer);
-
- pipe_resource_reference(&vc4->indexbuf.buffer, NULL);
- vc4->indexbuf.buffer = pshadow;
- vc4->indexbuf.index_size = 2;
- } else {
- pipe_resource_reference(&vc4->indexbuf.buffer, ib->buffer);
- vc4->indexbuf.index_size = ib->index_size;
- }
+ pipe_resource_reference(&vc4->indexbuf.buffer, ib->buffer);
+ vc4->indexbuf.index_size = ib->index_size;
vc4->indexbuf.offset = ib->offset;
} else {
pipe_resource_reference(&vc4->indexbuf.buffer, NULL);
@@ -538,6 +522,7 @@ vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc,
struct pipe_resource tmpl = shadow_parent->base.b;
struct vc4_resource *clone;
+ tmpl.bind = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET;
tmpl.width0 = u_minify(tmpl.width0, so->u.tex.first_level);
tmpl.height0 = u_minify(tmpl.height0, so->u.tex.first_level);
tmpl.last_level = so->u.tex.last_level - so->u.tex.first_level;
@@ -547,6 +532,8 @@ vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc,
clone->shadow_parent = &shadow_parent->base.b;
/* Flag it as needing update of the contents from the parent. */
clone->writes = shadow_parent->writes - 1;
+
+ assert(clone->vc4_format != VC4_TEXTURE_TYPE_RGBA32R);
}
so->texture = prsc;
so->reference.count = 1;