143 files changed, 2957 insertions, 1574 deletions
diff --git a/src/gallium/drivers/freedreno/.gitignore b/src/gallium/drivers/freedreno/.gitignore
new file mode 100644
index 00000000000..150f5d19f5b
--- /dev/null
+++ b/src/gallium/drivers/freedreno/.gitignore
@@ -0,0 +1 @@
+ir3_compiler
diff --git a/src/gallium/drivers/freedreno/Makefile.sources b/src/gallium/drivers/freedreno/Makefile.sources
index baae9144005..74ef4168655 100644
--- a/src/gallium/drivers/freedreno/Makefile.sources
+++ b/src/gallium/drivers/freedreno/Makefile.sources
@@ -128,6 +128,7 @@ ir3_SOURCES := \
 	ir3/ir3_group.c \
 	ir3/ir3.h \
 	ir3/ir3_legalize.c \
+	ir3/ir3_nir.c \
 	ir3/ir3_nir.h \
 	ir3/ir3_nir_lower_if_else.c \
 	ir3/ir3_print.c \
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_context.c b/src/gallium/drivers/freedreno/a3xx/fd3_context.c
index 74cbbf2edd8..e47bbff5643 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_context.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_context.c
@@ -171,8 +171,8 @@ fd3_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags)
 
 	fd3_query_context_init(pctx);
 
-	fd3_ctx->border_color_uploader = u_upload_create(pctx, 4096,
-			2 * PIPE_MAX_SAMPLERS * BORDERCOLOR_SIZE, 0);
+	fd3_ctx->border_color_uploader = u_upload_create(pctx, 4096, 0,
+                                                         PIPE_USAGE_STREAM);
 
 	return pctx;
 }
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
index 24afbc9e956..e65a352e7f6 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
@@ -145,7 +145,8 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
 	void *ptr;
 
 	u_upload_alloc(fd3_ctx->border_color_uploader,
-			0, 2 * PIPE_MAX_SAMPLERS * BORDERCOLOR_SIZE, &off,
+			0, BORDER_COLOR_UPLOAD_SIZE,
+		       BORDER_COLOR_UPLOAD_SIZE, &off,
 			&fd3_ctx->border_color_buf,
 			&ptr);
 
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_context.c b/src/gallium/drivers/freedreno/a4xx/fd4_context.c
index e53e0c56c9a..7d6365bbb6d 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_context.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_context.c
@@ -171,8 +171,8 @@ fd4_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags)
 
 	fd4_query_context_init(pctx);
 
-	fd4_ctx->border_color_uploader = u_upload_create(pctx, 4096,
-			2 * PIPE_MAX_SAMPLERS * BORDERCOLOR_SIZE, 0);
+	fd4_ctx->border_color_uploader = u_upload_create(pctx, 4096, 0,
+                                                         PIPE_USAGE_STREAM);
 
 	return pctx;
 }
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
index b9a28149722..bc62a5d9a4b 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
@@ -133,7 +133,8 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
 	void *ptr;
 
 	u_upload_alloc(fd4_ctx->border_color_uploader,
-			0, 2 * PIPE_MAX_SAMPLERS * BORDERCOLOR_SIZE, &off,
+			0, BORDER_COLOR_UPLOAD_SIZE,
+		       BORDER_COLOR_UPLOAD_SIZE, &off,
 			&fd4_ctx->border_color_buf,
 			&ptr);
 
diff --git a/src/gallium/drivers/freedreno/freedreno_context.h b/src/gallium/drivers/freedreno/freedreno_context.h
index 571c8142bf7..418b71b95de 100644
--- a/src/gallium/drivers/freedreno/freedreno_context.h
+++ b/src/gallium/drivers/freedreno/freedreno_context.h
@@ -40,6 +40,8 @@
 #include "freedreno_gmem.h"
 #include "freedreno_util.h"
 
+#define BORDER_COLOR_UPLOAD_SIZE (2 * PIPE_MAX_SAMPLERS * BORDERCOLOR_SIZE)
+
 struct fd_vertex_stateobj;
 
 struct fd_texture_stateobj {
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index 5bbe4016a2a..9d0cdd8e545 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -226,6 +226,8 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
 	case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
 	case PIPE_CAP_DRAW_INDIRECT:
+	case PIPE_CAP_MULTI_DRAW_INDIRECT:
+	case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
 	case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
 	case PIPE_CAP_POLYGON_OFFSET_CLAMP:
 	case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
@@ -238,6 +240,11 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_SHAREABLE_SHADERS:
 	case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
 	case PIPE_CAP_CLEAR_TEXTURE:
+	case PIPE_CAP_DRAW_PARAMETERS:
+	case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+	case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+	case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+	case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
 		return 0;
 
 	case PIPE_CAP_MAX_VIEWPORTS:
@@ -414,6 +421,8 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
 		return PIPE_SHADER_IR_TGSI;
 	case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
 		return 32;
+	case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
+		return 0;
 	}
 	debug_printf("unknown shader param %d\n", param);
 	return 0;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
index d55daeefe06..481859efb17 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
@@ -40,6 +40,7 @@
 #include "freedreno_util.h"
 
 #include "ir3_compiler.h"
+#include "ir3_nir.h"
 #include "instr-a3xx.h"
 #include "ir3.h"
 
@@ -105,10 +106,10 @@ int main(int argc, char **argv)
 	const char *filename;
 	struct tgsi_token toks[65536];
 	struct tgsi_parse_context parse;
-	struct ir3_compiler *compiler;
 	struct ir3_shader_variant v;
 	struct ir3_shader s;
 	struct ir3_shader_key key = {};
+	/* TODO cmdline option to target different gpus: */
 	unsigned gpu_id = 320;
 	const char *info;
 	void *ptr;
@@ -228,7 +229,12 @@ int main(int argc, char **argv)
 	if (!tgsi_text_translate(ptr, toks, Elements(toks)))
 		errx(1, "could not parse `%s'", filename);
 
-	s.tokens = toks;
+	if (fd_mesa_debug & FD_DBG_OPTMSGS)
+		tgsi_dump(toks, 0);
+
+	nir_shader *nir = ir3_tgsi_to_nir(toks);
+	s.compiler = ir3_compiler_create(gpu_id);
+	s.nir = ir3_optimize_nir(&s, nir, NULL);
 
 	v.key = key;
 	v.shader = &s;
@@ -246,11 +252,8 @@ int main(int argc, char **argv)
 		break;
 	}
 
-	/* TODO cmdline option to target different gpus: */
-	compiler = ir3_compiler_create(gpu_id);
-
 	info = "NIR compiler";
-	ret = ir3_compile_shader_nir(compiler, &v);
+	ret = ir3_compile_shader_nir(s.compiler, &v);
 	if (ret) {
 		fprintf(stderr, "compiler failed!\n");
 		return ret;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 224f7806b3c..86afda4ba08 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -32,10 +32,6 @@
 #include "util/u_string.h"
 #include "util/u_memory.h"
 #include "util/u_inlines.h"
-#include "tgsi/tgsi_lowering.h"
-#include "tgsi/tgsi_strings.h"
-
-#include "nir/tgsi_to_nir.h"
 
 #include "freedreno_util.h"
 
@@ -123,97 +119,10 @@ struct ir3_compile {
 static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val);
 static struct ir3_block * get_block(struct ir3_compile *ctx, nir_block *nblock);
 
-static struct nir_shader *to_nir(struct ir3_compile *ctx,
-		const struct tgsi_token *tokens, struct ir3_shader_variant *so)
-{
-	static const nir_shader_compiler_options options = {
-			.lower_fpow = true,
-			.lower_fsat = true,
-			.lower_scmp = true,
-			.lower_flrp = true,
-			.lower_ffract = true,
-			.native_integers = true,
-	};
-	struct nir_lower_tex_options tex_options = {
-			.lower_rect = 0,
-	};
-	bool progress;
-
-	switch (so->type) {
-	case SHADER_FRAGMENT:
-	case SHADER_COMPUTE:
-		tex_options.saturate_s = so->key.fsaturate_s;
-		tex_options.saturate_t = so->key.fsaturate_t;
-		tex_options.saturate_r = so->key.fsaturate_r;
-		break;
-	case SHADER_VERTEX:
-		tex_options.saturate_s = so->key.vsaturate_s;
-		tex_options.saturate_t = so->key.vsaturate_t;
-		tex_options.saturate_r = so->key.vsaturate_r;
-		break;
-	}
-
-	if (ctx->compiler->gpu_id >= 400) {
-		/* a4xx seems to have *no* sam.p */
-		tex_options.lower_txp = ~0;  /* lower all txp */
-	} else {
-		/* a3xx just needs to avoid sam.p for 3d tex */
-		tex_options.lower_txp = (1 << GLSL_SAMPLER_DIM_3D);
-	}
-
-	struct nir_shader *s = tgsi_to_nir(tokens, &options);
-
-	if (fd_mesa_debug & FD_DBG_DISASM) {
-		debug_printf("----------------------\n");
-		nir_print_shader(s, stdout);
-		debug_printf("----------------------\n");
-	}
-
-	nir_opt_global_to_local(s);
-	nir_convert_to_ssa(s);
-	if (s->stage == MESA_SHADER_VERTEX) {
-		nir_lower_clip_vs(s, so->key.ucp_enables);
-	} else if (s->stage == MESA_SHADER_FRAGMENT) {
-		nir_lower_clip_fs(s, so->key.ucp_enables);
-	}
-	nir_lower_tex(s, &tex_options);
-	if (so->key.color_two_side)
-		nir_lower_two_sided_color(s);
-	nir_lower_idiv(s);
-	nir_lower_load_const_to_scalar(s);
-
-	do {
-		progress = false;
-
-		nir_lower_vars_to_ssa(s);
-		nir_lower_alu_to_scalar(s);
-		nir_lower_phis_to_scalar(s);
-
-		progress |= nir_copy_prop(s);
-		progress |= nir_opt_dce(s);
-		progress |= nir_opt_cse(s);
-		progress |= ir3_nir_lower_if_else(s);
-		progress |= nir_opt_algebraic(s);
-		progress |= nir_opt_constant_folding(s);
-
-	} while (progress);
-
-	nir_remove_dead_variables(s);
-	nir_validate_shader(s);
-
-	if (fd_mesa_debug & FD_DBG_DISASM) {
-		debug_printf("----------------------\n");
-		nir_print_shader(s, stdout);
-		debug_printf("----------------------\n");
-	}
-
-	return s;
-}
 
 static struct ir3_compile *
 compile_init(struct ir3_compiler *compiler,
-		struct ir3_shader_variant *so,
-		const struct tgsi_token *tokens)
+		struct ir3_shader_variant *so)
 {
 	struct ir3_compile *ctx = rzalloc(NULL, struct ir3_compile);
 
@@ -239,7 +148,28 @@ compile_init(struct ir3_compiler *compiler,
 	ctx->block_ht = _mesa_hash_table_create(ctx,
 			_mesa_hash_pointer, _mesa_key_pointer_equal);
 
-	ctx->s = to_nir(ctx, tokens, so);
+	/* TODO: maybe generate some sort of bitmask of what key
+	 * lowers vs what shader has (ie. no need to lower
+	 * texture clamp lowering if no texture sample instrs)..
+	 * although should be done further up the stack to avoid
+	 * creating duplicate variants..
+	 */
+
+	if (ir3_key_lowers_nir(&so->key)) {
+		nir_shader *s = nir_shader_clone(ctx, so->shader->nir);
+		ctx->s = ir3_optimize_nir(so->shader, s, &so->key);
+	} else {
+		/* fast-path for shader key that lowers nothing in NIR: */
+		ctx->s = so->shader->nir;
+	}
+
+	if (fd_mesa_debug & FD_DBG_DISASM) {
+		DBG("dump nir%dv%d: type=%d, k={bp=%u,cts=%u,hp=%u}",
+			so->shader->id, so->id, so->type,
+			so->key.binning_pass, so->key.color_two_side,
+			so->key.half_precision);
+		nir_print_shader(ctx->s, stdout);
+	}
 
 	so->first_driver_param = so->first_immediate = ctx->s->num_uniforms;
 
@@ -1954,8 +1884,6 @@ emit_instr(struct ir3_compile *ctx, nir_instr *instr)
 		case nir_texop_query_levels:
 			emit_tex_query_levels(ctx, tex);
 			break;
-		case nir_texop_samples_identical:
-			unreachable("nir_texop_samples_identical");
 		default:
 			emit_tex(ctx, tex);
 			break;
@@ -2170,6 +2098,8 @@ emit_stream_out(struct ir3_compile *ctx)
 static void
 emit_function(struct ir3_compile *ctx, nir_function_impl *impl)
 {
+	nir_metadata_require(impl, nir_metadata_block_index);
+
 	emit_cf_list(ctx, &impl->body);
 	emit_block(ctx, impl->end_block);
 
@@ -2499,7 +2429,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 
 	assert(!so->ir);
 
-	ctx = compile_init(compiler, so, so->shader->tokens);
+	ctx = compile_init(compiler, so);
 	if (!ctx) {
 		DBG("INIT failed!");
 		ret = -1;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_nir.c
new file mode 100644
index 00000000000..565b9c32c1d
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/ir3_nir.c
@@ -0,0 +1,153 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2015 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <[email protected]>
+ */
+
+
+#include "freedreno_util.h"
+
+#include "ir3_nir.h"
+#include "ir3_compiler.h"
+#include "ir3_shader.h"
+
+#include "nir/tgsi_to_nir.h"
+
+struct nir_shader *
+ir3_tgsi_to_nir(const struct tgsi_token *tokens)
+{
+	static const nir_shader_compiler_options options = {
+			.lower_fpow = true,
+			.lower_fsat = true,
+			.lower_scmp = true,
+			.lower_flrp = true,
+			.lower_ffract = true,
+			.native_integers = true,
+	};
+	return tgsi_to_nir(tokens, &options);
+}
+
+/* for given shader key, are any steps handled in nir? */
+bool
+ir3_key_lowers_nir(const struct ir3_shader_key *key)
+{
+	return key->fsaturate_s | key->fsaturate_t | key->fsaturate_r |
+			key->vsaturate_s | key->vsaturate_t | key->vsaturate_r |
+			key->ucp_enables | key->color_two_side;
+}
+
+#define OPT(nir, pass, ...) ({                             \
+   bool this_progress = false;                             \
+   NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);      \
+   this_progress;                                          \
+})
+
+#define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)
+
+struct nir_shader *
+ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s,
+		const struct ir3_shader_key *key)
+{
+	struct nir_lower_tex_options tex_options = {
+			.lower_rect = 0,
+	};
+	bool progress;
+
+	if (key) {
+		switch (shader->type) {
+		case SHADER_FRAGMENT:
+		case SHADER_COMPUTE:
+			tex_options.saturate_s = key->fsaturate_s;
+			tex_options.saturate_t = key->fsaturate_t;
+			tex_options.saturate_r = key->fsaturate_r;
+			break;
+		case SHADER_VERTEX:
+			tex_options.saturate_s = key->vsaturate_s;
+			tex_options.saturate_t = key->vsaturate_t;
+			tex_options.saturate_r = key->vsaturate_r;
+			break;
+		}
+	}
+
+	if (shader->compiler->gpu_id >= 400) {
+		/* a4xx seems to have *no* sam.p */
+		tex_options.lower_txp = ~0;  /* lower all txp */
+	} else {
+		/* a3xx just needs to avoid sam.p for 3d tex */
+		tex_options.lower_txp = (1 << GLSL_SAMPLER_DIM_3D);
+	}
+
+	if (fd_mesa_debug & FD_DBG_DISASM) {
+		debug_printf("----------------------\n");
+		nir_print_shader(s, stdout);
+		debug_printf("----------------------\n");
+	}
+
+	OPT_V(s, nir_opt_global_to_local);
+	OPT_V(s, nir_convert_to_ssa);
+
+	if (key) {
+		if (s->stage == MESA_SHADER_VERTEX) {
+			OPT_V(s, nir_lower_clip_vs, key->ucp_enables);
+		} else if (s->stage == MESA_SHADER_FRAGMENT) {
+			OPT_V(s, nir_lower_clip_fs, key->ucp_enables);
+		}
+		if (key->color_two_side) {
+			OPT_V(s, nir_lower_two_sided_color);
+		}
+	}
+
+	OPT_V(s, nir_lower_tex, &tex_options);
+	OPT_V(s, nir_lower_idiv);
+	OPT_V(s, nir_lower_load_const_to_scalar);
+
+	do {
+		progress = false;
+
+		OPT_V(s, nir_lower_vars_to_ssa);
+		OPT_V(s, nir_lower_alu_to_scalar);
+		OPT_V(s, nir_lower_phis_to_scalar);
+
+		progress |= OPT(s, nir_copy_prop);
+		progress |= OPT(s, nir_opt_dce);
+		progress |= OPT(s, nir_opt_cse);
+		progress |= OPT(s, ir3_nir_lower_if_else);
+		progress |= OPT(s, nir_opt_algebraic);
+		progress |= OPT(s, nir_opt_constant_folding);
+
+	} while (progress);
+
+	OPT_V(s, nir_remove_dead_variables);
+
+	if (fd_mesa_debug & FD_DBG_DISASM) {
+		debug_printf("----------------------\n");
+		nir_print_shader(s, stdout);
+		debug_printf("----------------------\n");
+	}
+
+	nir_sweep(s);
+
+	return s;
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir.h b/src/gallium/drivers/freedreno/ir3/ir3_nir.h
index 9950782dc38..534199d3744 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_nir.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_nir.h
@@ -32,6 +32,13 @@
 #include "glsl/nir/nir.h"
 #include "glsl/nir/shader_enums.h"
 
+#include "ir3_shader.h"
+
 bool ir3_nir_lower_if_else(nir_shader *shader);
 
+struct nir_shader * ir3_tgsi_to_nir(const struct tgsi_token *tokens);
+bool ir3_key_lowers_nir(const struct ir3_shader_key *key);
+struct nir_shader * ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s,
+		const struct ir3_shader_key *key);
+
 #endif /* IR3_NIR_H_ */
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
index 7b565332256..7d17f426ad3 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
@@ -39,7 +39,7 @@
 
 #include "ir3_shader.h"
 #include "ir3_compiler.h"
-
+#include "ir3_nir.h"
 
 static void
 delete_variant(struct ir3_shader_variant *v)
@@ -187,12 +187,6 @@ create_variant(struct ir3_shader *shader, struct ir3_shader_key key)
 	v->key = key;
 	v->type = shader->type;
 
-	if (fd_mesa_debug & FD_DBG_DISASM) {
-		DBG("dump tgsi: type=%d, k={bp=%u,cts=%u,hp=%u}", shader->type,
-			key.binning_pass, key.color_two_side, key.half_precision);
-		tgsi_dump(shader->tokens, 0);
-	}
-
 	ret = ir3_compile_shader_nir(shader->compiler, v);
 	if (ret) {
 		debug_error("compile failed!");
@@ -267,7 +261,7 @@ ir3_shader_destroy(struct ir3_shader *shader)
 		v = v->next;
 		delete_variant(t);
 	}
-	free((void *)shader->tokens);
+	ralloc_free(shader->nir);
 	free(shader);
 }
 
@@ -281,14 +275,24 @@ ir3_shader_create(struct pipe_context *pctx,
 	shader->id = ++shader->compiler->shader_count;
 	shader->pctx = pctx;
 	shader->type = type;
-	shader->tokens = tgsi_dup_tokens(cso->tokens);
+	if (fd_mesa_debug & FD_DBG_DISASM) {
+		DBG("dump tgsi: type=%d", shader->type);
+		tgsi_dump(cso->tokens, 0);
+	}
+	nir_shader *nir = ir3_tgsi_to_nir(cso->tokens);
+	/* do first pass optimization, ignoring the key: */
+	shader->nir = ir3_optimize_nir(shader, nir, NULL);
+	if (fd_mesa_debug & FD_DBG_DISASM) {
+		DBG("dump nir%d: type=%d", shader->id, shader->type);
+		nir_print_shader(shader->nir, stdout);
+	}
 	shader->stream_output = cso->stream_output;
 	if (fd_mesa_debug & FD_DBG_SHADERDB) {
 		/* if shader-db run, create a standard variant immediately
 		 * (as otherwise nothing will trigger the shader to be
 		 * actually compiled)
 		 */
-		static struct ir3_shader_key key = {};
+		static struct ir3_shader_key key = {0};
 		ir3_shader_variant(shader, key);
 	}
 	return shader;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
index cf99a4c05ed..b3c28a41387 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
@@ -230,6 +230,8 @@ struct ir3_shader_variant {
 	struct ir3_shader *shader;
 };
 
+typedef struct nir_shader nir_shader;
+
 struct ir3_shader {
 	enum shader_t type;
 
@@ -240,7 +242,7 @@ struct ir3_shader {
 	struct ir3_compiler *compiler;
 
 	struct pipe_context *pctx;    /* TODO replace w/ pipe_screen */
-	const struct tgsi_token *tokens;
+	nir_shader *nir;
 	struct pipe_stream_output_info stream_output;
 
 	struct ir3_shader_variant *variants;
diff --git a/src/gallium/drivers/i915/i915_context.h b/src/gallium/drivers/i915/i915_context.h
index 1ed685188db..2adaee30fb9 100644
--- a/src/gallium/drivers/i915/i915_context.h
+++ b/src/gallium/drivers/i915/i915_context.h
@@ -195,7 +195,6 @@ struct i915_rasterizer_state {
 
    unsigned light_twoside : 1;
    unsigned st;
-   enum interp_mode color_interp;
 
    unsigned LIS4;
    unsigned LIS7;
diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c
index a5b161882cd..e2a493bc1b5 100644
--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -254,6 +254,11 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap)
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_CLEAR_TEXTURE:
+   case PIPE_CAP_DRAW_PARAMETERS:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+   case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
       return 0;
 
    case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
@@ -264,6 +269,8 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap)
    case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT:
    case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT:
    case PIPE_CAP_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
    case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
    case PIPE_CAP_SAMPLER_VIEW_TARGET:
       return 0;
diff --git a/src/gallium/drivers/i915/i915_state.c b/src/gallium/drivers/i915/i915_state.c
index 6ba9646f7ab..b54a9fbf4f9 100644
--- a/src/gallium/drivers/i915/i915_state.c
+++ b/src/gallium/drivers/i915/i915_state.c
@@ -423,7 +423,7 @@ i915_prepare_vertex_sampling(struct i915_context *i915)
          for (j = view->u.tex.first_level; j <= tex->last_level; j++) {
             mip_offsets[j] = i915_texture_offset(i915_tex, j , 0 /* FIXME depth */);
             row_stride[j] = i915_tex->stride;
-            img_stride[j] = 0; /* FIXME */;
+            img_stride[j] = 0; /* FIXME */
          }
 
          draw_set_mapped_texture(i915->draw,
@@ -920,7 +920,6 @@ i915_create_rasterizer_state(struct pipe_context *pipe,
    struct i915_rasterizer_state *cso = CALLOC_STRUCT( i915_rasterizer_state );
 
    cso->templ = *rasterizer;
-   cso->color_interp = rasterizer->flatshade ? INTERP_CONSTANT : INTERP_LINEAR;
    cso->light_twoside = rasterizer->light_twoside;
    cso->ds[0].u = _3DSTATE_DEPTH_OFFSET_SCALE;
    cso->ds[1].f = rasterizer->offset_scale;
diff --git a/src/gallium/drivers/i915/i915_state_derived.c b/src/gallium/drivers/i915/i915_state_derived.c
index 7ad88a1ce01..bd0f448f645 100644
--- a/src/gallium/drivers/i915/i915_state_derived.c
+++ b/src/gallium/drivers/i915/i915_state_derived.c
@@ -57,7 +57,6 @@ static uint find_mapping(const struct i915_fragment_shader* fs, int unit)
 static void calculate_vertex_layout(struct i915_context *i915)
 {
    const struct i915_fragment_shader *fs = i915->fs;
-   const enum interp_mode colorInterp = i915->rasterizer->color_interp;
    struct vertex_info vinfo;
    boolean texCoords[I915_TEX_UNITS], colors[2], fog, needW, face;
    uint i;
@@ -107,12 +106,12 @@ static void calculate_vertex_layout(struct i915_context *i915)
    /* pos */
    src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_POSITION, 0);
    if (needW) {
-      draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_LINEAR, src);
+      draw_emit_vertex_attr(&vinfo, EMIT_4F, src);
       vinfo.hwfmt[0] |= S4_VFMT_XYZW;
       vinfo.attrib[0].emit = EMIT_4F;
    }
    else {
-      draw_emit_vertex_attr(&vinfo, EMIT_3F, INTERP_LINEAR, src);
+      draw_emit_vertex_attr(&vinfo, EMIT_3F, src);
       vinfo.hwfmt[0] |= S4_VFMT_XYZ;
       vinfo.attrib[0].emit = EMIT_3F;
    }
@@ -123,21 +122,21 @@ static void calculate_vertex_layout(struct i915_context *i915)
    /* primary color */
    if (colors[0]) {
       src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_COLOR, 0);
-      draw_emit_vertex_attr(&vinfo, EMIT_4UB_BGRA, colorInterp, src);
+      draw_emit_vertex_attr(&vinfo, EMIT_4UB_BGRA, src);
       vinfo.hwfmt[0] |= S4_VFMT_COLOR;
    }
 
    /* secondary color */
    if (colors[1]) {
       src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_COLOR, 1);
-      draw_emit_vertex_attr(&vinfo, EMIT_4UB_BGRA, colorInterp, src);
+      draw_emit_vertex_attr(&vinfo, EMIT_4UB_BGRA, src);
       vinfo.hwfmt[0] |= S4_VFMT_SPEC_FOG;
    }
 
    /* fog coord, not fog blend factor */
    if (fog) {
       src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_FOG, 0);
-      draw_emit_vertex_attr(&vinfo, EMIT_1F, INTERP_PERSPECTIVE, src);
+      draw_emit_vertex_attr(&vinfo, EMIT_1F, src);
       vinfo.hwfmt[0] |= S4_VFMT_FOG_PARAM;
    }
 
@@ -147,7 +146,7 @@ static void calculate_vertex_layout(struct i915_context *i915)
       if (texCoords[i]) {
          hwtc = TEXCOORDFMT_4D;
          src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_GENERIC, fs->generic_mapping[i]);
-         draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
+         draw_emit_vertex_attr(&vinfo, EMIT_4F, src);
       }
       else {
          hwtc = TEXCOORDFMT_NOT_PRESENT;
@@ -164,7 +163,7 @@ static void calculate_vertex_layout(struct i915_context *i915)
        * module by adding an extra shader output.
        */
       src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_FACE, 0);
-      draw_emit_vertex_attr(&vinfo, EMIT_1F, INTERP_CONSTANT, src);
+      draw_emit_vertex_attr(&vinfo, EMIT_1F, src);
       vinfo.hwfmt[1] &= ~(TEXCOORDFMT_NOT_PRESENT << (slot * 4));
       vinfo.hwfmt[1] |= TEXCOORDFMT_1D << (slot * 4);
    }
@@ -185,7 +184,7 @@ static void calculate_vertex_layout(struct i915_context *i915)
 struct i915_tracked_state i915_update_vertex_layout = {
    "vertex_layout",
    calculate_vertex_layout,
-   I915_NEW_RASTERIZER | I915_NEW_FS | I915_NEW_VS
+   I915_NEW_FS | I915_NEW_VS
 };
 
 
diff --git a/src/gallium/drivers/ilo/core/ilo_builder.c b/src/gallium/drivers/ilo/core/ilo_builder.c
index 9d5195129b7..079872f4306 100644
--- a/src/gallium/drivers/ilo/core/ilo_builder.c
+++ b/src/gallium/drivers/ilo/core/ilo_builder.c
@@ -333,7 +333,7 @@ ilo_builder_init(struct ilo_builder *builder,
                  const struct ilo_dev *dev,
                  struct intel_winsys *winsys)
 {
-   int i;
+   unsigned i;
 
    assert(ilo_is_zeroed(builder, sizeof(*builder)));
 
@@ -366,7 +366,7 @@ ilo_builder_init(struct ilo_builder *builder,
 void
 ilo_builder_reset(struct ilo_builder *builder)
 {
-   int i;
+   unsigned i;
 
    for (i = 0; i < ILO_BUILDER_WRITER_COUNT; i++)
       ilo_builder_writer_reset(builder, i);
@@ -382,7 +382,7 @@ ilo_builder_reset(struct ilo_builder *builder)
 bool
 ilo_builder_begin(struct ilo_builder *builder)
 {
-   int i;
+   unsigned i;
 
    for (i = 0; i < ILO_BUILDER_WRITER_COUNT; i++) {
       if (!ilo_builder_writer_alloc_and_map(builder, i)) {
@@ -407,7 +407,7 @@ struct intel_bo *
 ilo_builder_end(struct ilo_builder *builder, unsigned *used)
 {
    struct ilo_builder_writer *bat;
-   int i;
+   unsigned i;
 
    ilo_builder_batch_patch_sba(builder);
 
diff --git a/src/gallium/drivers/ilo/ilo_context.c b/src/gallium/drivers/ilo/ilo_context.c
index 2a00cf1c93c..6bcd0bcb8f5 100644
--- a/src/gallium/drivers/ilo/ilo_context.c
+++ b/src/gallium/drivers/ilo/ilo_context.c
@@ -189,8 +189,9 @@ ilo_context_create(struct pipe_screen *screen, void *priv, unsigned flags)
     * These must be called last as u_upload/u_blitter are clients of the pipe
     * context.
     */
-   ilo->uploader = u_upload_create(&ilo->base, 1024 * 1024, 16,
-         PIPE_BIND_CONSTANT_BUFFER | PIPE_BIND_INDEX_BUFFER);
+   ilo->uploader = u_upload_create(&ilo->base, 1024 * 1024,
+         PIPE_BIND_CONSTANT_BUFFER | PIPE_BIND_INDEX_BUFFER,
+                                   PIPE_USAGE_STREAM);
    if (!ilo->uploader) {
       ilo_context_destroy(&ilo->base);
       return NULL;
diff --git a/src/gallium/drivers/ilo/ilo_gpgpu.c b/src/gallium/drivers/ilo/ilo_gpgpu.c
index 9a2ca007f80..b7415901a88 100644
--- a/src/gallium/drivers/ilo/ilo_gpgpu.c
+++ b/src/gallium/drivers/ilo/ilo_gpgpu.c
@@ -92,7 +92,7 @@ ilo_launch_grid(struct pipe_context *pipe,
    input_buf.buffer_size =
       ilo_shader_get_kernel_param(cs, ILO_KERNEL_CS_INPUT_SIZE);
    if (input_buf.buffer_size) {
-      u_upload_data(ilo->uploader, 0, input_buf.buffer_size, input,
+      u_upload_data(ilo->uploader, 0, input_buf.buffer_size, 16, input,
             &input_buf.buffer_offset, &input_buf.buffer);
    }
 
diff --git a/src/gallium/drivers/ilo/ilo_screen.c b/src/gallium/drivers/ilo/ilo_screen.c
index cfa2fb41152..d5a82ce80ae 100644
--- a/src/gallium/drivers/ilo/ilo_screen.c
+++ b/src/gallium/drivers/ilo/ilo_screen.c
@@ -463,6 +463,8 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
    case PIPE_CAP_MAX_VERTEX_STREAMS:
    case PIPE_CAP_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
    case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
    case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
    case PIPE_CAP_SAMPLER_VIEW_TARGET:
@@ -476,6 +478,11 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_CLEAR_TEXTURE:
+   case PIPE_CAP_DRAW_PARAMETERS:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+   case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/ilo/ilo_state.c b/src/gallium/drivers/ilo/ilo_state.c
index d89765a9d23..8dc2d38e039 100644
--- a/src/gallium/drivers/ilo/ilo_state.c
+++ b/src/gallium/drivers/ilo/ilo_state.c
@@ -376,7 +376,7 @@ finalize_cbuf_state(struct ilo_context *ilo,
       if (cbuf->cso[i].resource)
          continue;
 
-      u_upload_data(ilo->uploader, 0, cbuf->cso[i].info.size,
+      u_upload_data(ilo->uploader, 0, cbuf->cso[i].info.size, 16,
             cbuf->cso[i].user_buffer, &offset, &cbuf->cso[i].resource);
 
       cbuf->cso[i].info.vma = ilo_resource_get_vma(cbuf->cso[i].resource);
@@ -426,12 +426,12 @@ finalize_index_buffer(struct ilo_context *ilo)
       unsigned hw_offset;
 
       if (vec->ib.state.user_buffer) {
-         u_upload_data(ilo->uploader, 0, size,
+         u_upload_data(ilo->uploader, 0, size, 16,
                vec->ib.state.user_buffer + offset,
                &hw_offset, &vec->ib.hw_resource);
       } else {
          u_upload_buffer(ilo->uploader, 0,
-               vec->ib.state.offset + offset, size, vec->ib.state.buffer,
+               vec->ib.state.offset + offset, size, 16, vec->ib.state.buffer,
                &hw_offset, &vec->ib.hw_resource);
       }
 
diff --git a/src/gallium/drivers/ilo/shader/ilo_shader_fs.c b/src/gallium/drivers/ilo/shader/ilo_shader_fs.c
index 5250115a893..f46126e8427 100644
--- a/src/gallium/drivers/ilo/shader/ilo_shader_fs.c
+++ b/src/gallium/drivers/ilo/shader/ilo_shader_fs.c
@@ -266,7 +266,7 @@ fs_lower_opcode_tgsi_indirect_const(struct fs_compile_context *fcc,
    struct toy_inst *inst;
    struct toy_src desc, real_src[4];
    struct toy_dst tmp, real_dst[4];
-   int i;
+   unsigned i;
 
    tsrc_transpose(idx, real_src);
 
@@ -319,7 +319,7 @@ fs_lower_opcode_tgsi_const_pcb(struct fs_compile_context *fcc,
    const int grf_subreg = (idx.val32 & 1) * 16;
    struct toy_src src;
    struct toy_dst real_dst[4];
-   int i;
+   unsigned i;
 
    if (!fcc->variant->use_pcb || dim != 0 || idx.file != TOY_FILE_IMM ||
        grf >= fcc->first_attr_grf)
@@ -350,7 +350,7 @@ fs_lower_opcode_tgsi_const_gen6(struct fs_compile_context *fcc,
    struct toy_inst *inst;
    struct toy_src desc;
    struct toy_dst tmp, real_dst[4];
-   int i;
+   unsigned i;
 
    if (fs_lower_opcode_tgsi_const_pcb(fcc, dst, dim, idx))
       return;
@@ -396,7 +396,7 @@ fs_lower_opcode_tgsi_const_gen7(struct fs_compile_context *fcc,
    struct toy_src desc;
    struct toy_inst *inst;
    struct toy_dst tmp, real_dst[4];
-   int i;
+   unsigned i;
 
    if (fs_lower_opcode_tgsi_const_pcb(fcc, dst, dim, idx))
       return;
@@ -1168,7 +1168,7 @@ fs_lower_opcode_derivative(struct toy_compiler *tc, struct toy_inst *inst)
 {
    struct toy_dst dst[4];
    struct toy_src src[4];
-   int i;
+   unsigned i;
 
    tdst_transpose(inst->dst, dst);
    tsrc_transpose(inst->src[0], src);
@@ -1257,7 +1257,7 @@ fs_lower_opcode_kil(struct toy_compiler *tc, struct toy_inst *inst)
    }
    else {
       struct toy_src src[4];
-      int i;
+      unsigned i;
 
       tsrc_transpose(inst->src[0], src);
       /* mask out killed pixels */
@@ -1583,7 +1583,7 @@ fs_write_fb(struct fs_compile_context *fcc)
 static void
 fs_setup_shader_out(struct ilo_shader *sh, const struct toy_tgsi *tgsi)
 {
-   int i;
+   unsigned i;
 
    sh->out.count = tgsi->num_outputs;
    for (i = 0; i < tgsi->num_outputs; i++) {
@@ -1603,7 +1603,7 @@ static void
 fs_setup_shader_in(struct ilo_shader *sh, const struct toy_tgsi *tgsi,
                    bool flatshade)
 {
-   int i;
+   unsigned i;
 
    sh->in.count = tgsi->num_inputs;
    for (i = 0; i < tgsi->num_inputs; i++) {
diff --git a/src/gallium/drivers/ilo/shader/ilo_shader_vs.c b/src/gallium/drivers/ilo/shader/ilo_shader_vs.c
index a29baab10c1..0df0afc706b 100644
--- a/src/gallium/drivers/ilo/shader/ilo_shader_vs.c
+++ b/src/gallium/drivers/ilo/shader/ilo_shader_vs.c
@@ -126,7 +126,7 @@ vs_lower_opcode_tgsi_const_gen6(struct vs_compile_context *vcc,
    tc_MOV(tc, block_offsets, idx);
 
    msg_type = GEN6_MSG_DP_OWORD_DUAL_BLOCK_READ;
-   msg_ctrl = GEN6_MSG_DP_OWORD_DUAL_BLOCK_SIZE_1;;
+   msg_ctrl = GEN6_MSG_DP_OWORD_DUAL_BLOCK_SIZE_1;
    msg_len = 2;
 
    desc = tsrc_imm_mdesc_data_port(tc, false, msg_len, 1, true, false,
@@ -522,7 +522,7 @@ vs_prepare_tgsi_sampling(struct vs_compile_context *vcc,
       if (num_coords >= 3) {
          struct toy_dst tmp, max;
          struct toy_src abs_coords[3];
-         int i;
+         unsigned i;
 
          tmp = tc_alloc_tmp(tc);
          max = tdst_writemask(tmp, TOY_WRITEMASK_W);
@@ -804,7 +804,7 @@ static int
 vs_collect_outputs(struct vs_compile_context *vcc, struct toy_src *outs)
 {
    const struct toy_tgsi *tgsi = &vcc->tgsi;
-   int i;
+   unsigned i;
 
    for (i = 0; i < vcc->shader->out.count; i++) {
       const int slot = vcc->output_map[i];
diff --git a/src/gallium/drivers/ilo/shader/toy_legalize_ra.c b/src/gallium/drivers/ilo/shader/toy_legalize_ra.c
index b725375fb67..1874faa6be3 100644
--- a/src/gallium/drivers/ilo/shader/toy_legalize_ra.c
+++ b/src/gallium/drivers/ilo/shader/toy_legalize_ra.c
@@ -70,7 +70,7 @@ struct linear_scan {
 static void
 linear_scan_free_regs(struct linear_scan *ls, int reg, int count)
 {
-   int i;
+   unsigned i;
 
    for (i = 0; i < count; i++)
       ls->free_regs[ls->num_free_regs++] = reg + count - 1 - i;
@@ -221,7 +221,7 @@ linear_scan_spill(struct linear_scan *ls,
 static void
 linear_scan_spill_range(struct linear_scan *ls, int first, int count)
 {
-   int i;
+   unsigned i;
 
    for (i = 0; i < count; i++) {
       struct linear_scan_live_interval *interval = &ls->intervals[first + i];
diff --git a/src/gallium/drivers/ilo/shader/toy_tgsi.c b/src/gallium/drivers/ilo/shader/toy_tgsi.c
index d38585f1475..9a7140b9a9b 100644
--- a/src/gallium/drivers/ilo/shader/toy_tgsi.c
+++ b/src/gallium/drivers/ilo/shader/toy_tgsi.c
@@ -1593,7 +1593,7 @@ ra_get_type(struct toy_tgsi *tgsi, const struct tgsi_full_instruction *tgsi_inst
       tgsi_inst->Src[operand].Register.File;
    switch (file) {
    case TGSI_FILE_SAMPLER:
-   case TGSI_FILE_RESOURCE:
+   case TGSI_FILE_IMAGE:
    case TGSI_FILE_SAMPLER_VIEW:
       type = TOY_TYPE_D;
       break;
@@ -1834,7 +1834,7 @@ ra_get_src_indirect(struct toy_tgsi *tgsi,
       src = tsrc_null();
       break;
    case TGSI_FILE_SAMPLER:
-   case TGSI_FILE_RESOURCE:
+   case TGSI_FILE_IMAGE:
    case TGSI_FILE_SAMPLER_VIEW:
       is_resource = true;
       /* fall through */
@@ -1918,7 +1918,7 @@ ra_get_src(struct toy_tgsi *tgsi,
       need_vrf = true;
       break;
    case TGSI_FILE_SAMPLER:
-   case TGSI_FILE_RESOURCE:
+   case TGSI_FILE_IMAGE:
    case TGSI_FILE_SAMPLER_VIEW:
       assert(!s->Register.Dimension);
       src = tsrc_imm_d(s->Register.Index);
@@ -2256,7 +2256,7 @@ parse_declaration(struct toy_tgsi *tgsi,
    case TGSI_FILE_SAMPLER:
    case TGSI_FILE_PREDICATE:
    case TGSI_FILE_ADDRESS:
-   case TGSI_FILE_RESOURCE:
+   case TGSI_FILE_IMAGE:
    case TGSI_FILE_SAMPLER_VIEW:
       /* nothing to do */
       break;
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.h b/src/gallium/drivers/llvmpipe/lp_bld_interp.h
index 0a52642e395..9029d2a4180 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_interp.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.h
@@ -63,8 +63,7 @@ enum lp_interp {
    LP_INTERP_LINEAR,
    LP_INTERP_PERSPECTIVE,
    LP_INTERP_POSITION,
-   LP_INTERP_FACING,
-   LP_INTERP_ZERO
+   LP_INTERP_FACING
 };
 
 struct lp_shader_input {
diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h
index 9dcc102e758..62d99bbaac8 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_context.h
@@ -108,28 +108,22 @@ struct llvmpipe_context {
    struct vertex_info vertex_info;
    
    /** Which vertex shader output slot contains color */
-   uint8_t color_slot[2];
+   int8_t color_slot[2];
 
    /** Which vertex shader output slot contains bcolor */
-   uint8_t bcolor_slot[2];
+   int8_t bcolor_slot[2];
 
    /** Which vertex shader output slot contains point size */
-   uint8_t psize_slot;
+   int8_t psize_slot;
 
    /** Which vertex shader output slot contains viewport index */
-   uint8_t viewport_index_slot;
+   int8_t viewport_index_slot;
 
    /** Which geometry shader output slot contains layer */
-   uint8_t layer_slot;
+   int8_t layer_slot;
 
    /** A fake frontface output for unfilled primitives */
-   uint8_t face_slot;
-
-   /** Which output slot is used for the fake vp index info */
-   uint8_t fake_vpindex_slot;
-
-   /** Which output slot is used for the fake layer info */
-   uint8_t fake_layer_slot;
+   int8_t face_slot;
 
    /** Depth format and bias settings. */
    boolean floating_point_depth;
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h
index c19f9318006..db45cbbb057 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast.h
@@ -115,7 +115,7 @@ struct lp_rast_plane {
    int32_t dcdy;
 
    /* one-pixel sized trivial reject offsets for each plane */
-   int64_t eo;
+   uint32_t eo;
 };
 
 /**
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index c9b9221d87c..232c8599e42 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -133,36 +133,8 @@ lp_rast_triangle_4_16(struct lp_rasterizer_task *task,
    lp_rast_triangle_4(task, arg2);
 }
 
-#if !defined(PIPE_ARCH_SSE)
+#if defined(PIPE_ARCH_SSE)
 
-void
-lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
-                         const union lp_rast_cmd_arg arg)
-{
-   union lp_rast_cmd_arg arg2;
-   arg2.triangle.tri = arg.triangle.tri;
-   arg2.triangle.plane_mask = (1<<3)-1;
-   lp_rast_triangle_32_3(task, arg2);
-}
-
-void
-lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task,
-                         const union lp_rast_cmd_arg arg)
-{
-   union lp_rast_cmd_arg arg2;
-   arg2.triangle.tri = arg.triangle.tri;
-   arg2.triangle.plane_mask = (1<<4)-1;
-   lp_rast_triangle_32_4(task, arg2);
-}
-
-void
-lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
-                      const union lp_rast_cmd_arg arg)
-{
-   lp_rast_triangle_32_3_16(task, arg);
-}
-
-#else
 #include <emmintrin.h>
 #include "util/u_sse.h"
 
@@ -265,12 +237,6 @@ sign_bits4(const __m128i *cstep, int cdiff)
 
 #define NR_PLANES 3
 
-
-
-
-
-
-
 void
 lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
                       const union lp_rast_cmd_arg arg)
@@ -381,10 +347,6 @@ lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
                                0xffff & ~out[i].mask);
 }
 
-
-
-
-
 void
 lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
@@ -471,6 +433,254 @@ lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
 }
 
 #undef NR_PLANES
+
+#else
+
+#if defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)
+
+#include <altivec.h>
+#include "util/u_pwr8.h"
+
+static inline void
+build_masks_32(int c,
+               int cdiff,
+               int dcdx,
+               int dcdy,
+               unsigned *outmask,
+               unsigned *partmask)
+{
+   __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
+   __m128i xdcdy = (__m128i) vec_splats(dcdy);
+
+   /* Get values across the quad
+    */
+   __m128i cstep1 = vec_add_epi32(cstep0, xdcdy);
+   __m128i cstep2 = vec_add_epi32(cstep1, xdcdy);
+   __m128i cstep3 = vec_add_epi32(cstep2, xdcdy);
+
+   {
+      __m128i cstep01, cstep23, result;
+
+      cstep01 = vec_packs_epi32(cstep0, cstep1);
+      cstep23 = vec_packs_epi32(cstep2, cstep3);
+      result = vec_packs_epi16(cstep01, cstep23);
+
+      *outmask |= vec_movemask_epi8(result);
+   }
+
+
+   {
+      __m128i cio4 = (__m128i) vec_splats(cdiff);
+      __m128i cstep01, cstep23, result;
+
+      cstep0 = vec_add_epi32(cstep0, cio4);
+      cstep1 = vec_add_epi32(cstep1, cio4);
+      cstep2 = vec_add_epi32(cstep2, cio4);
+      cstep3 = vec_add_epi32(cstep3, cio4);
+
+      cstep01 = vec_packs_epi32(cstep0, cstep1);
+      cstep23 = vec_packs_epi32(cstep2, cstep3);
+      result = vec_packs_epi16(cstep01, cstep23);
+
+      *partmask |= vec_movemask_epi8(result);
+   }
+}
+
+static inline unsigned
+build_mask_linear_32(int c, int dcdx, int dcdy)
+{
+   __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
+   __m128i xdcdy = (__m128i) vec_splats(dcdy);
+
+   /* Get values across the quad
+    */
+   __m128i cstep1 = vec_add_epi32(cstep0, xdcdy);
+   __m128i cstep2 = vec_add_epi32(cstep1, xdcdy);
+   __m128i cstep3 = vec_add_epi32(cstep2, xdcdy);
+
+   /* pack pairs of results into epi16
+    */
+   __m128i cstep01 = vec_packs_epi32(cstep0, cstep1);
+   __m128i cstep23 = vec_packs_epi32(cstep2, cstep3);
+
+   /* pack into epi8, preserving sign bits
+    */
+   __m128i result = vec_packs_epi16(cstep01, cstep23);
+
+   /* extract sign bits to create mask
+    */
+   return vec_movemask_epi8(result);
+}
+
+static inline __m128i
+lp_plane_to_m128i(const struct lp_rast_plane *plane)
+{
+   return vec_setr_epi32((int32_t)plane->c, (int32_t)plane->dcdx,
+                         (int32_t)plane->dcdy, (int32_t)plane->eo);
+}
+
+#define NR_PLANES 3
+
+void
+lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
+                      const union lp_rast_cmd_arg arg)
+{
+   const struct lp_rast_triangle *tri = arg.triangle.tri;
+   const struct lp_rast_plane *plane = GET_PLANES(tri);
+   int x = (arg.triangle.plane_mask & 0xff) + task->x;
+   int y = (arg.triangle.plane_mask >> 8) + task->y;
+   unsigned i, j;
+
+   struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
+   unsigned nr = 0;
+
+   __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */
+   __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */
+   __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */
+   __m128i zero = vec_splats((unsigned char) 0);
+
+   __m128i c;
+   __m128i dcdx;
+   __m128i dcdy;
+   __m128i rej4;
+
+   __m128i dcdx2;
+   __m128i dcdx3;
+
+   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
+   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
+   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
+   __m128i unused;
+
+   __m128i vshuf_mask0;
+   __m128i vshuf_mask1;
+   __m128i vshuf_mask2;
+
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+   vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x03020100);
+   vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x07060504);
+   vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x0B0A0908);
+#else
+   vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x0C0D0E0F);
+   vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x08090A0B);
+   vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x04050607);
+#endif
+
+   transpose4_epi32(&p0, &p1, &p2, &zero,
+                    &c, &dcdx, &dcdy, &rej4);
+
+   /* Adjust dcdx;
+    */
+   dcdx = vec_sub_epi32(zero, dcdx);
+
+   c = vec_add_epi32(c, vec_mullo_epi32(dcdx, (__m128i) vec_splats(x)));
+   c = vec_add_epi32(c, vec_mullo_epi32(dcdy, (__m128i) vec_splats(y)));
+   rej4 = vec_slli_epi32(rej4, 2);
+
+   /*
+    * Adjust so we can just check the sign bit (< 0 comparison),
+    * instead of having to do a less efficient <= 0 comparison
+    */
+   c = vec_sub_epi32(c, (__m128i) vec_splats((unsigned int) 1));
+   rej4 = vec_add_epi32(rej4, (__m128i) vec_splats((unsigned int) 1));
+
+   dcdx2 = vec_add_epi32(dcdx, dcdx);
+   dcdx3 = vec_add_epi32(dcdx2, dcdx);
+
+   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
+                    &span_0, &span_1, &span_2, &unused);
+
+   for (i = 0; i < 4; i++) {
+      __m128i cx = c;
+
+      for (j = 0; j < 4; j++) {
+         __m128i c4rej = vec_add_epi32(cx, rej4);
+         __m128i rej_masks = vec_srai_epi32(c4rej, 31);
+
+         /* if (is_zero(rej_masks)) */
+         if (vec_movemask_epi8(rej_masks) == 0) {
+            __m128i c0_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask0), span_0);
+            __m128i c1_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask1), span_1);
+            __m128i c2_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask2), span_2);
+
+            __m128i c_0 = vec_or(vec_or(c0_0, c1_0), c2_0);
+
+            __m128i c0_1 = vec_add_epi32(c0_0, vec_perm(dcdy, dcdy, vshuf_mask0));
+            __m128i c1_1 = vec_add_epi32(c1_0, vec_perm(dcdy, dcdy, vshuf_mask1));
+            __m128i c2_1 = vec_add_epi32(c2_0, vec_perm(dcdy, dcdy, vshuf_mask2));
+
+            __m128i c_1 = vec_or(vec_or(c0_1, c1_1), c2_1);
+            __m128i c_01 = vec_packs_epi32(c_0, c_1);
+
+            __m128i c0_2 = vec_add_epi32(c0_1, vec_perm(dcdy, dcdy, vshuf_mask0));
+            __m128i c1_2 = vec_add_epi32(c1_1, vec_perm(dcdy, dcdy, vshuf_mask1));
+            __m128i c2_2 = vec_add_epi32(c2_1, vec_perm(dcdy, dcdy, vshuf_mask2));
+
+            __m128i c_2 = vec_or(vec_or(c0_2, c1_2), c2_2);
+
+            __m128i c0_3 = vec_add_epi32(c0_2, vec_perm(dcdy, dcdy, vshuf_mask0));
+            __m128i c1_3 = vec_add_epi32(c1_2, vec_perm(dcdy, dcdy, vshuf_mask1));
+            __m128i c2_3 = vec_add_epi32(c2_2, vec_perm(dcdy, dcdy, vshuf_mask2));
+
+            __m128i c_3 = vec_or(vec_or(c0_3, c1_3), c2_3);
+            __m128i c_23 = vec_packs_epi32(c_2, c_3);
+            __m128i c_0123 = vec_packs_epi16(c_01, c_23);
+
+            unsigned mask = vec_movemask_epi8(c_0123);
+
+            out[nr].i = i;
+            out[nr].j = j;
+            out[nr].mask = mask;
+            if (mask != 0xffff)
+               nr++;
+         }
+         cx = vec_add_epi32(cx, vec_slli_epi32(dcdx, 2));
+      }
+
+      c = vec_add_epi32(c, vec_slli_epi32(dcdy, 2));
+   }
+
+   for (i = 0; i < nr; i++)
+      lp_rast_shade_quads_mask(task,
+                               &tri->inputs,
+                               x + 4 * out[i].j,
+                               y + 4 * out[i].i,
+                               0xffff & ~out[i].mask);
+}
+
+#undef NR_PLANES
+
+#else
+
+void
+lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
+                         const union lp_rast_cmd_arg arg)
+{
+   union lp_rast_cmd_arg arg2;
+   arg2.triangle.tri = arg.triangle.tri;
+   arg2.triangle.plane_mask = (1<<3)-1;
+   lp_rast_triangle_32_3(task, arg2);
+}
+
+#endif /* _ARCH_PWR8 && PIPE_ARCH_LITTLE_ENDIAN */
+
+void
+lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task,
+                         const union lp_rast_cmd_arg arg)
+{
+   union lp_rast_cmd_arg arg2;
+   arg2.triangle.tri = arg.triangle.tri;
+   arg2.triangle.plane_mask = (1<<4)-1;
+   lp_rast_triangle_32_4(task, arg2);
+}
+
+void
+lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
+                      const union lp_rast_cmd_arg arg)
+{
+   lp_rast_triangle_32_3_16(task, arg);
+}
+
 #endif
 
 
@@ -512,7 +722,7 @@ lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
 #define NR_PLANES 8
 #include "lp_rast_tri_tmp.h"
 
-#ifdef PIPE_ARCH_SSE
+#if defined(PIPE_ARCH_SSE) || (defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN))
 #undef BUILD_MASKS
 #undef BUILD_MASK_LINEAR
 #define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_32((int)c, (int)cdiff, dcdx, dcdy, omask, pmask)
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
index 52f6e999683..e0aea94205e 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
@@ -82,7 +82,7 @@ TAG(do_block_16)(struct lp_rasterizer_task *task,
       const int64_t dcdx = -IMUL64(plane[j].dcdx, 4);
       const int64_t dcdy = IMUL64(plane[j].dcdy, 4);
       const int64_t cox = IMUL64(plane[j].eo, 4);
-      const int64_t ei = plane[j].dcdy - plane[j].dcdx - plane[j].eo;
+      const int64_t ei = plane[j].dcdy - plane[j].dcdx - (int64_t)plane[j].eo;
       const int64_t cio = IMUL64(ei, 4) - 1;
 
       BUILD_MASKS(c[j] + cox,
@@ -182,7 +182,7 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
          const int64_t dcdx = -IMUL64(plane[j].dcdx, 16);
          const int64_t dcdy = IMUL64(plane[j].dcdy, 16);
          const int64_t cox = IMUL64(plane[j].eo, 16);
-         const int64_t ei = plane[j].dcdy - plane[j].dcdx - plane[j].eo;
+         const int64_t ei = plane[j].dcdy - plane[j].dcdx - (int64_t)plane[j].eo;
          const int64_t cio = IMUL64(ei, 16) - 1;
 
          BUILD_MASKS(c[j] + cox,
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index 899f28da7d3..e29b008c7e8 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -301,6 +301,13 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_CLEAR_TEXTURE:
+   case PIPE_CAP_DRAW_PARAMETERS:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
+   case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
       return 0;
    }
    /* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
index ddbb88eb107..bd850519468 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -486,6 +486,11 @@ lp_setup_try_clear_zs(struct lp_setup_context *setup,
                                    depth,
                                    stencil);
 
+   /*
+    * XXX: should make a full mask here for things like D24X8,
+    * otherwise we'll do a read-modify-write clear later which
+    * should be unnecessary.
+    */
    zsmask = util_pack64_mask_z_stencil(setup->fb.zsbuf->format,
                                        zmask32,
                                        smask8);
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_context.h b/src/gallium/drivers/llvmpipe/lp_setup_context.h
index 4451284c303..80acd74bddd 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_setup_context.h
@@ -105,10 +105,10 @@ struct lp_setup_context
    float pixel_offset;
    float line_width;
    float point_size;
-   uint8_t psize_slot;
-   uint8_t viewport_index_slot;
-   uint8_t layer_slot;
-   uint8_t face_slot;
+   int8_t psize_slot;
+   int8_t viewport_index_slot;
+   int8_t layer_slot;
+   int8_t face_slot;
 
    struct pipe_framebuffer_state fb;
    struct u_rect framebuffer;
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c b/src/gallium/drivers/llvmpipe/lp_setup_line.c
index fac1cd61d77..a0de599c9c6 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_line.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c
@@ -644,19 +644,25 @@ try_setup_line( struct lp_setup_context *setup,
    line->inputs.layer = layer;
    line->inputs.viewport_index = viewport_index;
 
+   /*
+    * XXX: this code is mostly identical to the one in lp_setup_tri, except it
+    * uses 4 planes instead of 3. Could share the code (including the sse
+    * assembly, in fact we'd get the 4th plane for free).
+    * The only difference apart from storing the 4th plane would be some
+    * different shuffle for calculating dcdx/dcdy.
+    */
    for (i = 0; i < 4; i++) {
 
-      /* half-edge constants, will be interated over the whole render
+      /* half-edge constants, will be iterated over the whole render
        * target.
        */
       plane[i].c = IMUL64(plane[i].dcdx, x[i]) - IMUL64(plane[i].dcdy, y[i]);
 
-      
-      /* correct for top-left vs. bottom-left fill convention.  
-       */         
+      /* correct for top-left vs. bottom-left fill convention.
+       */
       if (plane[i].dcdx < 0) {
          /* both fill conventions want this - adjust for left edges */
-         plane[i].c++;            
+         plane[i].c++;
       }
       else if (plane[i].dcdx == 0) {
          if (setup->pixel_offset == 0) {
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index b1671dd0ae2..358da442ea7 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -46,6 +46,9 @@
 
 #if defined(PIPE_ARCH_SSE)
 #include <emmintrin.h>
+#elif defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)
+#include <altivec.h>
+#include "util/u_pwr8.h"
 #endif
 
 static inline int
@@ -387,25 +390,21 @@ do_triangle_ccw(struct lp_setup_context *setup,
    plane = GET_PLANES(tri);
 
 #if defined(PIPE_ARCH_SSE)
-   if (setup->fb.width <= MAX_FIXED_LENGTH32 &&
-       setup->fb.height <= MAX_FIXED_LENGTH32 &&
-       (bbox.x1 - bbox.x0) <= MAX_FIXED_LENGTH32 &&
-       (bbox.y1 - bbox.y0) <= MAX_FIXED_LENGTH32) {
+   if (1) {
       __m128i vertx, verty;
       __m128i shufx, shufy;
-      __m128i dcdx, dcdy, c;
-      __m128i unused;
+      __m128i dcdx, dcdy;
+      __m128i cdx02, cdx13, cdy02, cdy13, c02, c13;
+      __m128i c01, c23, unused;
       __m128i dcdx_neg_mask;
       __m128i dcdy_neg_mask;
       __m128i dcdx_zero_mask;
-      __m128i top_left_flag;
-      __m128i c_inc_mask, c_inc;
+      __m128i top_left_flag, c_dec;
       __m128i eo, p0, p1, p2;
       __m128i zero = _mm_setzero_si128();
-      PIPE_ALIGN_VAR(16) int32_t temp_vec[4];
 
-      vertx = _mm_loadu_si128((__m128i *)position->x); /* vertex x coords */
-      verty = _mm_loadu_si128((__m128i *)position->y); /* vertex y coords */
+      vertx = _mm_load_si128((__m128i *)position->x); /* vertex x coords */
+      verty = _mm_load_si128((__m128i *)position->y); /* vertex y coords */
 
       shufx = _mm_shuffle_epi32(vertx, _MM_SHUFFLE(3,0,2,1));
       shufy = _mm_shuffle_epi32(verty, _MM_SHUFFLE(3,0,2,1));
@@ -419,42 +418,161 @@ do_triangle_ccw(struct lp_setup_context *setup,
 
       top_left_flag = _mm_set1_epi32((setup->bottom_edge_rule == 0) ? ~0 : 0);
 
-      c_inc_mask = _mm_or_si128(dcdx_neg_mask,
-                                _mm_and_si128(dcdx_zero_mask,
-                                              _mm_xor_si128(dcdy_neg_mask,
-                                                            top_left_flag)));
-
-      c_inc = _mm_srli_epi32(c_inc_mask, 31);
-
-      c = _mm_sub_epi32(mm_mullo_epi32(dcdx, vertx),
-                        mm_mullo_epi32(dcdy, verty));
+      c_dec = _mm_or_si128(dcdx_neg_mask,
+                           _mm_and_si128(dcdx_zero_mask,
+                                         _mm_xor_si128(dcdy_neg_mask,
+                                                       top_left_flag)));
 
-      c = _mm_add_epi32(c, c_inc);
+      /*
+       * 64 bit arithmetic.
+       * Note we need _signed_ mul (_mm_mul_epi32) which we emulate.
+       */
+      cdx02 = mm_mullohi_epi32(dcdx, vertx, &cdx13);
+      cdy02 = mm_mullohi_epi32(dcdy, verty, &cdy13);
+      c02 = _mm_sub_epi64(cdx02, cdy02);
+      c13 = _mm_sub_epi64(cdx13, cdy13);
+      c02 = _mm_sub_epi64(c02, _mm_shuffle_epi32(c_dec,
+                                                 _MM_SHUFFLE(2,2,0,0)));
+      c13 = _mm_sub_epi64(c13, _mm_shuffle_epi32(c_dec,
+                                                 _MM_SHUFFLE(3,3,1,1)));
+
+      /*
+       * Useful for very small fbs/tris (or fewer subpixel bits) only:
+       * c = _mm_sub_epi32(mm_mullo_epi32(dcdx, vertx),
+       *                   mm_mullo_epi32(dcdy, verty));
+       *
+       * c = _mm_sub_epi32(c, c_dec);
+       */
 
       /* Scale up to match c:
        */
       dcdx = _mm_slli_epi32(dcdx, FIXED_ORDER);
       dcdy = _mm_slli_epi32(dcdy, FIXED_ORDER);
 
-      /* Calculate trivial reject values:
+      /*
+       * Calculate trivial reject values:
+       * Note eo cannot overflow even if dcdx/dcdy would already have
+       * 31 bits (which they shouldn't have). This is because eo
+       * is never negative (albeit if we rely on that need to be careful...)
        */
       eo = _mm_sub_epi32(_mm_andnot_si128(dcdy_neg_mask, dcdy),
                          _mm_and_si128(dcdx_neg_mask, dcdx));
 
       /* ei = _mm_sub_epi32(_mm_sub_epi32(dcdy, dcdx), eo); */
 
+      /*
+       * Pointless transpose which gets undone immediately in
+       * rasterization.
+       * It is actually difficult to do away with it - would essentially
+       * need GET_PLANES_DX, GET_PLANES_DY etc., but the calculations
+       * for this then would need to depend on the number of planes.
+       * The transpose is quite special here due to c being 64bit...
+       * The store has to be unaligned (unless we'd make the plane size
+       * a multiple of 128), and of course storing eo separately...
+       */
+      c01 = _mm_unpacklo_epi64(c02, c13);
+      c23 = _mm_unpackhi_epi64(c02, c13);
+      transpose2_64_2_32(&c01, &c23, &dcdx, &dcdy,
+                         &p0, &p1, &p2, &unused);
+      _mm_storeu_si128((__m128i *)&plane[0], p0);
+      plane[0].eo = (uint32_t)_mm_cvtsi128_si32(eo);
+      _mm_storeu_si128((__m128i *)&plane[1], p1);
+      eo = _mm_shuffle_epi32(eo, _MM_SHUFFLE(3,2,0,1));
+      plane[1].eo = (uint32_t)_mm_cvtsi128_si32(eo);
+      _mm_storeu_si128((__m128i *)&plane[2], p2);
+      eo = _mm_shuffle_epi32(eo, _MM_SHUFFLE(0,0,0,2));
+      plane[2].eo = (uint32_t)_mm_cvtsi128_si32(eo);
+   } else
+#elif defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)
+   /*
+    * XXX this code is effectively disabled for all practical purposes,
+    * as the allowed fb size is tiny if FIXED_ORDER is 8.
+    */
+   if (setup->fb.width <= MAX_FIXED_LENGTH32 &&
+       setup->fb.height <= MAX_FIXED_LENGTH32 &&
+       (bbox.x1 - bbox.x0) <= MAX_FIXED_LENGTH32 &&
+       (bbox.y1 - bbox.y0) <= MAX_FIXED_LENGTH32) {
+      unsigned int bottom_edge;
+      __m128i vertx, verty;
+      __m128i shufx, shufy;
+      __m128i dcdx, dcdy, c;
+      __m128i unused;
+      __m128i dcdx_neg_mask;
+      __m128i dcdy_neg_mask;
+      __m128i dcdx_zero_mask;
+      __m128i top_left_flag;
+      __m128i c_inc_mask, c_inc;
+      __m128i eo, p0, p1, p2;
+      __m128i_union vshuf_mask;
+      __m128i zero = vec_splats((unsigned char) 0);
+      PIPE_ALIGN_VAR(16) int32_t temp_vec[4];
+
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+      vshuf_mask.i[0] = 0x07060504;
+      vshuf_mask.i[1] = 0x0B0A0908;
+      vshuf_mask.i[2] = 0x03020100;
+      vshuf_mask.i[3] = 0x0F0E0D0C;
+#else
+      vshuf_mask.i[0] = 0x00010203;
+      vshuf_mask.i[1] = 0x0C0D0E0F;
+      vshuf_mask.i[2] = 0x04050607;
+      vshuf_mask.i[3] = 0x08090A0B;
+#endif
+
+      /* vertex x coords */
+      vertx = vec_load_si128((const uint32_t *) position->x);
+      /* vertex y coords */
+      verty = vec_load_si128((const uint32_t *) position->y);
+
+      shufx = vec_perm (vertx, vertx, vshuf_mask.m128i);
+      shufy = vec_perm (verty, verty, vshuf_mask.m128i);
+
+      dcdx = vec_sub_epi32(verty, shufy);
+      dcdy = vec_sub_epi32(vertx, shufx);
+
+      dcdx_neg_mask = vec_srai_epi32(dcdx, 31);
+      dcdx_zero_mask = vec_cmpeq_epi32(dcdx, zero);
+      dcdy_neg_mask = vec_srai_epi32(dcdy, 31);
+
+      bottom_edge = (setup->bottom_edge_rule == 0) ? ~0 : 0;
+      top_left_flag = (__m128i) vec_splats(bottom_edge);
+
+      c_inc_mask = vec_or(dcdx_neg_mask,
+                                vec_and(dcdx_zero_mask,
+                                              vec_xor(dcdy_neg_mask,
+                                                            top_left_flag)));
+
+      c_inc = vec_srli_epi32(c_inc_mask, 31);
+
+      c = vec_sub_epi32(vec_mullo_epi32(dcdx, vertx),
+                        vec_mullo_epi32(dcdy, verty));
+
+      c = vec_add_epi32(c, c_inc);
+
+      /* Scale up to match c:
+       */
+      dcdx = vec_slli_epi32(dcdx, FIXED_ORDER);
+      dcdy = vec_slli_epi32(dcdy, FIXED_ORDER);
+
+      /* Calculate trivial reject values:
+       */
+      eo = vec_sub_epi32(vec_andc(dcdy_neg_mask, dcdy),
+                         vec_and(dcdx_neg_mask, dcdx));
+
+      /* ei = _mm_sub_epi32(_mm_sub_epi32(dcdy, dcdx), eo); */
+
       /* Pointless transpose which gets undone immediately in
        * rasterization:
        */
       transpose4_epi32(&c, &dcdx, &dcdy, &eo,
                        &p0, &p1, &p2, &unused);
 
-#define STORE_PLANE(plane, vec) do {                 \
-         _mm_store_si128((__m128i *)&temp_vec, vec); \
-         plane.c    = (int64_t)temp_vec[0];          \
-         plane.dcdx = temp_vec[1];                   \
-         plane.dcdy = temp_vec[2];                   \
-         plane.eo   = temp_vec[3];                   \
+#define STORE_PLANE(plane, vec) do {                  \
+         vec_store_si128((uint32_t *)&temp_vec, vec); \
+         plane.c    = (int64_t)temp_vec[0];           \
+         plane.dcdx = temp_vec[1];                    \
+         plane.dcdy = temp_vec[2];                    \
+         plane.eo   = temp_vec[3];                    \
       } while(0)
 
       STORE_PLANE(plane[0], p0);
@@ -473,17 +591,17 @@ do_triangle_ccw(struct lp_setup_context *setup,
       plane[2].dcdx = position->dy20;
   
       for (i = 0; i < 3; i++) {
-         /* half-edge constants, will be interated over the whole render
+         /* half-edge constants, will be iterated over the whole render
           * target.
           */
          plane[i].c = IMUL64(plane[i].dcdx, position->x[i]) -
-               IMUL64(plane[i].dcdy, position->y[i]);
+                      IMUL64(plane[i].dcdy, position->y[i]);
 
          /* correct for top-left vs. bottom-left fill convention.
-          */         
+          */
          if (plane[i].dcdx < 0) {
             /* both fill conventions want this - adjust for left edges */
-            plane[i].c++;            
+            plane[i].c++;
          }
          else if (plane[i].dcdx == 0) {
             if (setup->bottom_edge_rule == 0){
@@ -517,19 +635,19 @@ do_triangle_ccw(struct lp_setup_context *setup,
    }
 
    if (0) {
-      debug_printf("p0: %"PRIx64"/%08x/%08x/%"PRIx64"\n",
+      debug_printf("p0: %"PRIx64"/%08x/%08x/%08x\n",
                    plane[0].c,
                    plane[0].dcdx,
                    plane[0].dcdy,
                    plane[0].eo);
-      
-      debug_printf("p1: %"PRIx64"/%08x/%08x/%"PRIx64"\n",
+
+      debug_printf("p1: %"PRIx64"/%08x/%08x/%08x\n",
                    plane[1].c,
                    plane[1].dcdx,
                    plane[1].dcdy,
                    plane[1].eo);
-      
-      debug_printf("p2: %"PRIx64"/%08x/%08x/%"PRIx64"\n",
+
+      debug_printf("p2: %"PRIx64"/%08x/%08x/%08x\n",
                    plane[2].c,
                    plane[2].dcdx,
                    plane[2].dcdy,
@@ -590,7 +708,7 @@ do_triangle_ccw(struct lp_setup_context *setup,
 static inline uint32_t 
 floor_pot(uint32_t n)
 {
-#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86)
+#if defined(PIPE_CC_GCC) && (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64))
    if (n == 0)
       return 0;
 
@@ -738,9 +856,9 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
 
          ei[i] = (plane[i].dcdy - 
                   plane[i].dcdx - 
-                  plane[i].eo) << TILE_ORDER;
+                  (int64_t)plane[i].eo) << TILE_ORDER;
 
-         eo[i] = plane[i].eo << TILE_ORDER;
+         eo[i] = (int64_t)plane[i].eo << TILE_ORDER;
          xstep[i] = -(((int64_t)plane[i].dcdx) << TILE_ORDER);
          ystep[i] = ((int64_t)plane[i].dcdy) << TILE_ORDER;
       }
@@ -932,12 +1050,12 @@ rotate_fixed_position_12( struct fixed_position* position )
 /**
  * Draw triangle if it's CW, cull otherwise.
  */
-static void triangle_cw( struct lp_setup_context *setup,
-			 const float (*v0)[4],
-			 const float (*v1)[4],
-			 const float (*v2)[4] )
+static void triangle_cw(struct lp_setup_context *setup,
+                        const float (*v0)[4],
+                        const float (*v1)[4],
+                        const float (*v2)[4])
 {
-   struct fixed_position position;
+   PIPE_ALIGN_VAR(16) struct fixed_position position;
 
    calc_fixed_position(setup, &position, v0, v1, v2);
 
@@ -953,12 +1071,12 @@ static void triangle_cw( struct lp_setup_context *setup,
 }
 
 
-static void triangle_ccw( struct lp_setup_context *setup,
-                          const float (*v0)[4],
-                          const float (*v1)[4],
-                          const float (*v2)[4])
+static void triangle_ccw(struct lp_setup_context *setup,
+                         const float (*v0)[4],
+                         const float (*v1)[4],
+                         const float (*v2)[4])
 {
-   struct fixed_position position;
+   PIPE_ALIGN_VAR(16) struct fixed_position position;
 
    calc_fixed_position(setup, &position, v0, v1, v2);
 
@@ -969,12 +1087,12 @@ static void triangle_ccw( struct lp_setup_context *setup,
 /**
  * Draw triangle whether it's CW or CCW.
  */
-static void triangle_both( struct lp_setup_context *setup,
-			   const float (*v0)[4],
-			   const float (*v1)[4],
-			   const float (*v2)[4] )
+static void triangle_both(struct lp_setup_context *setup,
+                          const float (*v0)[4],
+                          const float (*v1)[4],
+                          const float (*v2)[4])
 {
-   struct fixed_position position;
+   PIPE_ALIGN_VAR(16) struct fixed_position position;
    struct llvmpipe_context *lp_context = (struct llvmpipe_context *)setup->pipe;
 
    if (lp_context->active_statistics_queries &&
diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c
index f5bcfb2b511..34961cbbac5 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_derived.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c
@@ -48,21 +48,26 @@
 static void
 compute_vertex_info(struct llvmpipe_context *llvmpipe)
 {
-   const struct lp_fragment_shader *lpfs = llvmpipe->fs;
+   const struct tgsi_shader_info *fsInfo = &llvmpipe->fs->info.base;
    struct vertex_info *vinfo = &llvmpipe->vertex_info;
    int vs_index;
    uint i;
 
    draw_prepare_shader_outputs(llvmpipe->draw);
 
-   llvmpipe->color_slot[0] = 0;
-   llvmpipe->color_slot[1] = 0;
-   llvmpipe->bcolor_slot[0] = 0;
-   llvmpipe->bcolor_slot[1] = 0;
-   llvmpipe->viewport_index_slot = 0;
-   llvmpipe->layer_slot = 0;
-   llvmpipe->face_slot = 0;
-   llvmpipe->psize_slot = 0;
+   /*
+    * Those can't actually be 0 (because pos is always at 0).
+    * But use ints anyway to avoid confusion (in vs outputs, they
+    * can very well be at pos 0).
+    */
+   llvmpipe->color_slot[0] = -1;
+   llvmpipe->color_slot[1] = -1;
+   llvmpipe->bcolor_slot[0] = -1;
+   llvmpipe->bcolor_slot[1] = -1;
+   llvmpipe->viewport_index_slot = -1;
+   llvmpipe->layer_slot = -1;
+   llvmpipe->face_slot = -1;
+   llvmpipe->psize_slot = -1;
 
    /*
     * Match FS inputs against VS outputs, emitting the necessary
@@ -73,60 +78,49 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
    vinfo->num_attribs = 0;
 
    vs_index = draw_find_shader_output(llvmpipe->draw,
-                                      TGSI_SEMANTIC_POSITION,
-                                      0);
+                                      TGSI_SEMANTIC_POSITION, 0);
 
-   draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, vs_index);
+   draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
 
-   for (i = 0; i < lpfs->info.base.num_inputs; i++) {
+   for (i = 0; i < fsInfo->num_inputs; i++) {
       /*
        * Search for each input in current vs output:
        */
-
       vs_index = draw_find_shader_output(llvmpipe->draw,
-                                         lpfs->info.base.input_semantic_name[i],
-                                         lpfs->info.base.input_semantic_index[i]);
+                                         fsInfo->input_semantic_name[i],
+                                         fsInfo->input_semantic_index[i]);
 
-      if (lpfs->info.base.input_semantic_name[i] == TGSI_SEMANTIC_COLOR &&
-          lpfs->info.base.input_semantic_index[i] < 2) {
-         int idx = lpfs->info.base.input_semantic_index[i];
-         llvmpipe->color_slot[idx] = vinfo->num_attribs;
+      if (fsInfo->input_semantic_name[i] == TGSI_SEMANTIC_COLOR &&
+          fsInfo->input_semantic_index[i] < 2) {
+         int idx = fsInfo->input_semantic_index[i];
+         llvmpipe->color_slot[idx] = (int)vinfo->num_attribs;
       }
 
-      if (lpfs->info.base.input_semantic_name[i] == TGSI_SEMANTIC_FACE) {
-         llvmpipe->face_slot = vinfo->num_attribs;
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
-      } else if (lpfs->info.base.input_semantic_name[i] == TGSI_SEMANTIC_PRIMID) {
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
+      if (fsInfo->input_semantic_name[i] == TGSI_SEMANTIC_FACE) {
+         llvmpipe->face_slot = (int)vinfo->num_attribs;
+         draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
       /*
        * For vp index and layer, if the fs requires them but the vs doesn't
-       * provide them, store the slot - we'll later replace the data directly
-       * with zero (as required by ARB_fragment_layer_viewport). This is
-       * because draw itself just redirects them to whatever was at output 0.
-       * We'll also store the real vpindex/layer slot for setup use.
+       * provide them, draw (vbuf) will give us the required 0 (slot -1).
+       * (This means in this case we'll also use those slots in setup, which
+       * isn't necessary but they'll contain the correct (0) value.)
        */
-      } else if (lpfs->info.base.input_semantic_name[i] ==
+      } else if (fsInfo->input_semantic_name[i] ==
                  TGSI_SEMANTIC_VIEWPORT_INDEX) {
-         if (vs_index >= 0) {
-            llvmpipe->viewport_index_slot = vinfo->num_attribs;
-         }
-         else {
-            llvmpipe->fake_vpindex_slot = vinfo->num_attribs;
-         }
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
-      } else if (lpfs->info.base.input_semantic_name[i] == TGSI_SEMANTIC_LAYER) {
-         if (vs_index >= 0) {
-            llvmpipe->layer_slot = vinfo->num_attribs;
-         }
-         else {
-            llvmpipe->fake_layer_slot = vinfo->num_attribs;
-         }
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
+         llvmpipe->viewport_index_slot = (int)vinfo->num_attribs;
+         draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
+      } else if (fsInfo->input_semantic_name[i] == TGSI_SEMANTIC_LAYER) {
+         llvmpipe->layer_slot = (int)vinfo->num_attribs;
+         draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
       } else {
          /*
-          * Emit the requested fs attribute for all but position.
+          * Note that we'd actually want to skip position (as we won't use
+          * the attribute in the fs) but can't. The reason is that we don't
+          * actually have a input/output map for setup (even though it looks
+          * like we do...). Could adjust for this though even without a map
+          * (in llvmpipe_create_fs_state()).
           */
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, vs_index);
+         draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
       }
    }
 
@@ -137,8 +131,8 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
                                          TGSI_SEMANTIC_BCOLOR, i);
 
       if (vs_index >= 0) {
-         llvmpipe->bcolor_slot[i] = vinfo->num_attribs;
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, vs_index);
+         llvmpipe->bcolor_slot[i] = (int)vinfo->num_attribs;
+         draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
       }
    }
 
@@ -148,29 +142,29 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
                                       TGSI_SEMANTIC_PSIZE, 0);
 
    if (vs_index >= 0) {
-      llvmpipe->psize_slot = vinfo->num_attribs;
-      draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
+      llvmpipe->psize_slot = (int)vinfo->num_attribs;
+      draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
    }
 
    /* Figure out if we need viewport index (if it wasn't already in fs input) */
-   if (llvmpipe->viewport_index_slot == 0) {
+   if (llvmpipe->viewport_index_slot < 0) {
       vs_index = draw_find_shader_output(llvmpipe->draw,
                                          TGSI_SEMANTIC_VIEWPORT_INDEX,
                                          0);
       if (vs_index >= 0) {
-         llvmpipe->viewport_index_slot = vinfo->num_attribs;
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
+         llvmpipe->viewport_index_slot =(int)vinfo->num_attribs;
+         draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
       }
    }
 
    /* Figure out if we need layer (if it wasn't already in fs input) */
-   if (llvmpipe->layer_slot == 0) {
+   if (llvmpipe->layer_slot < 0) {
       vs_index = draw_find_shader_output(llvmpipe->draw,
                                          TGSI_SEMANTIC_LAYER,
                                          0);
       if (vs_index >= 0) {
-         llvmpipe->layer_slot = vinfo->num_attribs;
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
+         llvmpipe->layer_slot = (int)vinfo->num_attribs;
+         draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
       }
    }
 
@@ -197,10 +191,9 @@ void llvmpipe_update_derived( struct llvmpipe_context *llvmpipe )
       llvmpipe->dirty |= LP_NEW_SAMPLER_VIEW;
    }
       
-   if (llvmpipe->dirty & (LP_NEW_RASTERIZER |
-                          LP_NEW_FS |
+   if (llvmpipe->dirty & (LP_NEW_FS |
                           LP_NEW_VS))
-      compute_vertex_info( llvmpipe );
+      compute_vertex_info(llvmpipe);
 
    if (llvmpipe->dirty & (LP_NEW_FS |
                           LP_NEW_FRAMEBUFFER |
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index 079083e9601..83ff97659fb 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -2695,34 +2695,35 @@ llvmpipe_create_fs_state(struct pipe_context *pipe,
 
       switch (shader->info.base.input_interpolate[i]) {
       case TGSI_INTERPOLATE_CONSTANT:
-	 shader->inputs[i].interp = LP_INTERP_CONSTANT;
-	 break;
+         shader->inputs[i].interp = LP_INTERP_CONSTANT;
+         break;
       case TGSI_INTERPOLATE_LINEAR:
-	 shader->inputs[i].interp = LP_INTERP_LINEAR;
-	 break;
+         shader->inputs[i].interp = LP_INTERP_LINEAR;
+         break;
       case TGSI_INTERPOLATE_PERSPECTIVE:
-	 shader->inputs[i].interp = LP_INTERP_PERSPECTIVE;
-	 break;
+         shader->inputs[i].interp = LP_INTERP_PERSPECTIVE;
+         break;
       case TGSI_INTERPOLATE_COLOR:
-	 shader->inputs[i].interp = LP_INTERP_COLOR;
-	 break;
+         shader->inputs[i].interp = LP_INTERP_COLOR;
+         break;
       default:
-	 assert(0);
-	 break;
+         assert(0);
+         break;
       }
 
       switch (shader->info.base.input_semantic_name[i]) {
       case TGSI_SEMANTIC_FACE:
-	 shader->inputs[i].interp = LP_INTERP_FACING;
-	 break;
+         shader->inputs[i].interp = LP_INTERP_FACING;
+         break;
       case TGSI_SEMANTIC_POSITION:
-	 /* Position was already emitted above
-	  */
-	 shader->inputs[i].interp = LP_INTERP_POSITION;
-	 shader->inputs[i].src_index = 0;
-	 continue;
+         /* Position was already emitted above
+          */
+         shader->inputs[i].interp = LP_INTERP_POSITION;
+         shader->inputs[i].src_index = 0;
+         continue;
       }
 
+      /* XXX this is a completely pointless index map... */
       shader->inputs[i].src_index = i+1;
    }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_setup.c b/src/gallium/drivers/llvmpipe/lp_state_setup.c
index d7ba5c8ad8e..6a4fbbbf202 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_setup.c
@@ -372,9 +372,9 @@ load_attribute(struct gallivm_state *gallivm,
    /* Potentially modify it according to twoside, etc:
     */
    if (key->twoside) {
-      if (vert_attr == key->color_slot && key->bcolor_slot > 0)
+      if (vert_attr == key->color_slot && key->bcolor_slot >= 0)
          lp_twoside(gallivm, args, key, key->bcolor_slot, attribv);
-      else if (vert_attr == key->spec_slot && key->bspec_slot > 0)
+      else if (vert_attr == key->spec_slot && key->bspec_slot >= 0)
          lp_twoside(gallivm, args, key, key->bspec_slot, attribv);
    }
 }
@@ -602,13 +602,6 @@ emit_tri_coef( struct gallivm_state *gallivm,
           */
          break;
 
-      case LP_INTERP_ZERO:
-         /*
-          * The information we get from the output is bogus, replace it
-          * with zero.
-          */
-         emit_constant_coef4(gallivm, args, slot+1, args->bld.zero);
-         break;
       case LP_INTERP_FACING:
          emit_facing_coef(gallivm, args, slot+1);
          break;
@@ -879,13 +872,7 @@ lp_make_setup_variant_key(struct llvmpipe_context *lp,
    key->pad = 0;
    memcpy(key->inputs, fs->inputs, key->num_inputs * sizeof key->inputs[0]);
    for (i = 0; i < key->num_inputs; i++) {
-      if (key->inputs[i].interp == LP_INTERP_CONSTANT) {
-         if (key->inputs[i].src_index == lp->fake_vpindex_slot ||
-             key->inputs[i].src_index == lp->fake_layer_slot) {
-            key->inputs[i].interp = LP_INTERP_ZERO;
-         }
-      }
-      else if (key->inputs[i].interp == LP_INTERP_COLOR) {
+      if (key->inputs[i].interp == LP_INTERP_COLOR) {
          if (lp->rasterizer->flatshade)
             key->inputs[i].interp = LP_INTERP_CONSTANT;
          else
diff --git a/src/gallium/drivers/llvmpipe/lp_state_setup.h b/src/gallium/drivers/llvmpipe/lp_state_setup.h
index 6cee6fe5eb5..9ad244482de 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_setup.h
+++ b/src/gallium/drivers/llvmpipe/lp_state_setup.h
@@ -17,10 +17,10 @@ struct lp_setup_variant_list_item
 struct lp_setup_variant_key {
    unsigned size:16;
    unsigned num_inputs:8;
-   unsigned color_slot:8;
-   unsigned bcolor_slot:8;
-   unsigned spec_slot:8;
-   unsigned bspec_slot:8;
+   int color_slot:8;
+   int bcolor_slot:8;
+   int spec_slot:8;
+   int bspec_slot:8;
    unsigned flatshade_first:1;
    unsigned pixel_center_half:1;
    unsigned twoside:1;
diff --git a/src/gallium/drivers/llvmpipe/lp_test_blend.c b/src/gallium/drivers/llvmpipe/lp_test_blend.c
index 7b19174f345..9139b83f05a 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_blend.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_blend.c
@@ -184,7 +184,7 @@ add_blend_test(struct gallivm_state *gallivm,
 
    LLVMBuildStore(builder, res, res_ptr);
 
-   LLVMBuildRetVoid(builder);;
+   LLVMBuildRetVoid(builder);
 
    gallivm_verify_function(gallivm, func);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_test_conv.c b/src/gallium/drivers/llvmpipe/lp_test_conv.c
index a30f35c8149..02a63193af5 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_conv.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_conv.c
@@ -140,7 +140,7 @@ add_conv_test(struct gallivm_state *gallivm,
       LLVMBuildStore(builder, dst[i], ptr);
    }
 
-   LLVMBuildRetVoid(builder);;
+   LLVMBuildRetVoid(builder);
 
    gallivm_verify_function(gallivm, func);
 
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
index d09a0ab0610..d1fdd75495f 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
@@ -390,6 +390,9 @@ enum SVSemantic
    SV_VERTEX_STRIDE,
    SV_INVOCATION_INFO,
    SV_THREAD_KILL,
+   SV_BASEVERTEX,
+   SV_BASEINSTANCE,
+   SV_DRAWID,
    SV_UNDEFINED,
    SV_LAST
 };
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
index b49bf9d53bc..4504240ac5e 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
@@ -124,6 +124,7 @@ struct nv50_ir_prog_info
    union {
       struct {
          uint32_t inputMask[4]; /* mask of attributes read (1 bit per scalar) */
+         bool usesDrawParameters;
       } vp;
       struct {
          uint8_t inputPatchSize;
@@ -160,8 +161,9 @@ struct nv50_ir_prog_info
       uint8_t clipDistances;     /* number of clip distance outputs */
       uint8_t cullDistances;     /* number of cull distance outputs */
       int8_t genUserClip;        /* request user clip planes for ClipVertex */
+      uint8_t auxCBSlot;         /* constant buffer index of UCP/draw data */
       uint16_t ucpBase;          /* base address for UCPs */
-      uint8_t ucpCBSlot;         /* constant buffer index of UCP data */
+      uint16_t drawInfoBase;     /* base address for draw parameters */
       uint8_t pointSize;         /* output index for PointSize */
       uint8_t instanceId;        /* system value index of InstanceID */
       uint8_t vertexId;          /* system value index of VertexID */
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index e9ddd366391..ec74e7ac811 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -740,6 +740,7 @@ CodeEmitterGM107::emitF2F()
    emitCC   (0x2f);
    emitField(0x2d, 1, (insn->op == OP_NEG) || insn->src(0).mod.neg());
    emitFMZ  (0x2c, 1);
+   emitField(0x29, 1, insn->subOp);
    emitRND  (0x27, rnd, 0x2a);
    emitField(0x0a, 2, util_logbase2(typeSizeof(insn->sType)));
    emitField(0x08, 2, util_logbase2(typeSizeof(insn->dType)));
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
index 1d4f0d92f6b..0b28047e22b 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
@@ -1030,7 +1030,10 @@ CodeEmitterNVC0::emitCVT(Instruction *i)
 
       // for 8/16 source types, the byte/word is in subOp. word 1 is
       // represented as 2.
-      code[1] |= i->subOp << 0x17;
+      if (!isFloatType(i->sType))
+         code[1] |= i->subOp << 0x17;
+      else
+         code[1] |= i->subOp << 0x18;
 
       if (sat)
          code[0] |= 0x20;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index b23386040a7..7b313f3c39c 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -319,6 +319,10 @@ unsigned int Instruction::srcMask(unsigned int s) const
          x |= 2;
       return x;
    }
+   case TGSI_OPCODE_PK2H:
+      return 0x3;
+   case TGSI_OPCODE_UP2H:
+      return 0x1;
    default:
       break;
    }
@@ -348,7 +352,7 @@ static nv50_ir::DataFile translateFile(uint file)
    case TGSI_FILE_PREDICATE:       return nv50_ir::FILE_PREDICATE;
    case TGSI_FILE_IMMEDIATE:       return nv50_ir::FILE_IMMEDIATE;
    case TGSI_FILE_SYSTEM_VALUE:    return nv50_ir::FILE_SYSTEM_VALUE;
-   case TGSI_FILE_RESOURCE:        return nv50_ir::FILE_MEMORY_GLOBAL;
+   //case TGSI_FILE_RESOURCE:        return nv50_ir::FILE_MEMORY_GLOBAL;
    case TGSI_FILE_SAMPLER:
    case TGSI_FILE_NULL:
    default:
@@ -377,6 +381,9 @@ static nv50_ir::SVSemantic translateSysVal(uint sysval)
    case TGSI_SEMANTIC_TESSINNER:  return nv50_ir::SV_TESS_INNER;
    case TGSI_SEMANTIC_VERTICESIN: return nv50_ir::SV_VERTEX_COUNT;
    case TGSI_SEMANTIC_HELPER_INVOCATION: return nv50_ir::SV_THREAD_KILL;
+   case TGSI_SEMANTIC_BASEVERTEX: return nv50_ir::SV_BASEVERTEX;
+   case TGSI_SEMANTIC_BASEINSTANCE: return nv50_ir::SV_BASEINSTANCE;
+   case TGSI_SEMANTIC_DRAWID:     return nv50_ir::SV_DRAWID;
    default:
       assert(0);
       return nv50_ir::SV_CLOCK;
@@ -449,6 +456,7 @@ nv50_ir::DataType Instruction::inferSrcType() const
    case TGSI_OPCODE_ATOMUMAX:
    case TGSI_OPCODE_UBFE:
    case TGSI_OPCODE_UMSB:
+   case TGSI_OPCODE_UP2H:
       return nv50_ir::TYPE_U32;
    case TGSI_OPCODE_I2F:
    case TGSI_OPCODE_I2D:
@@ -513,10 +521,12 @@ nv50_ir::DataType Instruction::inferDstType() const
    case TGSI_OPCODE_DSGE:
    case TGSI_OPCODE_DSLT:
    case TGSI_OPCODE_DSNE:
+   case TGSI_OPCODE_PK2H:
       return nv50_ir::TYPE_U32;
    case TGSI_OPCODE_I2F:
    case TGSI_OPCODE_U2F:
    case TGSI_OPCODE_D2F:
+   case TGSI_OPCODE_UP2H:
       return nv50_ir::TYPE_F32;
    case TGSI_OPCODE_I2D:
    case TGSI_OPCODE_U2D:
@@ -861,7 +871,7 @@ bool Source::scanSource()
    clipVertexOutput = -1;
 
    textureViews.resize(scan.file_max[TGSI_FILE_SAMPLER_VIEW] + 1);
-   resources.resize(scan.file_max[TGSI_FILE_RESOURCE] + 1);
+   //resources.resize(scan.file_max[TGSI_FILE_RESOURCE] + 1);
 
    info->immd.bufSize = 0;
 
@@ -1128,6 +1138,11 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
       case TGSI_SEMANTIC_SAMPLEPOS:
          info->prop.fp.sampleInterp = 1;
          break;
+      case TGSI_SEMANTIC_BASEVERTEX:
+      case TGSI_SEMANTIC_BASEINSTANCE:
+      case TGSI_SEMANTIC_DRAWID:
+         info->prop.vp.usesDrawParameters = true;
+         break;
       default:
          break;
       }
@@ -1144,6 +1159,7 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
          }
       }
       break;
+/*
    case TGSI_FILE_RESOURCE:
       for (i = first; i <= last; ++i) {
          resources[i].target = decl->Resource.Resource;
@@ -1151,6 +1167,7 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
          resources[i].slot = i;
       }
       break;
+*/
    case TGSI_FILE_SAMPLER_VIEW:
       for (i = first; i <= last; ++i)
          textureViews[i].target = decl->SamplerView.Resource;
@@ -1216,11 +1233,13 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst)
          if (src.isIndirect(0))
             mainTempsInLMem = true;
       } else
+/*
       if (src.getFile() == TGSI_FILE_RESOURCE) {
          if (src.getIndex(0) == TGSI_RESOURCE_GLOBAL)
             info->io.globalAccess |= (insn.getOpcode() == TGSI_OPCODE_LOAD) ?
                0x1 : 0x2;
       } else
+*/
       if (src.getFile() == TGSI_FILE_OUTPUT) {
          if (src.isIndirect(0)) {
             // We don't know which one is accessed, just mark everything for
@@ -1271,9 +1290,11 @@ Instruction::getTexture(const tgsi::Source *code, int s) const
    unsigned int r;
 
    switch (getSrc(s).getFile()) {
+/*
    case TGSI_FILE_RESOURCE:
       r = getSrc(s).getIndex(0);
       return translateTexture(code->resources.at(r).target);
+*/
    case TGSI_FILE_SAMPLER_VIEW:
       r = getSrc(s).getIndex(0);
       return translateTexture(code->textureViews.at(r).target);
@@ -1639,8 +1660,6 @@ Converter::fetchSrc(tgsi::Instruction::SrcRegister src, int c, Value *ptr)
          // don't load masked inputs, won't be assigned a slot
          if (!ptr && !(info->in[idx].mask & (1 << swz)))
             return loadImm(NULL, swz == TGSI_SWIZZLE_W ? 1.0f : 0.0f);
-         if (!ptr && info->in[idx].sn == TGSI_SEMANTIC_FACE)
-            return mkOp1v(OP_RDSV, TYPE_F32, getSSA(), mkSysVal(SV_FACE, 0));
          return interpolate(src, c, shiftAddress(ptr));
       } else
       if (prog->getType() == Program::TYPE_GEOMETRY) {
@@ -1681,7 +1700,7 @@ Converter::acquireDst(int d, int c)
    const int idx = dst.getIndex(0);
    const int idx2d = dst.is2D() ? dst.getIndex(1) : 0;
 
-   if (dst.isMasked(c) || f == TGSI_FILE_RESOURCE)
+   if (dst.isMasked(c)/* || f == TGSI_FILE_RESOURCE*/)
       return NULL;
 
    if (dst.isIndirect(0) ||
@@ -2799,6 +2818,21 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
       FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
          mkCvt(OP_CVT, dstTy, dst0[c], srcTy, fetchSrc(0, c));
       break;
+   case TGSI_OPCODE_PK2H:
+      val0 = getScratch();
+      val1 = getScratch();
+      mkCvt(OP_CVT, TYPE_F16, val0, TYPE_F32, fetchSrc(0, 0));
+      mkCvt(OP_CVT, TYPE_F16, val1, TYPE_F32, fetchSrc(0, 1));
+      FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
+         mkOp3(OP_INSBF, TYPE_U32, dst0[c], val1, mkImm(0x1010), val0);
+      break;
+   case TGSI_OPCODE_UP2H:
+      src0 = fetchSrc(0, 0);
+      FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+         geni = mkCvt(OP_CVT, TYPE_F32, dst0[c], TYPE_F16, src0);
+         geni->subOp = c & 1;
+      }
+      break;
    case TGSI_OPCODE_EMIT:
       /* export the saved viewport index */
       if (viewport != NULL) {
@@ -3252,7 +3286,7 @@ Converter::handleUserClipPlanes()
 
    for (c = 0; c < 4; ++c) {
       for (i = 0; i < info->io.genUserClip; ++i) {
-         Symbol *sym = mkSymbol(FILE_MEMORY_CONST, info->io.ucpCBSlot,
+         Symbol *sym = mkSymbol(FILE_MEMORY_CONST, info->io.auxCBSlot,
                                 TYPE_F32, info->io.ucpBase + i * 16 + c * 4);
          Value *ucp = mkLoadv(TYPE_F32, sym, NULL);
          if (c == 0)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index e67bf3eca84..6530078b938 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -1576,6 +1576,17 @@ NVC0LoweringPass::handleRDSV(Instruction *i)
       ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
       ld->subOp = NV50_IR_SUBOP_PIXLD_COVMASK;
       break;
+   case SV_BASEVERTEX:
+   case SV_BASEINSTANCE:
+   case SV_DRAWID:
+      ld = bld.mkLoad(TYPE_U32, i->getDef(0),
+                      bld.mkSymbol(FILE_MEMORY_CONST,
+                                   prog->driver->io.auxCBSlot,
+                                   TYPE_U32,
+                                   prog->driver->io.drawInfoBase +
+                                   4 * (sv - SV_BASEVERTEX)),
+                      NULL);
+      break;
    default:
       if (prog->getType() == Program::TYPE_TESSELLATION_EVAL && !i->perPatch)
          vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0));
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index c2842c2186f..f5c590eef10 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -676,23 +676,22 @@ ConstantFolding::expr(Instruction *i,
    switch (i->op) {
    case OP_MAD:
    case OP_FMA: {
-      i->op = OP_ADD;
+      ImmediateValue src0, src1 = *i->getSrc(0)->asImm();
 
-      /* Move the immediate to the second arg, otherwise the ADD operation
-       * won't be emittable
-       */
-      i->setSrc(1, i->getSrc(0));
+      // Move the immediate into position 1, where we know it might be
+      // emittable. However it might not be anyways, as there may be other
+      // restrictions, so move it into a separate LValue.
+      bld.setPosition(i, false);
+      i->op = OP_ADD;
+      i->setSrc(1, bld.mkMov(bld.getSSA(type), i->getSrc(0), type)->getDef(0));
       i->setSrc(0, i->getSrc(2));
       i->src(0).mod = i->src(2).mod;
       i->setSrc(2, NULL);
 
-      ImmediateValue src0;
       if (i->src(0).getImmediate(src0))
-         expr(i, src0, *i->getSrc(1)->asImm());
-      if (i->saturate && !prog->getTarget()->isSatSupported(i)) {
-         bld.setPosition(i, false);
-         i->setSrc(1, bld.loadImm(NULL, res.data.u32));
-      }
+         expr(i, src0, src1);
+      else
+         opnd(i, src1, 1);
       break;
    }
    case OP_PFETCH:
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
index 19637ce33f5..014c652eede 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
@@ -295,6 +295,9 @@ TargetNVC0::getSVAddress(DataFile shaderFile, const Symbol *sym) const
    case SV_SAMPLE_INDEX:   return 0;
    case SV_SAMPLE_POS:     return 0;
    case SV_SAMPLE_MASK:    return 0;
+   case SV_BASEVERTEX:     return 0;
+   case SV_BASEINSTANCE:   return 0;
+   case SV_DRAWID:         return 0;
    default:
       return 0xffffffff;
    }
diff --git a/src/gallium/drivers/nouveau/nouveau_compiler.c b/src/gallium/drivers/nouveau/nouveau_compiler.c
index 670b0c8b135..cd44aa1e1d9 100644
--- a/src/gallium/drivers/nouveau/nouveau_compiler.c
+++ b/src/gallium/drivers/nouveau/nouveau_compiler.c
@@ -112,7 +112,7 @@ nouveau_codegen(int chipset, int type, struct tgsi_token tokens[],
    info.bin.sourceRep = NV50_PROGRAM_IR_TGSI;
    info.bin.source = tokens;
 
-   info.io.ucpCBSlot = 15;
+   info.io.auxCBSlot = 15;
    info.io.ucpBase = NV50_CB_AUX_UCP_OFFSET;
 
    info.io.resInfoCBSlot = 15;
diff --git a/src/gallium/drivers/nouveau/nouveau_vp3_video.h b/src/gallium/drivers/nouveau/nouveau_vp3_video.h
index 58df5ee847f..809e971a678 100644
--- a/src/gallium/drivers/nouveau/nouveau_vp3_video.h
+++ b/src/gallium/drivers/nouveau/nouveau_vp3_video.h
@@ -114,6 +114,11 @@ struct nouveau_vp3_decoder {
    unsigned fence_seq, fw_sizes, last_frame_num, tmp_stride, ref_stride;
 
    unsigned bsp_idx, vp_idx, ppp_idx;
+
+   /* End of the bsp bo where new data should be appended between one begin/end
+    * frame.
+    */
+   char *bsp_ptr;
 };
 
 struct comm {
@@ -208,11 +213,15 @@ nouveau_vp3_load_firmware(struct nouveau_vp3_decoder *dec,
                           enum pipe_video_profile profile,
                           unsigned chipset);
 
+void
+nouveau_vp3_bsp_begin(struct nouveau_vp3_decoder *dec);
+
+void
+nouveau_vp3_bsp_next(struct nouveau_vp3_decoder *dec, unsigned num_buffers,
+                     const void *const *data, const unsigned *num_bytes);
+
 uint32_t
-nouveau_vp3_bsp(struct nouveau_vp3_decoder *dec,  union pipe_desc desc,
-                struct nouveau_vp3_video_buffer *target,
-                unsigned comm_seq, unsigned num_buffers,
-                const void *const *data, const unsigned *num_bytes);
+nouveau_vp3_bsp_end(struct nouveau_vp3_decoder *dec, union pipe_desc desc);
 
 void
 nouveau_vp3_vp_caps(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
diff --git a/src/gallium/drivers/nouveau/nouveau_vp3_video_bsp.c b/src/gallium/drivers/nouveau/nouveau_vp3_video_bsp.c
index 692772e49d1..a3d07deeb18 100644
--- a/src/gallium/drivers/nouveau/nouveau_vp3_video_bsp.c
+++ b/src/gallium/drivers/nouveau/nouveau_vp3_video_bsp.c
@@ -230,20 +230,58 @@ nouveau_vp3_fill_picparm_h264_bsp(struct nouveau_vp3_decoder *dec,
    return caps | 3;
 }
 
+static inline struct strparm_bsp *strparm_bsp(struct nouveau_vp3_decoder *dec)
+{
+   unsigned comm_seq = dec->fence_seq;
+   struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH];
+   return (struct strparm_bsp *)(bsp_bo->map + 0x100);
+}
+
+void
+nouveau_vp3_bsp_begin(struct nouveau_vp3_decoder *dec)
+{
+   struct strparm_bsp *str_bsp = strparm_bsp(dec);
+
+   dec->bsp_ptr = (void *)str_bsp;
+   memset(str_bsp, 0, 0x80);
+   dec->bsp_ptr += 0x100;
+   /* Reserved for picparm_vp */
+   dec->bsp_ptr += 0x300;
+   /* Reserved for comm */
+#if !NOUVEAU_VP3_DEBUG_FENCE
+   memset(dec->bsp_ptr, 0, 0x200);
+#endif
+   dec->bsp_ptr += 0x200;
+}
+
+void
+nouveau_vp3_bsp_next(struct nouveau_vp3_decoder *dec, unsigned num_buffers,
+                     const void *const *data, const unsigned *num_bytes)
+{
+#ifndef NDEBUG
+   unsigned comm_seq = dec->fence_seq;
+   struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH];
+#endif
+   struct strparm_bsp *str_bsp = strparm_bsp(dec);
+   int i;
+
+   for (i = 0; i < num_buffers; ++i) {
+      assert(bsp_bo->size >= str_bsp->w0[0] + num_bytes[i]);
+      memcpy(dec->bsp_ptr, data[i], num_bytes[i]);
+      dec->bsp_ptr += num_bytes[i];
+      str_bsp->w0[0] += num_bytes[i];
+   }
+}
+
 uint32_t
-nouveau_vp3_bsp(struct nouveau_vp3_decoder *dec,  union pipe_desc desc,
-                struct nouveau_vp3_video_buffer *target,
-                unsigned comm_seq, unsigned num_buffers,
-                const void *const *data, const unsigned *num_bytes)
+nouveau_vp3_bsp_end(struct nouveau_vp3_decoder *dec, union pipe_desc desc)
 {
    enum pipe_video_format codec = u_reduce_video_profile(dec->base.profile);
+   unsigned comm_seq = dec->fence_seq;
    struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH];
-   char *bsp;
    uint32_t endmarker, caps;
-   struct strparm_bsp *str_bsp;
-   int i;
-
-   bsp = bsp_bo->map;
+   struct strparm_bsp *str_bsp = strparm_bsp(dec);
+   char *bsp = bsp_bo->map;
    /*
     * 0x000..0x100: picparm_bsp
     * 0x200..0x500: picparm_vp
@@ -277,34 +315,21 @@ nouveau_vp3_bsp(struct nouveau_vp3_decoder *dec,  union pipe_desc desc,
    caps |= 1 << 17; // enable watchdog
    caps |= 0 << 18; // do not report error to VP, so it can continue decoding what we have
    caps |= 0 << 19; // if enabled, use crypto crap?
-   bsp += 0x100;
 
-   str_bsp = (struct strparm_bsp *)bsp;
-   memset(str_bsp, 0, 0x80);
-   str_bsp->w0[0] = 16;
+   str_bsp = strparm_bsp(dec);
    str_bsp->w1[0] = 0x1;
-   bsp += 0x100;
-   /* Reserved for picparm_vp */
-   bsp += 0x300;
-   /* Reserved for comm */
-#if !NOUVEAU_VP3_DEBUG_FENCE
-   memset(bsp, 0, 0x200);
-#endif
-   bsp += 0x200;
-   for (i = 0; i < num_buffers; ++i) {
-      memcpy(bsp, data[i], num_bytes[i]);
-      bsp += num_bytes[i];
-      str_bsp->w0[0] += num_bytes[i];
-   }
 
    /* Append end sequence */
-   *(uint32_t *)bsp = endmarker;
-   bsp += 4;
-   *(uint32_t *)bsp = 0x00000000;
-   bsp += 4;
-   *(uint32_t *)bsp = endmarker;
-   bsp += 4;
-   *(uint32_t *)bsp = 0x00000000;
+   *(uint32_t *)dec->bsp_ptr = endmarker;
+   dec->bsp_ptr += 4;
+   *(uint32_t *)dec->bsp_ptr = 0x00000000;
+   dec->bsp_ptr += 4;
+   *(uint32_t *)dec->bsp_ptr = endmarker;
+   dec->bsp_ptr += 4;
+   *(uint32_t *)dec->bsp_ptr = 0x00000000;
+   str_bsp->w0[0] += 16;
+
+   dec->bsp_ptr = NULL;
 
    return caps;
 }
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_draw.c b/src/gallium/drivers/nouveau/nv30/nv30_draw.c
index 098d6e499fa..7b0d0745766 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_draw.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_draw.c
@@ -208,17 +208,16 @@ nv30_render_release_vertices(struct vbuf_render *render)
 
 static const struct {
    unsigned emit;
-   unsigned interp;
    unsigned vp30;
    unsigned vp40;
    unsigned ow40;
 } vroute [] = {
-   [TGSI_SEMANTIC_POSITION] = { EMIT_4F, INTERP_PERSPECTIVE, 0, 0, 0x00000000 },
-   [TGSI_SEMANTIC_COLOR   ] = { EMIT_4F, INTERP_LINEAR     , 3, 1, 0x00000001 },
-   [TGSI_SEMANTIC_BCOLOR  ] = { EMIT_4F, INTERP_LINEAR     , 1, 3, 0x00000004 },
-   [TGSI_SEMANTIC_FOG     ] = { EMIT_4F, INTERP_PERSPECTIVE, 5, 5, 0x00000010 },
-   [TGSI_SEMANTIC_PSIZE   ] = { EMIT_1F_PSIZE, INTERP_POS  , 6, 6, 0x00000020 },
-   [TGSI_SEMANTIC_TEXCOORD] = { EMIT_4F, INTERP_PERSPECTIVE, 8, 7, 0x00004000 },
+   [TGSI_SEMANTIC_POSITION] = { EMIT_4F, 0, 0, 0x00000000 },
+   [TGSI_SEMANTIC_COLOR   ] = { EMIT_4F, 3, 1, 0x00000001 },
+   [TGSI_SEMANTIC_BCOLOR  ] = { EMIT_4F, 1, 3, 0x00000004 },
+   [TGSI_SEMANTIC_FOG     ] = { EMIT_4F, 5, 5, 0x00000010 },
+   [TGSI_SEMANTIC_PSIZE   ] = { EMIT_1F_PSIZE, 6, 6, 0x00000020 },
+   [TGSI_SEMANTIC_TEXCOORD] = { EMIT_4F, 8, 7, 0x00004000 },
 };
 
 static bool
@@ -247,7 +246,7 @@ vroute_add(struct nv30_render *r, uint attrib, uint sem, uint *idx)
    if (emit == EMIT_OMIT)
       return false;
 
-   draw_emit_vertex_attr(vinfo, emit, vroute[sem].interp, attrib);
+   draw_emit_vertex_attr(vinfo, emit, attrib);
    format = draw_translate_vinfo_format(emit);
 
    r->vtxfmt[attrib] = nv30_vtxfmt(&screen->base.base, format)->hw;
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index 854f70cf34c..d9c940232c4 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -157,6 +157,8 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_USER_VERTEX_BUFFERS:
    case PIPE_CAP_COMPUTE:
    case PIPE_CAP_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
    case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
    case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
    case PIPE_CAP_SAMPLER_VIEW_TARGET:
@@ -174,6 +176,11 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_CLEAR_TEXTURE:
+   case PIPE_CAP_DRAW_PARAMETERS:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+   case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
@@ -265,6 +272,7 @@ nv30_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
       case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
+      case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
          return 0;
       case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
          return 32;
@@ -308,6 +316,7 @@ nv30_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
       case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
+      case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
          return 0;
       case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
          return 32;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.h b/src/gallium/drivers/nouveau/nv50/nv50_context.h
index 2cebcd99423..712d00ed2d3 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.h
@@ -134,9 +134,11 @@ struct nv50_context {
    struct nv50_constbuf constbuf[3][NV50_MAX_PIPE_CONSTBUFS];
    uint16_t constbuf_dirty[3];
    uint16_t constbuf_valid[3];
+   uint16_t constbuf_coherent[3];
 
    struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS];
    unsigned num_vtxbufs;
+   uint32_t vtxbufs_coherent;
    struct pipe_index_buffer idxbuf;
    uint32_t vbo_fifo; /* bitmask of vertex elements to be pushed to FIFO */
    uint32_t vbo_user; /* bitmask of vertex buffers pointing to user memory */
@@ -148,6 +150,7 @@ struct nv50_context {
 
    struct pipe_sampler_view *textures[3][PIPE_MAX_SAMPLERS];
    unsigned num_textures[3];
+   uint32_t textures_coherent[3];
    struct nv50_tsc_entry *samplers[3][PIPE_MAX_SAMPLERS];
    unsigned num_samplers[3];
 
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.c b/src/gallium/drivers/nouveau/nv50/nv50_program.c
index a4b8ddfda95..888d62e1c52 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_program.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_program.c
@@ -148,7 +148,6 @@ nv50_fragprog_assign_slots(struct nv50_ir_prog_info *info)
    for (m = 0, i = 0; i < info->numInputs; ++i) {
       switch (info->in[i].sn) {
       case TGSI_SEMANTIC_POSITION:
-      case TGSI_SEMANTIC_FACE:
          continue;
       default:
          m += info->in[i].flat ? 0 : 1;
@@ -166,9 +165,6 @@ nv50_fragprog_assign_slots(struct nv50_ir_prog_info *info)
          for (c = 0; c < 4; ++c)
             if (info->in[i].mask & (1 << c))
                info->in[i].slot[c] = nintp++;
-      } else
-      if (info->in[i].sn == TGSI_SEMANTIC_FACE) {
-         info->in[i].slot[0] = 255;
       } else {
          unsigned j = info->in[i].flat ? m++ : n++;
 
@@ -335,7 +331,7 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset,
    info->bin.sourceRep = NV50_PROGRAM_IR_TGSI;
    info->bin.source = (void *)prog->pipe.tokens;
 
-   info->io.ucpCBSlot = 15;
+   info->io.auxCBSlot = 15;
    info->io.ucpBase = NV50_CB_AUX_UCP_OFFSET;
    info->io.genUserClip = prog->vp.clpd_nr;
 
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index 272e1d45bff..56c67e0ddfb 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -182,6 +182,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_CLEAR_TEXTURE:
    case PIPE_CAP_COMPUTE:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
       return 1;
    case PIPE_CAP_SEAMLESS_CUBE_MAP:
       return 1; /* class_3d >= NVA0_3D_CLASS; */
@@ -212,11 +213,17 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
    case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
    case PIPE_CAP_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
    case PIPE_CAP_VERTEXID_NOBASE:
    case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: /* potentially supported on some hw */
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
    case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+   case PIPE_CAP_DRAW_PARAMETERS:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+   case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
@@ -300,6 +307,7 @@ nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
    case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
+   case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
       return 0;
    case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
       return 32;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state.c b/src/gallium/drivers/nouveau/nv50/nv50_state.c
index de655971b66..cb040439139 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c
@@ -664,6 +664,17 @@ nv50_stage_set_sampler_views(struct nv50_context *nv50, int s,
       if (old)
          nv50_screen_tic_unlock(nv50->screen, old);
 
+      if (views[i] && views[i]->texture) {
+         struct pipe_resource *res = views[i]->texture;
+         if (res->target == PIPE_BUFFER &&
+             (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT))
+            nv50->textures_coherent[s] |= 1 << i;
+         else
+            nv50->textures_coherent[s] &= ~(1 << i);
+      } else {
+         nv50->textures_coherent[s] &= ~(1 << i);
+      }
+
       pipe_sampler_view_reference(&nv50->textures[s][i], views[i]);
    }
 
@@ -847,13 +858,19 @@ nv50_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
       nv50->constbuf[s][i].u.data = cb->user_buffer;
       nv50->constbuf[s][i].size = MIN2(cb->buffer_size, 0x10000);
       nv50->constbuf_valid[s] |= 1 << i;
+      nv50->constbuf_coherent[s] &= ~(1 << i);
    } else
    if (res) {
       nv50->constbuf[s][i].offset = cb->buffer_offset;
       nv50->constbuf[s][i].size = MIN2(align(cb->buffer_size, 0x100), 0x10000);
       nv50->constbuf_valid[s] |= 1 << i;
+      if (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
+         nv50->constbuf_coherent[s] |= 1 << i;
+      else
+         nv50->constbuf_coherent[s] &= ~(1 << i);
    } else {
       nv50->constbuf_valid[s] &= ~(1 << i);
+      nv50->constbuf_coherent[s] &= ~(1 << i);
    }
    nv50->constbuf_dirty[s] |= 1 << i;
 
@@ -1003,6 +1020,7 @@ nv50_set_vertex_buffers(struct pipe_context *pipe,
    if (!vb) {
       nv50->vbo_user &= ~(((1ull << count) - 1) << start_slot);
       nv50->vbo_constant &= ~(((1ull << count) - 1) << start_slot);
+      nv50->vtxbufs_coherent &= ~(((1ull << count) - 1) << start_slot);
       return;
    }
 
@@ -1015,9 +1033,16 @@ nv50_set_vertex_buffers(struct pipe_context *pipe,
             nv50->vbo_constant |= 1 << dst_index;
          else
             nv50->vbo_constant &= ~(1 << dst_index);
+         nv50->vtxbufs_coherent &= ~(1 << dst_index);
       } else {
          nv50->vbo_user &= ~(1 << dst_index);
          nv50->vbo_constant &= ~(1 << dst_index);
+
+         if (vb[i].buffer &&
+             vb[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
+            nv50->vtxbufs_coherent |= (1 << dst_index);
+         else
+            nv50->vtxbufs_coherent &= ~(1 << dst_index);
       }
    }
 }
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
index 7de2f1f1d0f..60fa2bc06a8 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
@@ -636,8 +636,8 @@ nv50_draw_elements(struct nv50_context *nv50, bool shorten,
          BEGIN_NV04(push, NV50_3D(VERTEX_BEGIN_GL), 1);
          PUSH_DATA (push, prim);
 
-         PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain);
          nouveau_pushbuf_space(push, 8, 0, 1);
+         PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain);
 
          switch (index_size) {
          case 4:
@@ -765,7 +765,7 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
    struct nv50_context *nv50 = nv50_context(pipe);
    struct nouveau_pushbuf *push = nv50->base.pushbuf;
    bool tex_dirty = false;
-   int i, s;
+   int s;
 
    /* NOTE: caller must ensure that (min_index + index_bias) is >= 0 */
    nv50->vb_elt_first = info->min_index + info->index_bias;
@@ -794,27 +794,9 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
 
    push->kick_notify = nv50_draw_vbo_kick_notify;
 
-   /* TODO: Instead of iterating over all the buffer resources looking for
-    * coherent buffers, keep track of a context-wide count.
-    */
    for (s = 0; s < 3 && !nv50->cb_dirty; ++s) {
-      uint32_t valid = nv50->constbuf_valid[s];
-
-      while (valid && !nv50->cb_dirty) {
-         const unsigned i = ffs(valid) - 1;
-         struct pipe_resource *res;
-
-         valid &= ~(1 << i);
-         if (nv50->constbuf[s][i].user)
-            continue;
-
-         res = nv50->constbuf[s][i].u.buf;
-         if (!res)
-            continue;
-
-         if (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
-            nv50->cb_dirty = true;
-      }
+      if (nv50->constbuf_coherent[s])
+         nv50->cb_dirty = true;
    }
 
    /* If there are any coherent constbufs, flush the cache */
@@ -825,15 +807,10 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
    }
 
    for (s = 0; s < 3 && !tex_dirty; ++s) {
-      for (i = 0; i < nv50->num_textures[s] && !tex_dirty; ++i) {
-         if (!nv50->textures[s][i] ||
-             nv50->textures[s][i]->texture->target != PIPE_BUFFER)
-            continue;
-         if (nv50->textures[s][i]->texture->flags &
-             PIPE_RESOURCE_FLAG_MAP_COHERENT)
-            tex_dirty = true;
-      }
+      if (nv50->textures_coherent[s])
+         tex_dirty = true;
    }
+
    if (tex_dirty) {
       BEGIN_NV04(push, NV50_3D(TEX_CACHE_CTL), 1);
       PUSH_DATA (push, 0x20);
@@ -853,12 +830,7 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
       PUSH_DATA (push, info->start_instance);
    }
 
-   for (i = 0; i < nv50->num_vtxbufs && !nv50->base.vbo_dirty; ++i) {
-      if (!nv50->vtxbuf[i].buffer)
-         continue;
-      if (nv50->vtxbuf[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
-         nv50->base.vbo_dirty = true;
-   }
+   nv50->base.vbo_dirty |= !!nv50->vtxbufs_coherent;
 
    if (nv50->base.vbo_dirty) {
       BEGIN_NV04(push, NV50_3D(VERTEX_ARRAY_FLUSH), 1);
diff --git a/src/gallium/drivers/nouveau/nv50/nv98_video_bsp.c b/src/gallium/drivers/nouveau/nv50/nv98_video_bsp.c
index dbde1bfcebe..4fe0e05c96b 100644
--- a/src/gallium/drivers/nouveau/nv50/nv98_video_bsp.c
+++ b/src/gallium/drivers/nouveau/nv50/nv98_video_bsp.c
@@ -77,7 +77,7 @@ nv98_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
       bsp_size += (1 << 20) - 1;
       bsp_size &= ~((1 << 20) - 1);
 
-      ret = nouveau_bo_new(dec->bitplane_bo->device, NOUVEAU_BO_VRAM, 0, bsp_size, NULL, &tmp_bo);
+      ret = nouveau_bo_new(dec->client->device, NOUVEAU_BO_VRAM, 0, bsp_size, NULL, &tmp_bo);
       if (ret) {
          debug_printf("reallocating bsp %u -> %u failed with %i\n",
                       bsp_bo ? (unsigned)bsp_bo->size : 0, bsp_size, ret);
@@ -90,7 +90,7 @@ nv98_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
    if (!inter_bo || bsp_bo->size * 4 > inter_bo->size) {
       struct nouveau_bo *tmp_bo = NULL;
 
-      ret = nouveau_bo_new(dec->bitplane_bo->device, NOUVEAU_BO_VRAM, 0, bsp_bo->size * 4, NULL, &tmp_bo);
+      ret = nouveau_bo_new(dec->client->device, NOUVEAU_BO_VRAM, 0, bsp_bo->size * 4, NULL, &tmp_bo);
       if (ret) {
          debug_printf("reallocating inter %u -> %u failed with %i\n",
                       inter_bo ? (unsigned)inter_bo->size : 0, (unsigned)bsp_bo->size * 4, ret);
@@ -106,8 +106,9 @@ nv98_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
       return -1;
    }
 
-   caps = nouveau_vp3_bsp(dec, desc, target, comm_seq,
-                          num_buffers, data, num_bytes);
+   nouveau_vp3_bsp_begin(dec);
+   nouveau_vp3_bsp_next(dec, num_buffers, data, num_bytes);
+   caps = nouveau_vp3_bsp_end(dec, desc);
 
    nouveau_vp3_vp_caps(dec, desc, target, comm_seq, vp_caps, is_ref, refs);
 
diff --git a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme
index b2060d1fa53..4daa57d47bb 100644
--- a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme
+++ b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme
@@ -230,27 +230,43 @@ locn_0f_ts:
  * Forcefully sets VERTEX_ID_BASE to the value of VB_ELEMENT_BASE.
  *
  * arg     = mode
- * parm[0] = count
- * parm[1] = instance_count
- * parm[2] = start
- * parm[3] = index_bias
- * parm[4] = start_instance
+ * parm[0] = start_drawid
+ * parm[1] = numparams
+ * parm[2 + 5n + 0] = count
+ * parm[2 + 5n + 1] = instance_count
+ * parm[2 + 5n + 2] = start
+ * parm[2 + 5n + 3] = index_bias
+ * parm[2 + 5n + 4] = start_instance
+ *
+ * SCRATCH[0] = saved VB_ELEMENT_BASE
+ * SCRATCH[1] = saved VB_INSTANCE_BASE
  */
 .section #mme9097_draw_elts_indirect
+   read $r6 0x50d /* VB_ELEMENT_BASE */
+   read $r7 0x50e /* VB_INSTANCE_BASE */
+   maddr 0x1d00
+   send $r6 /* SCRATCH[0] = VB_ELEMENT_BASE */
+   send $r7 /* SCRATCH[1] = VB_INSTANCE_BASE */
+   parm $r6 /* start_drawid */
+   parm $r7 /* numparams */
+dei_draw_again:
    parm $r3 /* count */
    parm $r2 /* instance_count */
    parm $r4 maddr 0x5f7 /* INDEX_BATCH_FIRST, start */
    parm $r4 send $r4 /* index_bias, send start */
+   maddr 0x18e3 /* CB_POS */
+   send 0x180 /* 256 + 128 */
    braz $r2 #dei_end
-   parm $r5 /* start_instance */
-   read $r6 0x50d /* VB_ELEMENT_BASE */
-   read $r7 0x50e /* VB_INSTANCE_BASE */
+   parm $r5 send $r4 /* start_instance, send index_bias */
+   send $r5 /* send start_instance */
+   send $r6 /* draw id */
    maddr 0x150d /* VB_ELEMENT,INSTANCE_BASE */
    send $r4
    send $r5
    maddr 0x446
    send $r4
    mov $r4 0x1
+   mov $r1 (extrinsrt $r1 0x0 0 1 26) /* clear INSTANCE_NEXT */
 dei_again:
    maddr 0x586 /* VERTEX_BEGIN_GL */
    send $r1 /* mode */
@@ -260,46 +276,218 @@ dei_again:
    maddrsend 0x585 /* VERTEX_END_GL */
    branz $r2 #dei_again
    mov $r1 (extrinsrt $r1 $r4 0 1 26) /* set INSTANCE_NEXT */
+dei_end:
+   mov $r7 (add $r7 -1)
+   branz $r7 #dei_draw_again
+   mov $r6 (add $r6 1)
+   read $r6 0xd00
+   read $r7 0xd01
    maddr 0x150d /* VB_ELEMENT,INSTANCE_BASE */
    send $r6
    send $r7
    exit maddr 0x446
    send $r6
-dei_end:
-   exit
-   nop
 
 /* NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT:
  *
  * NOTE: Saves and restores VB_INSTANCE_BASE.
  *
  * arg     = mode
- * parm[0] = count
- * parm[1] = instance_count
- * parm[2] = start
- * parm[3] = start_instance
+ * parm[0] = start_drawid
+ * parm[1] = numparams
+ * parm[2 + 4n + 0] = count
+ * parm[2 + 4n + 1] = instance_count
+ * parm[2 + 4n + 2] = start
+ * parm[2 + 4n + 3] = start_instance
  */
 .section #mme9097_draw_arrays_indirect
+   read $r5 0x50e /* VB_INSTANCE_BASE */
+   parm $r6 /* start_drawid */
+   parm $r7 /* numparams */
+dai_draw_again:
    parm $r2 /* count */
    parm $r3 /* instance_count */
    parm $r4 maddr 0x35d /* VERTEX_BUFFER_FIRST, start */
-   parm $r4 send $r4 /* start_instance */
    braz $r3 #dai_end
-   read $r6 0x50e /* VB_INSTANCE_BASE */
+   parm $r4 send $r4 /* start_instance */
+   maddr 0x18e3 /* CB_POS */
+   send 0x180 /* 256 + 128 */
+   send 0x0 /* send 0 as base_vertex */
+   send $r4 /* send start_instance */
+   send $r6 /* draw id */
    maddr 0x50e /* VB_INSTANCE_BASE */
-   mov $r5 0x1
    send $r4
+   mov $r4 0x1
+   mov $r1 (extrinsrt $r1 0x0 0 1 26) /* clear INSTANCE_NEXT */
 dai_again:
    maddr 0x586 /* VERTEX_BEGIN_GL */
    send $r1 /* mode */
    maddr 0x35e /* VERTEX_BUFFER_COUNT */
    send $r2
-   mov $r3 (sub $r3 $r5)
+   mov $r3 (sub $r3 $r4)
    maddrsend 0x585 /* VERTEX_END_GL */
    branz $r3 #dai_again
-   mov $r1 (extrinsrt $r1 $r5 0 1 26) /* set INSTANCE_NEXT */
+   mov $r1 (extrinsrt $r1 $r4 0 1 26) /* set INSTANCE_NEXT */
+dai_end:
+   mov $r7 (add $r7 -1)
+   branz $r7 #dai_draw_again
+   mov $r6 (add $r6 1)
    exit maddr 0x50e /* VB_INSTANCE_BASE to restore */
+   send $r5
+
+/* NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT_COUNT
+ *
+ * NOTE: Saves and restores VB_ELEMENT,INSTANCE_BASE.
+ * Forcefully sets VERTEX_ID_BASE to the value of VB_ELEMENT_BASE.
+ *
+ * arg     = mode
+ * parm[0] = start_drawid
+ * parm[1] = numparams
+ * parm[2] = totaldraws
+ * parm[3 + 5n + 0] = count
+ * parm[3 + 5n + 1] = instance_count
+ * parm[3 + 5n + 2] = start
+ * parm[3 + 5n + 3] = index_bias
+ * parm[3 + 5n + 4] = start_instance
+ *
+ * SCRATCH[0] = saved VB_ELEMENT_BASE
+ * SCRATCH[1] = saved VB_INSTANCE_BASE
+ * SCRATCH[2] = draws left
+ */
+.section #mme9097_draw_elts_indirect_count
+   read $r6 0x50d /* VB_ELEMENT_BASE */
+   read $r7 0x50e /* VB_INSTANCE_BASE */
+   maddr 0x1d00
+   send $r6 /* SCRATCH[0] = VB_ELEMENT_BASE */
+   send $r7 /* SCRATCH[1] = VB_INSTANCE_BASE */
+   parm $r6 /* start_drawid */
+   parm $r7 /* numparams */
+   parm $r5 /* totaldraws */
+   mov $r5 (sub $r5 $r6) /* draws left */
+   braz $r5 #deic_runout
+   mov $r3 (extrinsrt 0x0 $r5 31 1 0) /* extract high bit */
+   branz $r3 #deic_runout
+   send $r5
+deic_draw_again:
+   parm $r3 /* count */
+   parm $r2 /* instance_count */
+   parm $r4 maddr 0x5f7 /* INDEX_BATCH_FIRST, start */
+   parm $r4 send $r4 /* index_bias, send start */
+   maddr 0x18e3 /* CB_POS */
+   send 0x180 /* 256 + 128 */
+   braz $r2 #deic_end
+   parm $r5 send $r4 /* start_instance, send index_bias */
+   send $r5 /* send start_instance */
+   send $r6 /* draw id */
+   maddr 0x150d /* VB_ELEMENT,INSTANCE_BASE */
+   send $r4
+   send $r5
+   maddr 0x446
+   send $r4
+   mov $r4 0x1
+   mov $r1 (extrinsrt $r1 0x0 0 1 26) /* clear INSTANCE_NEXT */
+deic_again:
+   maddr 0x586 /* VERTEX_BEGIN_GL */
+   send $r1 /* mode */
+   maddr 0x5f8 /* INDEX_BATCH_COUNT */
+   send $r3 /* count */
+   mov $r2 (sub $r2 $r4)
+   maddrsend 0x585 /* VERTEX_END_GL */
+   branz $r2 #deic_again
+   mov $r1 (extrinsrt $r1 $r4 0 1 26) /* set INSTANCE_NEXT */
+deic_end:
+   read $r5 0xd02
+   mov $r5 (add $r5 -1)
+   braz $r5 #deic_runout_check
+   mov $r7 (add $r7 -1)
+   maddr 0xd02
+   send $r5
+   branz $r7 #deic_draw_again
+   mov $r6 (add $r6 1)
+deic_restore:
+   read $r6 0xd00
+   read $r7 0xd01
+   maddr 0x150d /* VB_ELEMENT,INSTANCE_BASE */
    send $r6
-dai_end:
-   exit
-   nop
+   send $r7
+   exit maddr 0x446
+   send $r6
+deic_runout:
+   parm $r2
+   parm $r2
+   parm $r2
+   parm $r2
+   parm $r2
+   mov $r7 (add $r7 -1)
+deic_runout_check:
+   branz annul $r7 #deic_runout
+   bra annul #deic_restore
+
+/* NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT_COUNT:
+ *
+ * NOTE: Saves and restores VB_INSTANCE_BASE.
+ *
+ * arg     = mode
+ * parm[0] = start_drawid
+ * parm[1] = numparams
+ * parm[2] = totaldraws
+ * parm[3 + 4n + 0] = count
+ * parm[3 + 4n + 1] = instance_count
+ * parm[3 + 4n + 2] = start
+ * parm[3 + 4n + 3] = start_instance
+ *
+ * SCRATCH[0] = VB_INSTANCE_BASE
+ */
+.section #mme9097_draw_arrays_indirect_count
+   read $r5 0x50e /* VB_INSTANCE_BASE */
+   maddr 0xd00
+   parm $r6 send $r5 /* start_drawid, save VB_INSTANCE_BASE */
+   parm $r7 /* numparams */
+   parm $r5 /* totaldraws */
+   mov $r5 (sub $r5 $r6) /* draws left */
+   braz $r5 #daic_runout
+   mov $r3 (extrinsrt 0x0 $r5 31 1 0) /* extract high bit */
+   branz annul $r3 #daic_runout
+daic_draw_again:
+   parm $r2 /* count */
+   parm $r3 /* instance_count */
+   parm $r4 maddr 0x35d /* VERTEX_BUFFER_FIRST, start */
+   braz $r3 #daic_end
+   parm $r4 send $r4 /* start_instance */
+   maddr 0x18e3 /* CB_POS */
+   send 0x180 /* 256 + 128 */
+   send 0x0 /* send 0 as base_vertex */
+   send $r4 /* send start_instance */
+   send $r6 /* draw id */
+   maddr 0x50e /* VB_INSTANCE_BASE */
+   send $r4
+   mov $r4 0x1
+   mov $r1 (extrinsrt $r1 0x0 0 1 26) /* clear INSTANCE_NEXT */
+daic_again:
+   maddr 0x586 /* VERTEX_BEGIN_GL */
+   send $r1 /* mode */
+   maddr 0x35e /* VERTEX_BUFFER_COUNT */
+   send $r2
+   mov $r3 (sub $r3 $r4)
+   maddrsend 0x585 /* VERTEX_END_GL */
+   branz $r3 #daic_again
+   mov $r1 (extrinsrt $r1 $r4 0 1 26) /* set INSTANCE_NEXT */
+daic_end:
+   mov $r5 (add $r5 -1)
+   braz $r5 #daic_runout_check
+   mov $r7 (add $r7 -1)
+   branz $r7 #daic_draw_again
+   mov $r6 (add $r6 1)
+daic_restore:
+   read $r5 0xd00
+   exit maddr 0x50e /* VB_INSTANCE_BASE to restore */
+   send $r5
+daic_runout:
+   parm $r2
+   parm $r2
+   parm $r2
+   parm $r2
+   mov $r7 (add $r7 -1)
+daic_runout_check:
+   branz annul $r7 #daic_runout
+   bra annul #daic_restore
diff --git a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h
index bac9042c2df..bf8625e0584 100644
--- a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h
+++ b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h
@@ -125,22 +125,33 @@ uint32_t mme9097_tep_select[] = {
 };
 
 uint32_t mme9097_draw_elts_indirect[] = {
+	0x01434615,
+/* 0x0007: dei_draw_again */
+	0x01438715,
+	0x07400021,
+	0x00003041,
+	0x00003841,
+	0x00000601,
+/* 0x0018: dei_again */
+	0x00000701,
 	0x00000301,
+/* 0x0020: dei_end */
 	0x00000201,
 	0x017dc451,
-/* 0x000e: dei_again */
 	0x00002431,
-	0x0005d007,
-	0x00000501,
-/* 0x001b: dei_end */
-	0x01434615,
-	0x01438715,
+	0x0638c021,
+	0x00600041,
+	0x0004d007,
+	0x00002531,
+	0x00002841,
+	0x00003041,
 	0x05434021,
 	0x00002041,
 	0x00002841,
 	0x01118021,
 	0x00002041,
 	0x00004411,
+	0xd0400912,
 	0x01618021,
 	0x00000841,
 	0x017e0021,
@@ -149,37 +160,175 @@ uint32_t mme9097_draw_elts_indirect[] = {
 	0x01614071,
 	0xfffe9017,
 	0xd0410912,
+	0xffffff11,
+	0xfff9b817,
+	0x00007611,
+	0x03400615,
+	0x03404715,
 	0x05434021,
 	0x00003041,
 	0x00003841,
 	0x011180a1,
 	0x00003041,
-	0x00000091,
-	0x00000011,
 };
 
 uint32_t mme9097_draw_arrays_indirect[] = {
+/* 0x0003: dai_draw_again */
+	0x01438515,
+	0x00000601,
+	0x00000701,
 	0x00000201,
+/* 0x0011: dai_again */
 	0x00000301,
-/* 0x0009: dai_again */
 	0x00d74451,
+/* 0x0019: dai_end */
+	0x0004d807,
 	0x00002431,
-/* 0x0013: dai_end */
-	0x0003d807,
-	0x01438615,
+	0x0638c021,
+	0x00600041,
+	0x00000041,
+	0x00002041,
+	0x00003041,
 	0x01438021,
-	0x00004511,
 	0x00002041,
+	0x00004411,
+	0xd0400912,
 	0x01618021,
 	0x00000841,
 	0x00d78021,
 	0x00001041,
-	0x00055b10,
+	0x00051b10,
 	0x01614071,
 	0xfffe9817,
-	0xd0414912,
+	0xd0410912,
+	0xffffff11,
+	0xfffa7817,
+	0x00007611,
 	0x014380a1,
+	0x00002841,
+};
+
+uint32_t mme9097_draw_elts_indirect_count[] = {
+	0x01434615,
+	0x01438715,
+	0x07400021,
+/* 0x000d: deic_draw_again */
 	0x00003041,
-	0x00000091,
-	0x00000011,
+	0x00003841,
+	0x00000601,
+	0x00000701,
+/* 0x001e: deic_again */
+	0x00000501,
+	0x0005ad10,
+/* 0x0026: deic_end */
+	0x000b2807,
+	0x007f4312,
+/* 0x002e: deic_restore */
+	0x000a9817,
+	0x00002841,
+/* 0x0035: deic_runout */
+	0x00000301,
+/* 0x003b: deic_runout_check */
+	0x00000201,
+	0x017dc451,
+	0x00002431,
+	0x0638c021,
+	0x00600041,
+	0x0004d007,
+	0x00002531,
+	0x00002841,
+	0x00003041,
+	0x05434021,
+	0x00002041,
+	0x00002841,
+	0x01118021,
+	0x00002041,
+	0x00004411,
+	0xd0400912,
+	0x01618021,
+	0x00000841,
+	0x017e0021,
+	0x00001841,
+	0x00051210,
+	0x01614071,
+	0xfffe9017,
+	0xd0410912,
+	0x03408515,
+	0xffffed11,
+	0x0004e807,
+	0xffffff11,
+	0x03408021,
+	0x00002841,
+	0xfff87817,
+	0x00007611,
+	0x03400615,
+	0x03404715,
+	0x05434021,
+	0x00003041,
+	0x00003841,
+	0x011180a1,
+	0x00003041,
+	0x00000201,
+	0x00000201,
+	0x00000201,
+	0x00000201,
+	0x00000201,
+	0xffffff11,
+	0xfffeb837,
+	0xfffc8027,
+};
+
+uint32_t mme9097_draw_arrays_indirect_count[] = {
+	0x01438515,
+	0x03400021,
+/* 0x0009: daic_draw_again */
+	0x00002e31,
+	0x00000701,
+	0x00000501,
+/* 0x0017: daic_again */
+	0x0005ad10,
+	0x00086807,
+/* 0x001f: daic_end */
+	0x007f4312,
+	0x0007d837,
+/* 0x0024: daic_restore */
+/* 0x0027: daic_runout */
+	0x00000201,
+	0x00000301,
+/* 0x002c: daic_runout_check */
+	0x00d74451,
+	0x0004d807,
+	0x00002431,
+	0x0638c021,
+	0x00600041,
+	0x00000041,
+	0x00002041,
+	0x00003041,
+	0x01438021,
+	0x00002041,
+	0x00004411,
+	0xd0400912,
+	0x01618021,
+	0x00000841,
+	0x00d78021,
+	0x00001041,
+	0x00051b10,
+	0x01614071,
+	0xfffe9817,
+	0xd0410912,
+	0xffffed11,
+	0x00032807,
+	0xffffff11,
+	0xfff9f817,
+	0x00007611,
+	0x03400515,
+	0x014380a1,
+	0x00002841,
+	0x00000201,
+	0x00000201,
+	0x00000201,
+	0x00000201,
+	0xffffff11,
+	0xfffef837,
+	0xfffdc027,
 };
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
index 39b73ecb0c2..12195489691 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
@@ -134,10 +134,12 @@ struct nvc0_context {
    struct nvc0_constbuf constbuf[6][NVC0_MAX_PIPE_CONSTBUFS];
    uint16_t constbuf_dirty[6];
    uint16_t constbuf_valid[6];
+   uint16_t constbuf_coherent[6];
    bool cb_dirty;
 
    struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS];
    unsigned num_vtxbufs;
+   uint32_t vtxbufs_coherent;
    struct pipe_index_buffer idxbuf;
    uint32_t constant_vbos;
    uint32_t vbo_user; /* bitmask of vertex buffers pointing to user memory */
@@ -149,6 +151,7 @@ struct nvc0_context {
    struct pipe_sampler_view *textures[6][PIPE_MAX_SAMPLERS];
    unsigned num_textures[6];
    uint32_t textures_dirty[6];
+   uint32_t textures_coherent[6];
    struct nv50_tsc_entry *samplers[6][PIPE_MAX_SAMPLERS];
    unsigned num_samplers[6];
    uint16_t samplers_dirty[6];
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_macros.h b/src/gallium/drivers/nouveau/nvc0/nvc0_macros.h
index bf2798a44a0..27c026b8b30 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_macros.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_macros.h
@@ -29,4 +29,8 @@
 
 #define NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT			0x00003840
 
+#define NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT_COUNT		0x00003848
+
+#define NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT_COUNT		0x00003850
+
 #endif /* __NVC0_MACROS_H__ */
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index 67a25acf778..c3b53621630 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -55,7 +55,6 @@ nvc0_shader_input_address(unsigned sn, unsigned si)
    case TGSI_SEMANTIC_INSTANCEID:   return 0x2f8;
    case TGSI_SEMANTIC_VERTEXID:     return 0x2fc;
    case TGSI_SEMANTIC_TEXCOORD:     return 0x300 + si * 0x10;
-   case TGSI_SEMANTIC_FACE:         return 0x3fc;
    default:
       assert(!"invalid TGSI input semantic");
       return ~0;
@@ -285,8 +284,6 @@ nvc0_tp_get_tess_mode(struct nvc0_program *tp, struct nv50_ir_prog_info *info)
       break;
    case PIPE_PRIM_TRIANGLES:
       tp->tp.tess_mode = NVC0_3D_TESS_MODE_PRIM_TRIANGLES;
-      if (info->prop.tp.winding > 0)
-         tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CW;
       break;
    case PIPE_PRIM_QUADS:
       tp->tp.tess_mode = NVC0_3D_TESS_MODE_PRIM_QUADS;
@@ -295,6 +292,10 @@ nvc0_tp_get_tess_mode(struct nvc0_program *tp, struct nv50_ir_prog_info *info)
       tp->tp.tess_mode = ~0;
       return;
    }
+
+   if (info->prop.tp.winding > 0)
+      tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CW;
+
    if (info->prop.tp.outputPrim != PIPE_PRIM_POINTS)
       tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CONNECTED;
 
@@ -533,8 +534,9 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
    info->bin.source = (void *)prog->pipe.tokens;
 
    info->io.genUserClip = prog->vp.num_ucps;
+   info->io.auxCBSlot = 15;
    info->io.ucpBase = 256;
-   info->io.ucpCBSlot = 15;
+   info->io.drawInfoBase = 256 + 128;
 
    if (prog->type == PIPE_SHADER_COMPUTE) {
       if (chipset >= NVISA_GK104_CHIPSET) {
@@ -583,6 +585,7 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
    prog->num_barriers = info->numBarriers;
 
    prog->vp.need_vertex_id = info->io.vertexId < PIPE_MAX_SHADER_INPUTS;
+   prog->vp.need_draw_parameters = info->prop.vp.usesDrawParameters;
 
    if (info->io.edgeFlagOut < PIPE_MAX_ATTRIBS)
       info->out[info->io.edgeFlagOut].mask = 0; /* for headergen */
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.h b/src/gallium/drivers/nouveau/nvc0/nvc0_program.h
index 9c45e7b3e31..8b8d221edfc 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.h
@@ -42,6 +42,7 @@ struct nvc0_program {
       uint8_t num_ucps; /* also set to max if ClipDistance is used */
       uint8_t edgeflag; /* attribute index of edgeflag input */
       bool need_vertex_id;
+      bool need_draw_parameters;
    } vp;
    struct {
       uint8_t early_z;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
index a70d524ea85..1bed0162baf 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
@@ -470,10 +470,7 @@ nvc0_hw_query_pushbuf_submit(struct nouveau_pushbuf *push,
 {
    struct nvc0_hw_query *hq = nvc0_hw_query(q);
 
-#define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8))
-
    PUSH_REFN(push, hq->bo, NOUVEAU_BO_RD | NOUVEAU_BO_GART);
-   nouveau_pushbuf_space(push, 0, 0, 1);
    nouveau_pushbuf_data(push, hq->bo, hq->offset + result_offset, 4 |
                         NVC0_IB_ENTRY_1_NO_PREFETCH);
 }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 39954464b9c..33dd17ebeca 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -184,6 +184,11 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_CLEAR_TEXTURE:
+   case PIPE_CAP_DRAW_PARAMETERS:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
       return 1;
    case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
       return (class_3d >= NVE4_3D_CLASS) ? 1 : 0;
@@ -206,6 +211,8 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_VERTEXID_NOBASE:
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+   case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
@@ -311,6 +318,7 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
    case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
+   case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
       return 0;
    case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
       return 16; /* would be 32 in linked (OpenGL-style) mode */
@@ -1025,6 +1033,8 @@ nvc0_screen_create(struct nouveau_device *dev)
    MK_MACRO(NVC0_3D_MACRO_POLYGON_MODE_BACK, mme9097_poly_mode_back);
    MK_MACRO(NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT, mme9097_draw_arrays_indirect);
    MK_MACRO(NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT, mme9097_draw_elts_indirect);
+   MK_MACRO(NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT_COUNT, mme9097_draw_arrays_indirect_count);
+   MK_MACRO(NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT_COUNT, mme9097_draw_elts_indirect_count);
 
    BEGIN_NVC0(push, NVC0_3D(RASTERIZE_ENABLE), 1);
    PUSH_DATA (push, 1);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
index 5e84ca9e0ea..dc02b011bdf 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
@@ -317,6 +317,7 @@ nvc0_tfb_validate(struct nvc0_context *nvc0)
 
       if (!targ->clean)
          nvc0_hw_query_fifo_wait(push, nvc0_query(targ->pq));
+      nouveau_pushbuf_space(push, 0, 0, 1);
       BEGIN_NVC0(push, NVC0_3D(TFB_BUFFER_ENABLE(b)), 5);
       PUSH_DATA (push, 1);
       PUSH_DATAh(push, buf->address + targ->pipe.buffer_offset);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
index 41a824a97a0..24a6c222dd5 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
@@ -554,6 +554,17 @@ nvc0_stage_set_sampler_views(struct nvc0_context *nvc0, int s,
          continue;
       nvc0->textures_dirty[s] |= 1 << i;
 
+      if (views[i] && views[i]->texture) {
+         struct pipe_resource *res = views[i]->texture;
+         if (res->target == PIPE_BUFFER &&
+             (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT))
+            nvc0->textures_coherent[s] |= 1 << i;
+         else
+            nvc0->textures_coherent[s] &= ~(1 << i);
+      } else {
+         nvc0->textures_coherent[s] &= ~(1 << i);
+      }
+
       if (old) {
          nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_TEX(s, i));
          nvc0_screen_tic_unlock(nvc0->screen, old);
@@ -596,6 +607,17 @@ nvc0_stage_set_sampler_views_range(struct nvc0_context *nvc0, const unsigned s,
             continue;
          nvc0->textures_dirty[s] |= 1 << i;
 
+         if (views[p] && views[p]->texture) {
+            struct pipe_resource *res = views[p]->texture;
+            if (res->target == PIPE_BUFFER &&
+                (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT))
+               nvc0->textures_coherent[s] |= 1 << i;
+            else
+               nvc0->textures_coherent[s] &= ~(1 << i);
+         } else {
+            nvc0->textures_coherent[s] &= ~(1 << i);
+         }
+
          if (nvc0->textures[s][i]) {
             struct nv50_tic_entry *old = nv50_tic_entry(nvc0->textures[s][i]);
             nouveau_bufctx_reset(bctx, bin + i);
@@ -842,14 +864,20 @@ nvc0_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
       nvc0->constbuf[s][i].u.data = cb->user_buffer;
       nvc0->constbuf[s][i].size = MIN2(cb->buffer_size, 0x10000);
       nvc0->constbuf_valid[s] |= 1 << i;
+      nvc0->constbuf_coherent[s] &= ~(1 << i);
    } else
    if (cb) {
       nvc0->constbuf[s][i].offset = cb->buffer_offset;
       nvc0->constbuf[s][i].size = MIN2(align(cb->buffer_size, 0x100), 0x10000);
       nvc0->constbuf_valid[s] |= 1 << i;
+      if (res && res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
+         nvc0->constbuf_coherent[s] |= 1 << i;
+      else
+         nvc0->constbuf_coherent[s] &= ~(1 << i);
    }
    else {
       nvc0->constbuf_valid[s] &= ~(1 << i);
+      nvc0->constbuf_coherent[s] &= ~(1 << i);
    }
 }
 
@@ -1009,6 +1037,7 @@ nvc0_set_vertex_buffers(struct pipe_context *pipe,
     if (!vb) {
        nvc0->vbo_user &= ~(((1ull << count) - 1) << start_slot);
        nvc0->constant_vbos &= ~(((1ull << count) - 1) << start_slot);
+       nvc0->vtxbufs_coherent &= ~(((1ull << count) - 1) << start_slot);
        return;
     }
 
@@ -1021,9 +1050,16 @@ nvc0_set_vertex_buffers(struct pipe_context *pipe,
              nvc0->constant_vbos |= 1 << dst_index;
           else
              nvc0->constant_vbos &= ~(1 << dst_index);
+          nvc0->vtxbufs_coherent &= ~(1 << dst_index);
        } else {
           nvc0->vbo_user &= ~(1 << dst_index);
           nvc0->constant_vbos &= ~(1 << dst_index);
+
+          if (vb[i].buffer &&
+              vb[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
+             nvc0->vtxbufs_coherent |= (1 << dst_index);
+          else
+             nvc0->vtxbufs_coherent &= ~(1 << dst_index);
        }
     }
 }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
index 54443bdccc0..ad79d1cbb9c 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
@@ -787,7 +787,7 @@ nvc0_draw_stream_output(struct nvc0_context *nvc0,
    }
 
    while (num_instances--) {
-      PUSH_SPACE(push, 8);
+      nouveau_pushbuf_space(push, 9, 0, 1);
       BEGIN_NVC0(push, NVC0_3D(VERTEX_BEGIN_GL), 1);
       PUSH_DATA (push, mode);
       BEGIN_NVC0(push, NVC0_3D(DRAW_TFB_BASE), 1);
@@ -807,19 +807,31 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
 {
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
    struct nv04_resource *buf = nv04_resource(info->indirect);
-   unsigned size;
-   const uint32_t offset = buf->offset + info->indirect_offset;
+   struct nv04_resource *buf_count = nv04_resource(info->indirect_params);
+   unsigned size, macro, count = info->indirect_count, drawid = info->drawid;
+   uint32_t offset = buf->offset + info->indirect_offset;
 
    /* must make FIFO wait for engines idle before continuing to process */
-   if (buf->fence_wr && !nouveau_fence_signalled(buf->fence_wr))
+   if ((buf->fence_wr && !nouveau_fence_signalled(buf->fence_wr)) ||
+       (buf_count && buf_count->fence_wr &&
+        !nouveau_fence_signalled(buf_count->fence_wr))) {
       IMMED_NVC0(push, SUBC_3D(NV10_SUBCHAN_REF_CNT), 0);
+   }
+
+   /* Queue things up to let the macros write params to the driver constbuf */
+   BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
+   PUSH_DATA (push, 512);
+   PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (5 << 16) + (0 << 9));
+   PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (5 << 16) + (0 << 9));
 
-   PUSH_SPACE(push, 8);
    if (info->indexed) {
       assert(nvc0->idxbuf.buffer);
       assert(nouveau_resource_mapped_by_gpu(nvc0->idxbuf.buffer));
-      size = 5 * 4;
-      BEGIN_1IC0(push, NVC0_3D(MACRO_DRAW_ELEMENTS_INDIRECT), 1 + size / 4);
+      size = 5;
+      if (buf_count)
+         macro = NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT_COUNT;
+      else
+         macro = NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT;
    } else {
       if (nvc0->state.index_bias) {
          /* index_bias is implied 0 if !info->indexed (really ?) */
@@ -827,15 +839,59 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
          IMMED_NVC0(push, NVC0_3D(VERTEX_ID_BASE), 0);
          nvc0->state.index_bias = 0;
       }
-      size = 4 * 4;
-      BEGIN_1IC0(push, NVC0_3D(MACRO_DRAW_ARRAYS_INDIRECT), 1 + size / 4);
-   }
-   PUSH_DATA(push, nvc0_prim_gl(info->mode));
-#define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8))
-   PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain);
-   nouveau_pushbuf_space(push, 0, 0, 1);
-   nouveau_pushbuf_data(push,
-                        buf->bo, offset, NVC0_IB_ENTRY_1_NO_PREFETCH | size);
+      size = 4;
+      if (buf_count)
+         macro = NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT_COUNT;
+      else
+         macro = NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT;
+   }
+
+   /* If the stride is not the natural stride, we have to stick a separate
+    * push data reference for each draw. Otherwise it can all go in as one.
+    * Of course there is a maximum packet size, so we have to break things up
+    * along those borders as well.
+    */
+   while (count) {
+      unsigned draws = count, pushes, i;
+      if (info->indirect_stride == size * 4) {
+         draws = MIN2(draws, (NV04_PFIFO_MAX_PACKET_LEN - 4) / size);
+         pushes = 1;
+      } else {
+         draws = MIN2(draws, 32);
+         pushes = draws;
+      }
+
+      nouveau_pushbuf_space(push, 16, 0, pushes + !!buf_count);
+      PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain);
+      if (buf_count)
+         PUSH_REFN(push, buf_count->bo, NOUVEAU_BO_RD | buf_count->domain);
+      PUSH_DATA(push,
+                NVC0_FIFO_PKHDR_1I(0, macro, 3 + !!buf_count + draws * size));
+      PUSH_DATA(push, nvc0_prim_gl(info->mode));
+      PUSH_DATA(push, drawid);
+      PUSH_DATA(push, draws);
+      if (buf_count) {
+         nouveau_pushbuf_data(push,
+                              buf_count->bo,
+                              buf_count->offset + info->indirect_params_offset,
+                              NVC0_IB_ENTRY_1_NO_PREFETCH | 4);
+      }
+      if (pushes == 1) {
+         nouveau_pushbuf_data(push,
+                              buf->bo, offset,
+                              NVC0_IB_ENTRY_1_NO_PREFETCH | (size * 4 * draws));
+         offset += draws * info->indirect_stride;
+      } else {
+         for (i = 0; i < pushes; i++) {
+            nouveau_pushbuf_data(push,
+                                 buf->bo, offset,
+                                 NVC0_IB_ENTRY_1_NO_PREFETCH | (size * 4));
+            offset += info->indirect_stride;
+         }
+      }
+      count -= draws;
+      drawid += draws;
+   }
 }
 
 static inline void
@@ -864,7 +920,7 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
 {
    struct nvc0_context *nvc0 = nvc0_context(pipe);
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
-   int i, s;
+   int s;
 
    /* NOTE: caller must ensure that (min_index + index_bias) is >= 0 */
    nvc0->vb_elt_first = info->min_index + info->index_bias;
@@ -901,29 +957,25 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
    /* 8 as minimum to avoid immediate double validation of new buffers */
    nvc0_state_validate(nvc0, ~0, 8);
 
+   if (nvc0->vertprog->vp.need_draw_parameters) {
+      BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
+      PUSH_DATA (push, 512);
+      PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (5 << 16) + (0 << 9));
+      PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (5 << 16) + (0 << 9));
+      if (!info->indirect) {
+         BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 3);
+         PUSH_DATA (push, 256 + 128);
+         PUSH_DATA (push, info->index_bias);
+         PUSH_DATA (push, info->start_instance);
+         PUSH_DATA (push, info->drawid);
+      }
+   }
+
    push->kick_notify = nvc0_draw_vbo_kick_notify;
 
-   /* TODO: Instead of iterating over all the buffer resources looking for
-    * coherent buffers, keep track of a context-wide count.
-    */
    for (s = 0; s < 5 && !nvc0->cb_dirty; ++s) {
-      uint32_t valid = nvc0->constbuf_valid[s];
-
-      while (valid && !nvc0->cb_dirty) {
-         const unsigned i = ffs(valid) - 1;
-         struct pipe_resource *res;
-
-         valid &= ~(1 << i);
-         if (nvc0->constbuf[s][i].user)
-            continue;
-
-         res = nvc0->constbuf[s][i].u.buf;
-         if (!res)
-            continue;
-
-         if (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
-            nvc0->cb_dirty = true;
-      }
+      if (nvc0->constbuf_coherent[s])
+         nvc0->cb_dirty = true;
    }
 
    if (nvc0->cb_dirty) {
@@ -932,14 +984,12 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
    }
 
    for (s = 0; s < 5; ++s) {
+      if (!nvc0->textures_coherent[s])
+         continue;
+
       for (int i = 0; i < nvc0->num_textures[s]; ++i) {
          struct nv50_tic_entry *tic = nv50_tic_entry(nvc0->textures[s][i]);
-         struct pipe_resource *res;
-         if (!tic)
-            continue;
-         res = nvc0->textures[s][i]->texture;
-         if (res->target != PIPE_BUFFER ||
-             !(res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT))
+         if (!(nvc0->textures_coherent[s] & (1 << i)))
             continue;
 
          BEGIN_NVC0(push, NVC0_3D(TEX_CACHE_CTL), 1);
@@ -965,12 +1015,7 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
       PUSH_DATA (push, info->start_instance);
    }
 
-   for (i = 0; i < nvc0->num_vtxbufs && !nvc0->base.vbo_dirty; ++i) {
-      if (!nvc0->vtxbuf[i].buffer)
-         continue;
-      if (nvc0->vtxbuf[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
-         nvc0->base.vbo_dirty = true;
-   }
+   nvc0->base.vbo_dirty |= !!nvc0->vtxbufs_coherent;
 
    if (!nvc0->base.vbo_dirty && nvc0->idxbuf.buffer &&
        nvc0->idxbuf.buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_video.c b/src/gallium/drivers/nouveau/nvc0/nvc0_video.c
index 48ffac1b715..a9fd1d20942 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_video.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_video.c
@@ -26,6 +26,24 @@
 #include "util/u_format.h"
 
 static void
+nvc0_decoder_begin_frame(struct pipe_video_codec *decoder,
+                         struct pipe_video_buffer *target,
+                         struct pipe_picture_desc *picture)
+{
+   struct nouveau_vp3_decoder *dec = (struct nouveau_vp3_decoder *)decoder;
+   uint32_t comm_seq = ++dec->fence_seq;
+   unsigned ret = 0;
+
+   assert(dec);
+   assert(target);
+   assert(target->buffer_format == PIPE_FORMAT_NV12);
+
+   ret = nvc0_decoder_bsp_begin(dec, comm_seq);
+
+   assert(ret == 2);
+}
+
+static void
 nvc0_decoder_decode_bitstream(struct pipe_video_codec *decoder,
                               struct pipe_video_buffer *video_target,
                               struct pipe_picture_desc *picture,
@@ -34,8 +52,24 @@ nvc0_decoder_decode_bitstream(struct pipe_video_codec *decoder,
                               const unsigned *num_bytes)
 {
    struct nouveau_vp3_decoder *dec = (struct nouveau_vp3_decoder *)decoder;
+   uint32_t comm_seq = dec->fence_seq;
+   unsigned ret = 0;
+
+   assert(decoder);
+
+   ret = nvc0_decoder_bsp_next(dec, comm_seq, num_buffers, data, num_bytes);
+
+   assert(ret == 2);
+}
+
+static void
+nvc0_decoder_end_frame(struct pipe_video_codec *decoder,
+                       struct pipe_video_buffer *video_target,
+                       struct pipe_picture_desc *picture)
+{
+   struct nouveau_vp3_decoder *dec = (struct nouveau_vp3_decoder *)decoder;
    struct nouveau_vp3_video_buffer *target = (struct nouveau_vp3_video_buffer *)video_target;
-   uint32_t comm_seq = ++dec->fence_seq;
+   uint32_t comm_seq = dec->fence_seq;
    union pipe_desc desc;
 
    unsigned vp_caps, is_ref, ret;
@@ -43,11 +77,7 @@ nvc0_decoder_decode_bitstream(struct pipe_video_codec *decoder,
 
    desc.base = picture;
 
-   assert(target->base.buffer_format == PIPE_FORMAT_NV12);
-
-   ret = nvc0_decoder_bsp(dec, desc, target, comm_seq,
-                          num_buffers, data, num_bytes,
-                          &vp_caps, &is_ref, refs);
+   ret = nvc0_decoder_bsp_end(dec, desc, target, comm_seq, &vp_caps, &is_ref, refs);
 
    /* did we decode bitstream correctly? */
    assert(ret == 2);
@@ -164,14 +194,19 @@ nvc0_create_decoder(struct pipe_context *context,
    PUSH_DATA (push[2], dec->ppp->handle);
 
    dec->base.context = context;
+   dec->base.begin_frame = nvc0_decoder_begin_frame;
    dec->base.decode_bitstream = nvc0_decoder_decode_bitstream;
+   dec->base.end_frame = nvc0_decoder_end_frame;
 
    for (i = 0; i < NOUVEAU_VP3_VIDEO_QDEPTH && !ret; ++i)
       ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM,
                            0, 1 << 20, &cfg, &dec->bsp_bo[i]);
-   if (!ret)
+   if (!ret) {
+      /* total fudge factor... just has to be bigger for higher bitrates? */
+      unsigned inter_size = align(templ->width * templ->height * 2, 4 << 20);
       ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM,
-                           0x100, 4 << 20, &cfg, &dec->inter_bo[0]);
+                           0x100, inter_size, &cfg, &dec->inter_bo[0]);
+   }
    if (!ret) {
       ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM,
                            0x100, dec->inter_bo[0]->size, &cfg,
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_video.h b/src/gallium/drivers/nouveau/nvc0/nvc0_video.h
index 9ee0280f8ea..cf3c942355b 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_video.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_video.h
@@ -30,12 +30,18 @@
 #include "util/u_video.h"
 
 extern unsigned
-nvc0_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
-                 struct nouveau_vp3_video_buffer *target,
-                 unsigned comm_seq, unsigned num_buffers,
-                 const void *const *data, const unsigned *num_bytes,
-                 unsigned *vp_caps, unsigned *is_ref,
-                 struct nouveau_vp3_video_buffer *refs[16]);
+nvc0_decoder_bsp_begin(struct nouveau_vp3_decoder *dec, unsigned comm_seq);
+
+extern unsigned
+nvc0_decoder_bsp_next(struct nouveau_vp3_decoder *dec,
+                      unsigned comm_seq, unsigned num_buffers,
+                      const void *const *data, const unsigned *num_bytes);
+
+extern unsigned
+nvc0_decoder_bsp_end(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
+                     struct nouveau_vp3_video_buffer *target,
+                     unsigned comm_seq, unsigned *vp_caps, unsigned *is_ref,
+                     struct nouveau_vp3_video_buffer *refs[16]);
 
 extern void
 nvc0_decoder_vp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c b/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c
index 4392f62c530..c53f946a762 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c
@@ -32,40 +32,34 @@ static void dump_comm_bsp(struct comm *comm)
 #endif
 
 unsigned
-nvc0_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
-                 struct nouveau_vp3_video_buffer *target,
-                 unsigned comm_seq, unsigned num_buffers,
-                 const void *const *data, const unsigned *num_bytes,
-                 unsigned *vp_caps, unsigned *is_ref,
-                 struct nouveau_vp3_video_buffer *refs[16])
+nvc0_decoder_bsp_begin(struct nouveau_vp3_decoder *dec, unsigned comm_seq)
 {
-   struct nouveau_pushbuf *push = dec->pushbuf[0];
-   enum pipe_video_format codec = u_reduce_video_profile(dec->base.profile);
-   uint32_t bsp_addr, comm_addr, inter_addr;
-   uint32_t slice_size, bucket_size, ring_size, bsp_size;
-   uint32_t caps, i;
-   int ret;
    struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH];
-   struct nouveau_bo *inter_bo = dec->inter_bo[comm_seq & 1];
-   unsigned fence_extra = 0;
-   struct nouveau_pushbuf_refn bo_refs[] = {
-      { bsp_bo, NOUVEAU_BO_RD | NOUVEAU_BO_VRAM },
-      { inter_bo, NOUVEAU_BO_WR | NOUVEAU_BO_VRAM },
-#if NOUVEAU_VP3_DEBUG_FENCE
-      { dec->fence_bo, NOUVEAU_BO_WR | NOUVEAU_BO_GART },
-#endif
-      { dec->bitplane_bo, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
-   };
-   int num_refs = ARRAY_SIZE(bo_refs);
+   unsigned ret = 0;
 
-   if (!dec->bitplane_bo)
-      num_refs--;
+   ret = nouveau_bo_map(bsp_bo, NOUVEAU_BO_WR, dec->client);
+   if (ret) {
+      debug_printf("map failed: %i %s\n", ret, strerror(-ret));
+      return -1;
+   }
 
-#if NOUVEAU_VP3_DEBUG_FENCE
-   fence_extra = 4;
-#endif
+   nouveau_vp3_bsp_begin(dec);
+
+   return 2;
+}
 
-   bsp_size = NOUVEAU_VP3_BSP_RESERVED_SIZE;
+unsigned
+nvc0_decoder_bsp_next(struct nouveau_vp3_decoder *dec,
+                      unsigned comm_seq, unsigned num_buffers,
+                      const void *const *data, const unsigned *num_bytes)
+{
+   struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH];
+   struct nouveau_bo *inter_bo = dec->inter_bo[comm_seq & 1];
+   uint32_t bsp_size = 0;
+   uint32_t i = 0;
+   unsigned ret = 0;
+
+   bsp_size = dec->bsp_ptr - (char *)bsp_bo->map;
    for (i = 0; i < num_buffers; i++)
       bsp_size += num_bytes[i];
    bsp_size += 256; /* the 4 end markers */
@@ -81,14 +75,29 @@ nvc0_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
       bsp_size += (1 << 20) - 1;
       bsp_size &= ~((1 << 20) - 1);
 
-      ret = nouveau_bo_new(dec->bitplane_bo->device, NOUVEAU_BO_VRAM, 0, bsp_size, &cfg, &tmp_bo);
+      ret = nouveau_bo_new(dec->client->device, NOUVEAU_BO_VRAM, 0, bsp_size, &cfg, &tmp_bo);
       if (ret) {
          debug_printf("reallocating bsp %u -> %u failed with %i\n",
                       bsp_bo ? (unsigned)bsp_bo->size : 0, bsp_size, ret);
          return -1;
       }
+
+      ret = nouveau_bo_map(tmp_bo, NOUVEAU_BO_WR, dec->client);
+      if (ret) {
+         debug_printf("map failed: %i %s\n", ret, strerror(-ret));
+         return -1;
+      }
+
+      /* Preserve previous buffer. */
+      /* TODO: offload this copy to the GPU, as otherwise we're reading and
+       * writing to VRAM. */
+      memcpy(tmp_bo->map, bsp_bo->map, bsp_bo->size);
+
+      /* update position to current chunk */
+      dec->bsp_ptr = tmp_bo->map + (dec->bsp_ptr - (char *)bsp_bo->map);
+
       nouveau_bo_ref(NULL, &bsp_bo);
-      bo_refs[0].bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH] = bsp_bo = tmp_bo;
+      dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH] = bsp_bo = tmp_bo;
    }
 
    if (!inter_bo || bsp_bo->size * 4 > inter_bo->size) {
@@ -98,24 +107,61 @@ nvc0_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
       cfg.nvc0.tile_mode = 0x10;
       cfg.nvc0.memtype = 0xfe;
 
-      ret = nouveau_bo_new(dec->bitplane_bo->device, NOUVEAU_BO_VRAM, 0, bsp_bo->size * 4, &cfg, &tmp_bo);
+      ret = nouveau_bo_new(dec->client->device, NOUVEAU_BO_VRAM, 0, bsp_bo->size * 4, &cfg, &tmp_bo);
       if (ret) {
          debug_printf("reallocating inter %u -> %u failed with %i\n",
                       inter_bo ? (unsigned)inter_bo->size : 0, (unsigned)bsp_bo->size * 4, ret);
          return -1;
       }
+
+      ret = nouveau_bo_map(tmp_bo, NOUVEAU_BO_WR, dec->client);
+      if (ret) {
+         debug_printf("map failed: %i %s\n", ret, strerror(-ret));
+         return -1;
+      }
+
       nouveau_bo_ref(NULL, &inter_bo);
-      bo_refs[1].bo = dec->inter_bo[comm_seq & 1] = inter_bo = tmp_bo;
+      dec->inter_bo[comm_seq & 1] = inter_bo = tmp_bo;
    }
 
-   ret = nouveau_bo_map(bsp_bo, NOUVEAU_BO_WR, dec->client);
-   if (ret) {
-      debug_printf("map failed: %i %s\n", ret, strerror(-ret));
-      return -1;
-   }
+   nouveau_vp3_bsp_next(dec, num_buffers, data, num_bytes);
+
+   return 2;
+}
+
+
+unsigned
+nvc0_decoder_bsp_end(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
+                     struct nouveau_vp3_video_buffer *target, unsigned comm_seq,
+                     unsigned *vp_caps, unsigned *is_ref,
+                     struct nouveau_vp3_video_buffer *refs[16])
+{
+   struct nouveau_pushbuf *push = dec->pushbuf[0];
+   enum pipe_video_format codec = u_reduce_video_profile(dec->base.profile);
+   uint32_t bsp_addr, comm_addr, inter_addr;
+   uint32_t slice_size, bucket_size, ring_size;
+   uint32_t caps;
+   struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH];
+   struct nouveau_bo *inter_bo = dec->inter_bo[comm_seq & 1];
+   unsigned fence_extra = 0;
+   struct nouveau_pushbuf_refn bo_refs[] = {
+      { bsp_bo, NOUVEAU_BO_RD | NOUVEAU_BO_VRAM },
+      { inter_bo, NOUVEAU_BO_WR | NOUVEAU_BO_VRAM },
+#if NOUVEAU_VP3_DEBUG_FENCE
+      { dec->fence_bo, NOUVEAU_BO_WR | NOUVEAU_BO_GART },
+#endif
+      { dec->bitplane_bo, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
+   };
+   int num_refs = ARRAY_SIZE(bo_refs);
+
+   if (!dec->bitplane_bo)
+      num_refs--;
+
+#if NOUVEAU_VP3_DEBUG_FENCE
+   fence_extra = 4;
+#endif
 
-   caps = nouveau_vp3_bsp(dec, desc, target, comm_seq,
-                          num_buffers, data, num_bytes);
+   caps = nouveau_vp3_bsp_end(dec, desc);
 
    nouveau_vp3_vp_caps(dec, desc, target, comm_seq, vp_caps, is_ref, refs);
 
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h b/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h
index 4ea8ca3cfa2..79abe78b77a 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h
@@ -68,6 +68,7 @@ PUSH_REFN(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t flags)
 #define SUBC_SW(m) 7, (m)
 
 #define NVC0_3D_SERIALIZE NV50_GRAPH_SERIALIZE
+#define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8))
 
 static inline uint32_t
 NVC0_FIFO_PKHDR_SQ(int subc, int mthd, unsigned size)
diff --git a/src/gallium/drivers/r300/compiler/r500_fragprog.c b/src/gallium/drivers/r300/compiler/r500_fragprog.c
index 88aad8a054f..4c415afcb05 100644
--- a/src/gallium/drivers/r300/compiler/r500_fragprog.c
+++ b/src/gallium/drivers/r300/compiler/r500_fragprog.c
@@ -384,7 +384,7 @@ void r500FragmentProgramDump(struct radeon_compiler *c, void *user)
     case R500_INST_TYPE_OUT: str = "OUT"; break;
     case R500_INST_TYPE_FC: str = "FC"; break;
     case R500_INST_TYPE_TEX: str = "TEX"; break;
-    };
+    }
     fprintf(stderr,"%s %s %s %s %s ", str,
 	    inst & R500_INST_TEX_SEM_WAIT ? "TEX_WAIT" : "",
 	    inst & R500_INST_LAST ? "LAST" : "",
diff --git a/src/gallium/drivers/r300/r300_context.c b/src/gallium/drivers/r300/r300_context.c
index b393769c861..82ba0435118 100644
--- a/src/gallium/drivers/r300/r300_context.c
+++ b/src/gallium/drivers/r300/r300_context.c
@@ -421,8 +421,8 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
     r300->context.create_video_codec = vl_create_decoder;
     r300->context.create_video_buffer = vl_video_buffer_create;
 
-    r300->uploader = u_upload_create(&r300->context, 256 * 1024, 4,
-                                     PIPE_BIND_CUSTOM);
+    r300->uploader = u_upload_create(&r300->context, 256 * 1024,
+                                     PIPE_BIND_CUSTOM, PIPE_USAGE_STREAM);
 
     r300->blitter = util_blitter_create(&r300->context);
     if (r300->blitter == NULL)
diff --git a/src/gallium/drivers/r300/r300_render.c b/src/gallium/drivers/r300/r300_render.c
index b482fa140ed..7eda6753d0d 100644
--- a/src/gallium/drivers/r300/r300_render.c
+++ b/src/gallium/drivers/r300/r300_render.c
@@ -1010,7 +1010,7 @@ static void r300_render_draw_elements(struct vbuf_render* render,
     CS_LOCALS(r300);
     DBG(r300, DBG_DRAW, "r300: render_draw_elements (count: %d)\n", count);
 
-    u_upload_data(r300->uploader, 0, count * 2, indices,
+    u_upload_data(r300->uploader, 0, count * 2, 4, indices,
                   &index_buffer_offset, &index_buffer);
     if (!index_buffer) {
         return;
diff --git a/src/gallium/drivers/r300/r300_render_translate.c b/src/gallium/drivers/r300/r300_render_translate.c
index caeeec05909..7221211deea 100644
--- a/src/gallium/drivers/r300/r300_render_translate.c
+++ b/src/gallium/drivers/r300/r300_render_translate.c
@@ -37,7 +37,7 @@ void r300_translate_index_buffer(struct r300_context *r300,
     switch (*index_size) {
     case 1:
         *out_buffer = NULL;
-        u_upload_alloc(r300->uploader, 0, count * 2,
+        u_upload_alloc(r300->uploader, 0, count * 2, 4,
                        &out_offset, out_buffer, &ptr);
 
         util_shorten_ubyte_elts_to_userptr(
@@ -51,7 +51,7 @@ void r300_translate_index_buffer(struct r300_context *r300,
     case 2:
         if (index_offset) {
             *out_buffer = NULL;
-            u_upload_alloc(r300->uploader, 0, count * 2,
+            u_upload_alloc(r300->uploader, 0, count * 2, 4,
                            &out_offset, out_buffer, &ptr);
 
             util_rebuild_ushort_elts_to_userptr(&r300->context, ib,
@@ -65,7 +65,7 @@ void r300_translate_index_buffer(struct r300_context *r300,
     case 4:
         if (index_offset) {
             *out_buffer = NULL;
-            u_upload_alloc(r300->uploader, 0, count * 4,
+            u_upload_alloc(r300->uploader, 0, count * 4, 4,
                            &out_offset, out_buffer, &ptr);
 
             util_rebuild_uint_elts_to_userptr(&r300->context, ib,
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index 606e25f915b..d1b59ab4345 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -183,6 +183,8 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
         case PIPE_CAP_SAMPLE_SHADING:
         case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
         case PIPE_CAP_DRAW_INDIRECT:
+        case PIPE_CAP_MULTI_DRAW_INDIRECT:
+        case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
         case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
         case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
         case PIPE_CAP_SAMPLER_VIEW_TARGET:
@@ -200,6 +202,11 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
         case PIPE_CAP_SHAREABLE_SHADERS:
         case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
         case PIPE_CAP_CLEAR_TEXTURE:
+        case PIPE_CAP_DRAW_PARAMETERS:
+        case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+        case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+        case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+        case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
             return 0;
 
         /* SWTCL-only features. */
@@ -304,6 +311,7 @@ static int r300_get_shader_param(struct pipe_screen *pscreen, unsigned shader, e
         case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
         case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
         case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+        case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
             return 0;
         case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
             return 32;
@@ -362,6 +370,7 @@ static int r300_get_shader_param(struct pipe_screen *pscreen, unsigned shader, e
         case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
         case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
         case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+        case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
             return 0;
         case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
             return 32;
diff --git a/src/gallium/drivers/r300/r300_screen_buffer.c b/src/gallium/drivers/r300/r300_screen_buffer.c
index 737a6f5e4f8..42c8e3a0fc5 100644
--- a/src/gallium/drivers/r300/r300_screen_buffer.c
+++ b/src/gallium/drivers/r300/r300_screen_buffer.c
@@ -42,7 +42,7 @@ void r300_upload_index_buffer(struct r300_context *r300,
     *index_buffer = NULL;
 
     u_upload_data(r300->uploader,
-                  0, count * index_size,
+                  0, count * index_size, 4,
                   ptr + (*start * index_size),
                   &index_offset,
                   index_buffer);
diff --git a/src/gallium/drivers/r300/r300_state_derived.c b/src/gallium/drivers/r300/r300_state_derived.c
index da472f4d7f4..741e263e7ed 100644
--- a/src/gallium/drivers/r300/r300_state_derived.c
+++ b/src/gallium/drivers/r300/r300_state_derived.c
@@ -52,7 +52,6 @@ enum r300_rs_col_write_type {
 
 static void r300_draw_emit_attrib(struct r300_context* r300,
                                   enum attrib_emit emit,
-                                  enum interp_mode interp,
                                   int index)
 {
     struct r300_vertex_shader* vs = r300->vs_state.state;
@@ -62,7 +61,7 @@ static void r300_draw_emit_attrib(struct r300_context* r300,
     output = draw_find_shader_output(r300->draw,
                                      info->output_semantic_name[index],
                                      info->output_semantic_index[index]);
-    draw_emit_vertex_attr(&r300->vertex_info, emit, interp, output);
+    draw_emit_vertex_attr(&r300->vertex_info, emit, output);
 }
 
 static void r300_draw_emit_all_attribs(struct r300_context* r300)
@@ -73,31 +72,27 @@ static void r300_draw_emit_all_attribs(struct r300_context* r300)
 
     /* Position. */
     if (vs_outputs->pos != ATTR_UNUSED) {
-        r300_draw_emit_attrib(r300, EMIT_4F, INTERP_PERSPECTIVE,
-                              vs_outputs->pos);
+        r300_draw_emit_attrib(r300, EMIT_4F, vs_outputs->pos);
     } else {
         assert(0);
     }
 
     /* Point size. */
     if (vs_outputs->psize != ATTR_UNUSED) {
-        r300_draw_emit_attrib(r300, EMIT_1F_PSIZE, INTERP_POS,
-                              vs_outputs->psize);
+        r300_draw_emit_attrib(r300, EMIT_1F_PSIZE, vs_outputs->psize);
     }
 
     /* Colors. */
     for (i = 0; i < ATTR_COLOR_COUNT; i++) {
         if (vs_outputs->color[i] != ATTR_UNUSED) {
-            r300_draw_emit_attrib(r300, EMIT_4F, INTERP_LINEAR,
-                                  vs_outputs->color[i]);
+            r300_draw_emit_attrib(r300, EMIT_4F, vs_outputs->color[i]);
         }
     }
 
     /* Back-face colors. */
     for (i = 0; i < ATTR_COLOR_COUNT; i++) {
         if (vs_outputs->bcolor[i] != ATTR_UNUSED) {
-            r300_draw_emit_attrib(r300, EMIT_4F, INTERP_LINEAR,
-                                  vs_outputs->bcolor[i]);
+            r300_draw_emit_attrib(r300, EMIT_4F, vs_outputs->bcolor[i]);
         }
     }
 
@@ -108,16 +103,14 @@ static void r300_draw_emit_all_attribs(struct r300_context* r300)
     for (i = 0; i < ATTR_GENERIC_COUNT && gen_count < 8; i++) {
         if (vs_outputs->generic[i] != ATTR_UNUSED &&
             !(r300->sprite_coord_enable & (1 << i))) {
-            r300_draw_emit_attrib(r300, EMIT_4F, INTERP_PERSPECTIVE,
-                                  vs_outputs->generic[i]);
+            r300_draw_emit_attrib(r300, EMIT_4F, vs_outputs->generic[i]);
             gen_count++;
         }
     }
 
     /* Fog coordinates. */
     if (gen_count < 8 && vs_outputs->fog != ATTR_UNUSED) {
-        r300_draw_emit_attrib(r300, EMIT_4F, INTERP_PERSPECTIVE,
-                              vs_outputs->fog);
+        r300_draw_emit_attrib(r300, EMIT_4F, vs_outputs->fog);
         gen_count++;
     }
 
@@ -125,8 +118,7 @@ static void r300_draw_emit_all_attribs(struct r300_context* r300)
     if (r300_fs(r300)->shader->inputs.wpos != ATTR_UNUSED && gen_count < 8) {
         DBG(r300, DBG_SWTCL, "draw_emit_attrib: WPOS, index: %i\n",
             vs_outputs->wpos);
-        r300_draw_emit_attrib(r300, EMIT_4F, INTERP_PERSPECTIVE,
-                              vs_outputs->wpos);
+        r300_draw_emit_attrib(r300, EMIT_4F, vs_outputs->wpos);
     }
 }
 
diff --git a/src/gallium/drivers/r600/evergreen_compute.c b/src/gallium/drivers/r600/evergreen_compute.c
index d83eb17c280..20945ece155 100644
--- a/src/gallium/drivers/r600/evergreen_compute.c
+++ b/src/gallium/drivers/r600/evergreen_compute.c
@@ -600,7 +600,7 @@ static void evergreen_launch_grid(
                            ctx->screen->has_compressed_msaa_texturing);
                 bc->type = TGSI_PROCESSOR_COMPUTE;
                 bc->isa = ctx->isa;
-                r600_llvm_compile(mod, ctx->b.family, bc, &use_kill, dump);
+                r600_llvm_compile(mod, ctx->b.family, bc, &use_kill, dump, &ctx->b.debug);
 
                 if (dump && !sb_disasm) {
                         r600_bytecode_disasm(bc);
diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
index 1aee7dd2da8..9dfb84965cf 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -1956,7 +1956,7 @@ static void evergreen_emit_constant_buffers(struct r600_context *rctx,
 
 		if (!gs_ring_buffer) {
 			radeon_set_context_reg_flag(cs, reg_alu_constbuf_size + buffer_index * 4,
-						    ALIGN_DIVUP(cb->buffer_size, 256), pkt_flags);
+						    DIV_ROUND_UP(cb->buffer_size, 256), pkt_flags);
 			radeon_set_context_reg_flag(cs, reg_alu_const_cache + buffer_index * 4, va >> 8,
 						    pkt_flags);
 		}
diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c
index 1cc30317ba5..8b91372f3ae 100644
--- a/src/gallium/drivers/r600/r600_llvm.c
+++ b/src/gallium/drivers/r600/r600_llvm.c
@@ -726,7 +726,7 @@ static void tex_fetch_args(
 		 * That operand should be passed as a float value in the args array
 		 * right after the coord vector. After packing it's not used anymore,
 		 * that's why arg_count is not increased */
-		coords[4] = lp_build_emit_fetch(bld_base, inst, 1, 0);
+		coords[4] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
 	}
 
 	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
@@ -915,14 +915,17 @@ unsigned r600_llvm_compile(
 	enum radeon_family family,
 	struct r600_bytecode *bc,
 	boolean *use_kill,
-	unsigned dump)
+	unsigned dump,
+	struct pipe_debug_callback *debug)
 {
 	unsigned r;
 	struct radeon_shader_binary binary;
 	const char * gpu_family = r600_get_llvm_processor_name(family);
 
 	memset(&binary, 0, sizeof(struct radeon_shader_binary));
-	r = radeon_llvm_compile(mod, &binary, gpu_family, dump, dump, NULL);
+	if (dump)
+		LLVMDumpModule(mod);
+	r = radeon_llvm_compile(mod, &binary, gpu_family, NULL, debug);
 
 	r = r600_create_shader(bc, &binary, use_kill);
 
diff --git a/src/gallium/drivers/r600/r600_llvm.h b/src/gallium/drivers/r600/r600_llvm.h
index 9b5304d9fcb..f570b739fbe 100644
--- a/src/gallium/drivers/r600/r600_llvm.h
+++ b/src/gallium/drivers/r600/r600_llvm.h
@@ -7,6 +7,7 @@
 #include "radeon/radeon_llvm.h"
 #include <llvm-c/Core.h>
 
+struct pipe_debug_callback;
 struct r600_bytecode;
 struct r600_shader_ctx;
 struct radeon_llvm_context;
@@ -22,7 +23,8 @@ unsigned r600_llvm_compile(
 	enum radeon_family family,
 	struct r600_bytecode *bc,
 	boolean *use_kill,
-	unsigned dump);
+	unsigned dump,
+	struct pipe_debug_callback *debug);
 
 unsigned r600_create_shader(struct r600_bytecode *bc,
 		const struct radeon_shader_binary *binary,
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index 17006f70601..e61d9286542 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -348,6 +348,13 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
 	case PIPE_CAP_SHAREABLE_SHADERS:
 	case PIPE_CAP_CLEAR_TEXTURE:
+	case PIPE_CAP_DRAW_PARAMETERS:
+	case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+	case PIPE_CAP_MULTI_DRAW_INDIRECT:
+	case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
+	case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+	case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+	case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
 		return 0;
 
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
@@ -522,6 +529,7 @@ static int r600_get_shader_param(struct pipe_screen* pscreen, unsigned shader, e
 	case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
 	case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
 	case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+	case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
 		return 0;
 	case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
 		/* due to a bug in the shader compiler, some loops hang
diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index 31f2a729494..0e4dd16525b 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -946,7 +946,6 @@ static inline uint32_t S_FIXED(float value, uint32_t frac_bits)
 {
 	return value * (1 << frac_bits);
 }
-#define ALIGN_DIVUP(x, y) (((x) + (y) - 1) / (y))
 
 /* 12.4 fixed-point */
 static inline unsigned r600_pack_float_12p4(float x)
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index d411b0be50e..df40f94bdcf 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -162,7 +162,8 @@ int r600_pipe_shader_create(struct pipe_context *ctx,
 	struct r600_context *rctx = (struct r600_context *)ctx;
 	struct r600_pipe_shader_selector *sel = shader->selector;
 	int r;
-	bool dump = r600_can_dump_shader(&rctx->screen->b, sel->tokens);
+	bool dump = r600_can_dump_shader(&rctx->screen->b,
+					 tgsi_get_processor_type(sel->tokens));
 	unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB);
 	unsigned sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
 	unsigned export_shader;
@@ -394,7 +395,7 @@ static int tgsi_last_instruction(unsigned writemask)
 static int tgsi_is_supported(struct r600_shader_ctx *ctx)
 {
 	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
-	int j;
+	unsigned j;
 
 	if (i->Instruction.NumDstRegs > 1 && i->Instruction.Opcode != TGSI_OPCODE_DFRACEXP) {
 		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
@@ -1166,7 +1167,7 @@ static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_off
 */
 static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
 {
-	int i;
+	unsigned i;
 	int num_baryc;
 	struct tgsi_parse_context parse;
 
@@ -1585,7 +1586,7 @@ static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_regi
 static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
 {
 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
-	int i;
+	unsigned i;
 
 	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
 		struct tgsi_full_src_register *src = &inst->Src[i];
@@ -1854,7 +1855,7 @@ static int fetch_tcs_output(struct r600_shader_ctx *ctx, struct tgsi_full_src_re
 static int tgsi_split_lds_inputs(struct r600_shader_ctx *ctx)
 {
 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
-	int i;
+	unsigned i;
 
 	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
 		struct tgsi_full_src_register *src = &inst->Src[i];
@@ -2784,7 +2785,7 @@ static int r600_tess_factor_read(struct r600_shader_ctx *ctx,
 
 static int r600_emit_tess_factor(struct r600_shader_ctx *ctx)
 {
-	int i;
+	unsigned i;
 	int stride, outer_comps, inner_comps;
 	int tessinner_idx = -1, tessouter_idx = -1;
 	int r;
@@ -3238,7 +3239,8 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	if (use_llvm) {
 		struct radeon_llvm_context radeon_llvm_ctx;
 		LLVMModuleRef mod;
-		bool dump = r600_can_dump_shader(&rscreen->b, tokens);
+		bool dump = r600_can_dump_shader(&rscreen->b,
+						 tgsi_get_processor_type(tokens));
 		boolean use_kill = false;
 
 		memset(&radeon_llvm_ctx, 0, sizeof(radeon_llvm_ctx));
@@ -3259,7 +3261,8 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 		ctx.shader->has_txq_cube_array_z_comp = radeon_llvm_ctx.has_txq_cube_array_z_comp;
 		ctx.shader->uses_tex_buffers = radeon_llvm_ctx.uses_tex_buffers;
 
-		if (r600_llvm_compile(mod, rscreen->b.family, ctx.bc, &use_kill, dump)) {
+		if (r600_llvm_compile(mod, rscreen->b.family, ctx.bc, &use_kill,
+				      dump, &rctx->b.debug)) {
 			radeon_llvm_dispose(&radeon_llvm_ctx);
 			use_llvm = 0;
 			fprintf(stderr, "R600 LLVM backend failed to compile "
@@ -4424,7 +4427,7 @@ static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)
 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
 			alu.op = ctx->inst_info->op;
 			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
-				r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));;
+				r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));
 			}
 			alu.dst.sel = t1;
 			alu.dst.chan = i;
@@ -4791,7 +4794,7 @@ static int tgsi_lit(struct r600_shader_ctx *ctx)
 	{
 		int chan;
 		int sel;
-		int i;
+		unsigned i;
 
 		if (ctx->bc->chip_class == CAYMAN) {
 			for (i = 0; i < 3; i++) {
@@ -7925,7 +7928,7 @@ static int tgsi_exp(struct r600_shader_ctx *ctx)
 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
 	struct r600_bytecode_alu alu;
 	int r;
-	int i;
+	unsigned i;
 
 	/* result.x = 2^floor(src); */
 	if (inst->Dst[0].Register.WriteMask & 1) {
@@ -8054,7 +8057,7 @@ static int tgsi_log(struct r600_shader_ctx *ctx)
 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
 	struct r600_bytecode_alu alu;
 	int r;
-	int i;
+	unsigned i;
 
 	/* result.x = floor(log2(|src|)); */
 	if (inst->Dst[0].Register.WriteMask & 1) {
@@ -8781,7 +8784,7 @@ static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
 
 static int tgsi_endloop(struct r600_shader_ctx *ctx)
 {
-	int i;
+	unsigned i;
 
 	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);
 
diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c
index 43b80742cb5..f60e30486a2 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -1768,7 +1768,7 @@ static void r600_emit_constant_buffers(struct r600_context *rctx,
 
 		if (!gs_ring_buffer) {
 			radeon_set_context_reg(cs, reg_alu_constbuf_size + buffer_index * 4,
-					       ALIGN_DIVUP(cb->buffer_size, 256));
+					       DIV_ROUND_UP(cb->buffer_size, 256));
 			radeon_set_context_reg(cs, reg_alu_const_cache + buffer_index * 4, offset >> 8);
 		}
 
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index ca589fa7759..c3346f29811 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -1106,10 +1106,10 @@ static void r600_set_constant_buffer(struct pipe_context *ctx, uint shader, uint
 				tmpPtr[i] = util_cpu_to_le32(((uint32_t *)ptr)[i]);
 			}
 
-			u_upload_data(rctx->b.uploader, 0, size, tmpPtr, &cb->buffer_offset, &cb->buffer);
+			u_upload_data(rctx->b.uploader, 0, size, 256, tmpPtr, &cb->buffer_offset, &cb->buffer);
 			free(tmpPtr);
 		} else {
-			u_upload_data(rctx->b.uploader, 0, input->buffer_size, ptr, &cb->buffer_offset, &cb->buffer);
+			u_upload_data(rctx->b.uploader, 0, input->buffer_size, 256, ptr, &cb->buffer_offset, &cb->buffer);
 		}
 		/* account it in gtt */
 		rctx->b.gtt += input->buffer_size;
@@ -1732,7 +1732,7 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 				}
 			}
 
-			u_upload_alloc(rctx->b.uploader, start, count * 2,
+			u_upload_alloc(rctx->b.uploader, start, count * 2, 256,
 				       &out_offset, &out_buffer, &ptr);
 
 			util_shorten_ubyte_elts_to_userptr(
@@ -1753,7 +1753,7 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 		if (ib.user_buffer && (R600_BIG_ENDIAN || info.indirect ||
 						 info.instance_count > 1 ||
 						 info.count*ib.index_size > 20)) {
-			u_upload_data(rctx->b.uploader, 0, info.count * ib.index_size,
+			u_upload_data(rctx->b.uploader, 0, info.count * ib.index_size, 256,
 				      ib.user_buffer, &ib.offset, &ib.buffer);
 			ib.user_buffer = NULL;
 		}
diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c
index 18925277d2d..484f5c8d5b7 100644
--- a/src/gallium/drivers/radeon/r600_buffer_common.c
+++ b/src/gallium/drivers/radeon/r600_buffer_common.c
@@ -298,7 +298,7 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,
 			struct r600_resource *staging = NULL;
 
 			u_upload_alloc(rctx->uploader, 0, box->width + (box->x % R600_MAP_BUFFER_ALIGNMENT),
-				       &offset, (struct pipe_resource**)&staging, (void**)&data);
+				       256, &offset, (struct pipe_resource**)&staging, (void**)&data);
 
 			if (staging) {
 				data += box->x % R600_MAP_BUFFER_ALIGNMENT;
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 9a5e9878176..52c365e81d0 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -85,7 +85,7 @@ void r600_draw_rectangle(struct blitter_context *blitter,
 	/* Upload vertices. The hw rectangle has only 3 vertices,
 	 * I guess the 4th one is derived from the first 3.
 	 * The vertex specification should match u_blitter's vertex element state. */
-	u_upload_alloc(rctx->uploader, 0, sizeof(float) * 24, &offset, &buf, (void**)&vb);
+	u_upload_alloc(rctx->uploader, 0, sizeof(float) * 24, 256, &offset, &buf, (void**)&vb);
 	if (!buf)
 		return;
 
@@ -227,6 +227,17 @@ static enum pipe_reset_status r600_get_reset_status(struct pipe_context *ctx)
 	return PIPE_UNKNOWN_CONTEXT_RESET;
 }
 
+static void r600_set_debug_callback(struct pipe_context *ctx,
+				    const struct pipe_debug_callback *cb)
+{
+	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
+
+	if (cb)
+		rctx->debug = *cb;
+	else
+		memset(&rctx->debug, 0, sizeof(rctx->debug));
+}
+
 bool r600_common_context_init(struct r600_common_context *rctx,
 			      struct r600_common_screen *rscreen)
 {
@@ -252,6 +263,7 @@ bool r600_common_context_init(struct r600_common_context *rctx,
 	rctx->b.transfer_inline_write = u_default_transfer_inline_write;
         rctx->b.memory_barrier = r600_memory_barrier;
 	rctx->b.flush = r600_flush_from_st;
+	rctx->b.set_debug_callback = r600_set_debug_callback;
 
 	if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 43) {
 		rctx->b.get_device_reset_status = r600_get_reset_status;
@@ -272,9 +284,9 @@ bool r600_common_context_init(struct r600_common_context *rctx,
 	if (!rctx->allocator_so_filled_size)
 		return false;
 
-	rctx->uploader = u_upload_create(&rctx->b, 1024 * 1024, 256,
+	rctx->uploader = u_upload_create(&rctx->b, 1024 * 1024,
 					PIPE_BIND_INDEX_BUFFER |
-					PIPE_BIND_CONSTANT_BUFFER);
+					PIPE_BIND_CONSTANT_BUFFER, PIPE_USAGE_STREAM);
 	if (!rctx->uploader)
 		return false;
 
@@ -999,13 +1011,9 @@ void r600_destroy_common_screen(struct r600_common_screen *rscreen)
 }
 
 bool r600_can_dump_shader(struct r600_common_screen *rscreen,
-			  const struct tgsi_token *tokens)
+			  unsigned processor)
 {
-	/* Compute shader don't have tgsi_tokens */
-	if (!tokens)
-		return (rscreen->debug_flags & DBG_CS) != 0;
-
-	switch (tgsi_get_processor_type(tokens)) {
+	switch (processor) {
 	case TGSI_PROCESSOR_VERTEX:
 		return (rscreen->debug_flags & DBG_VS) != 0;
 	case TGSI_PROCESSOR_TESS_CTRL:
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index c3933b1da98..68b50a9fb0f 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -440,6 +440,8 @@ struct r600_common_context {
 	 * the GPU addresses are updated. */
 	struct list_head		texture_buffers;
 
+	struct pipe_debug_callback	debug;
+
 	/* Copy one resource to another using async DMA. */
 	void (*dma_copy)(struct pipe_context *ctx,
 			 struct pipe_resource *dst,
@@ -514,7 +516,7 @@ bool r600_common_context_init(struct r600_common_context *rctx,
 void r600_common_context_cleanup(struct r600_common_context *rctx);
 void r600_context_add_resource_size(struct pipe_context *ctx, struct pipe_resource *r);
 bool r600_can_dump_shader(struct r600_common_screen *rscreen,
-			  const struct tgsi_token *tokens);
+			  unsigned processor);
 void r600_screen_clear_buffer(struct r600_common_screen *rscreen, struct pipe_resource *dst,
 			      unsigned offset, unsigned size, unsigned value,
 			      bool is_framebuffer);
diff --git a/src/gallium/drivers/radeon/radeon_llvm_emit.c b/src/gallium/drivers/radeon/radeon_llvm_emit.c
index 61ed9402122..3d0987624a6 100644
--- a/src/gallium/drivers/radeon/radeon_llvm_emit.c
+++ b/src/gallium/drivers/radeon/radeon_llvm_emit.c
@@ -23,12 +23,15 @@
  * Authors: Tom Stellard <[email protected]>
  *
  */
+
 #include "radeon_llvm_emit.h"
 #include "radeon_elf_util.h"
 #include "c11/threads.h"
 #include "gallivm/lp_bld_misc.h"
+#include "util/u_debug.h"
 #include "util/u_memory.h"
 #include "pipe/p_shader_tokens.h"
+#include "pipe/p_state.h"
 
 #include <llvm-c/Target.h>
 #include <llvm-c/TargetMachine.h>
@@ -123,16 +126,44 @@ LLVMTargetRef radeon_llvm_get_r600_target(const char *triple)
 	return target;
 }
 
+struct radeon_llvm_diagnostics {
+	struct pipe_debug_callback *debug;
+	unsigned retval;
+};
+
 static void radeonDiagnosticHandler(LLVMDiagnosticInfoRef di, void *context)
 {
-	if (LLVMGetDiagInfoSeverity(di) == LLVMDSError) {
-		unsigned int *diagnosticflag = (unsigned int *)context;
-		char *diaginfo_message = LLVMGetDiagInfoDescription(di);
+	struct radeon_llvm_diagnostics *diag = (struct radeon_llvm_diagnostics *)context;
+	LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di);
+	char *description = LLVMGetDiagInfoDescription(di);
+	const char *severity_str = NULL;
+
+	switch (severity) {
+	case LLVMDSError:
+		severity_str = "error";
+		break;
+	case LLVMDSWarning:
+		severity_str = "warning";
+		break;
+	case LLVMDSRemark:
+		severity_str = "remark";
+		break;
+	case LLVMDSNote:
+		severity_str = "note";
+		break;
+	default:
+		severity_str = "unknown";
+	}
+
+	pipe_debug_message(diag->debug, SHADER_INFO,
+			   "LLVM diagnostic (%s): %s", severity_str, description);
 
-		*diagnosticflag = 1;
-		fprintf(stderr,"LLVM triggered Diagnostic Handler: %s\n", diaginfo_message);
-		LLVMDisposeMessage(diaginfo_message);
+	if (severity == LLVMDSError) {
+		diag->retval = 1;
+		fprintf(stderr,"LLVM triggered Diagnostic Handler: %s\n", description);
 	}
+
+	LLVMDisposeMessage(description);
 }
 
 /**
@@ -141,22 +172,25 @@ static void radeonDiagnosticHandler(LLVMDiagnosticInfoRef di, void *context)
  * @returns 0 for success, 1 for failure
  */
 unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binary,
-			     const char *gpu_family, bool dump_ir, bool dump_asm,
-			     LLVMTargetMachineRef tm)
+			     const char *gpu_family,
+			     LLVMTargetMachineRef tm,
+			     struct pipe_debug_callback *debug)
 {
-
+	struct radeon_llvm_diagnostics diag;
 	char cpu[CPU_STRING_LEN];
 	char fs[FS_STRING_LEN];
 	char *err;
 	bool dispose_tm = false;
 	LLVMContextRef llvm_ctx;
-	unsigned rval = 0;
 	LLVMMemoryBufferRef out_buffer;
 	unsigned buffer_size;
 	const char *buffer_data;
 	char triple[TRIPLE_STRING_LEN];
 	LLVMBool mem_err;
 
+	diag.debug = debug;
+	diag.retval = 0;
+
 	if (!tm) {
 		strncpy(triple, "r600--", TRIPLE_STRING_LEN);
 		LLVMTargetRef target = radeon_llvm_get_r600_target(triple);
@@ -165,20 +199,17 @@ unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binar
 		}
 		strncpy(cpu, gpu_family, CPU_STRING_LEN);
 		memset(fs, 0, sizeof(fs));
-		if (dump_asm)
-			strncpy(fs, "+DumpCode", FS_STRING_LEN);
+		strncpy(fs, "+DumpCode", FS_STRING_LEN);
 		tm = LLVMCreateTargetMachine(target, triple, cpu, fs,
 				  LLVMCodeGenLevelDefault, LLVMRelocDefault,
 						  LLVMCodeModelDefault);
 		dispose_tm = true;
 	}
-	if (dump_ir)
-		LLVMDumpModule(M);
+
 	/* Setup Diagnostic Handler*/
 	llvm_ctx = LLVMGetModuleContext(M);
 
-	LLVMContextSetDiagnosticHandler(llvm_ctx, radeonDiagnosticHandler, &rval);
-	rval = 0;
+	LLVMContextSetDiagnosticHandler(llvm_ctx, radeonDiagnosticHandler, &diag);
 
 	/* Compile IR*/
 	mem_err = LLVMTargetMachineEmitToMemoryBuffer(tm, M, LLVMObjectFile, &err,
@@ -187,15 +218,13 @@ unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binar
 	/* Process Errors/Warnings */
 	if (mem_err) {
 		fprintf(stderr, "%s: %s", __FUNCTION__, err);
+		pipe_debug_message(debug, SHADER_INFO,
+				   "LLVM emit error: %s", err);
 		FREE(err);
-		rval = 1;
+		diag.retval = 1;
 		goto out;
 	}
 
-	if (0 != rval) {
-		fprintf(stderr, "%s: Processing Diag Flag\n", __FUNCTION__);
-	}
-
 	/* Extract Shader Code*/
 	buffer_size = LLVMGetBufferSize(out_buffer);
 	buffer_data = LLVMGetBufferStart(out_buffer);
@@ -209,5 +238,7 @@ out:
 	if (dispose_tm) {
 		LLVMDisposeTargetMachine(tm);
 	}
-	return rval;
+	if (diag.retval != 0)
+		pipe_debug_message(debug, SHADER_INFO, "LLVM compile failed");
+	return diag.retval;
 }
diff --git a/src/gallium/drivers/radeon/radeon_llvm_emit.h b/src/gallium/drivers/radeon/radeon_llvm_emit.h
index e20aed94c6b..45f05a9e0e1 100644
--- a/src/gallium/drivers/radeon/radeon_llvm_emit.h
+++ b/src/gallium/drivers/radeon/radeon_llvm_emit.h
@@ -31,6 +31,7 @@
 #include <llvm-c/TargetMachine.h>
 #include <stdbool.h>
 
+struct pipe_debug_callback;
 struct radeon_shader_binary;
 
 void radeon_llvm_shader_type(LLVMValueRef F, unsigned type);
@@ -38,7 +39,8 @@ void radeon_llvm_shader_type(LLVMValueRef F, unsigned type);
 LLVMTargetRef radeon_llvm_get_r600_target(const char *triple);
 
 unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binary,
-			     const char *gpu_family, bool dump_ir, bool dump_asm,
-			     LLVMTargetMachineRef tm);
+			     const char *gpu_family,
+			     LLVMTargetMachineRef tm,
+			     struct pipe_debug_callback *debug);
 
 #endif /* RADEON_LLVM_EMIT_H */
diff --git a/src/gallium/drivers/radeonsi/cik_sdma.c b/src/gallium/drivers/radeonsi/cik_sdma.c
index 2de237b4716..105a1b2a878 100644
--- a/src/gallium/drivers/radeonsi/cik_sdma.c
+++ b/src/gallium/drivers/radeonsi/cik_sdma.c
@@ -196,7 +196,7 @@ static void cik_sdma_copy_tile(struct si_context *ctx,
 			(tile_split << 11) | (mt << 8) | (array_mode << 3) |
 			lbpe;
 		cs->buf[cs->cdw++] = y << 16; /* | x */
-		cs->buf[cs->cdw++] = 0; /* z */;
+		cs->buf[cs->cdw++] = 0; /* z */
 		cs->buf[cs->cdw++] = addr & 0xfffffffc;
 		cs->buf[cs->cdw++] = addr >> 32;
 		cs->buf[cs->cdw++] = (pitch / bpe) - 1;
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 47a74eea0e0..5a08cbfb198 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -67,9 +67,9 @@ static void init_scratch_buffer(struct si_context *sctx, struct si_compute *prog
 				program->shader.binary.global_symbol_offsets[i];
 		unsigned scratch_bytes_needed;
 
-		si_shader_binary_read_config(sctx->screen,
-						&program->shader, offset);
-		scratch_bytes_needed = program->shader.scratch_bytes_per_wave;
+		si_shader_binary_read_config(&program->shader.binary,
+					     &program->shader.config, offset);
+		scratch_bytes_needed = program->shader.config.scratch_bytes_per_wave;
 		scratch_bytes = MAX2(scratch_bytes, scratch_bytes_needed);
 	}
 
@@ -87,7 +87,7 @@ static void init_scratch_buffer(struct si_context *sctx, struct si_compute *prog
 	 * to the maximum bytes needed, so it can compute the stride
 	 * correctly.
 	 */
-	program->shader.scratch_bytes_per_wave = scratch_bytes;
+	program->shader.config.scratch_bytes_per_wave = scratch_bytes;
 
 	/* Patch the shader with the scratch buffer address. */
 	si_shader_apply_scratch_relocs(sctx,
@@ -122,8 +122,12 @@ static void *si_create_compute_state(
 	        for (i = 0; i < program->num_kernels; i++) {
 		        LLVMModuleRef mod = radeon_llvm_get_kernel_module(program->llvm_ctx, i,
                                                         code, header->num_bytes);
-			si_compile_llvm(sctx->screen, &program->kernels[i], sctx->tm,
-					mod);
+			si_compile_llvm(sctx->screen, &program->kernels[i].binary,
+					&program->kernels[i].config, sctx->tm,
+					mod, &sctx->b.debug, TGSI_PROCESSOR_COMPUTE);
+			si_shader_dump(sctx->screen, &program->kernels[i],
+				       &sctx->b.debug, TGSI_PROCESSOR_COMPUTE);
+			si_shader_binary_upload(sctx->screen, &program->kernels[i]);
 			LLVMDisposeModule(mod);
 		}
 	}
@@ -136,7 +140,11 @@ static void *si_create_compute_state(
 	 * the shader code to the GPU.
 	 */
 	init_scratch_buffer(sctx, program);
-	si_shader_binary_read(sctx->screen, &program->shader);
+	si_shader_binary_read_config(&program->shader.binary,
+				     &program->shader.config, 0);
+	si_shader_dump(sctx->screen, &program->shader, &sctx->b.debug,
+		       TGSI_PROCESSOR_COMPUTE);
+	si_shader_binary_upload(sctx->screen, &program->shader);
 
 #endif
 	program->input_buffer =	si_resource_create_custom(sctx->b.b.screen,
@@ -259,7 +267,7 @@ static void si_launch_grid(
 
 #if HAVE_LLVM >= 0x0306
 	/* Read the config information */
-	si_shader_binary_read_config(sctx->screen, shader, pc);
+	si_shader_binary_read_config(&shader->binary, &shader->config, pc);
 #endif
 
 	/* Upload the kernel arguments */
@@ -280,12 +288,12 @@ static void si_launch_grid(
 
 	memcpy(kernel_args + (num_work_size_bytes / 4), input, program->input_size);
 
-	if (shader->scratch_bytes_per_wave > 0) {
+	if (shader->config.scratch_bytes_per_wave > 0) {
 
 		COMPUTE_DBG(sctx->screen, "Waves: %u; Scratch per wave: %u bytes; "
 		            "Total Scratch: %u bytes\n", num_waves_for_scratch,
-			    shader->scratch_bytes_per_wave,
-			    shader->scratch_bytes_per_wave *
+			    shader->config.scratch_bytes_per_wave,
+			    shader->config.scratch_bytes_per_wave *
 			    num_waves_for_scratch);
 
 		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
@@ -312,7 +320,7 @@ static void si_launch_grid(
 	si_pm4_set_reg(pm4, R_00B900_COMPUTE_USER_DATA_0 + 8, scratch_buffer_va);
 	si_pm4_set_reg(pm4, R_00B900_COMPUTE_USER_DATA_0 + 12,
 		S_008F04_BASE_ADDRESS_HI(scratch_buffer_va >> 32)
-		|  S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64));
+		|  S_008F04_STRIDE(shader->config.scratch_bytes_per_wave / 64));
 
 	si_pm4_set_reg(pm4, R_00B810_COMPUTE_START_X, 0);
 	si_pm4_set_reg(pm4, R_00B814_COMPUTE_START_Y, 0);
@@ -360,9 +368,9 @@ static void si_launch_grid(
 	si_pm4_set_reg(pm4, R_00B830_COMPUTE_PGM_LO, shader_va >> 8);
 	si_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, shader_va >> 40);
 
-	si_pm4_set_reg(pm4, R_00B848_COMPUTE_PGM_RSRC1, shader->rsrc1);
+	si_pm4_set_reg(pm4, R_00B848_COMPUTE_PGM_RSRC1, shader->config.rsrc1);
 
-	lds_blocks = shader->lds_size;
+	lds_blocks = shader->config.lds_size;
 	/* XXX: We are over allocating LDS.  For SI, the shader reports LDS in
 	 * blocks of 256 bytes, so if there are 4 bytes lds allocated in
 	 * the shader and 4 bytes allocated by the state tracker, then
@@ -376,10 +384,10 @@ static void si_launch_grid(
 
 	assert(lds_blocks <= 0xFF);
 
-	shader->rsrc2 &= C_00B84C_LDS_SIZE;
-	shader->rsrc2 |=  S_00B84C_LDS_SIZE(lds_blocks);
+	shader->config.rsrc2 &= C_00B84C_LDS_SIZE;
+	shader->config.rsrc2 |=  S_00B84C_LDS_SIZE(lds_blocks);
 
-	si_pm4_set_reg(pm4, R_00B84C_COMPUTE_PGM_RSRC2, shader->rsrc2);
+	si_pm4_set_reg(pm4, R_00B84C_COMPUTE_PGM_RSRC2, shader->config.rsrc2);
 	si_pm4_set_reg(pm4, R_00B854_COMPUTE_RESOURCE_LIMITS, 0);
 
 	si_pm4_set_reg(pm4, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0,
@@ -401,7 +409,7 @@ static void si_launch_grid(
 		 * COMPUTE_PGM_RSRC2.SCRATCH_EN is enabled.
 		 */
 		S_00B860_WAVES(num_waves_for_scratch)
-		| S_00B860_WAVESIZE(shader->scratch_bytes_per_wave >> 10))
+		| S_00B860_WAVESIZE(shader->config.scratch_bytes_per_wave >> 10))
 		;
 
 	si_pm4_cmd_begin(pm4, PKT3_DISPATCH_DIRECT);
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index b3719dea252..d157a9ffb00 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -109,7 +109,7 @@ static bool si_upload_descriptors(struct si_context *sctx,
 	if (!desc->list_dirty)
 		return true;
 
-	u_upload_alloc(sctx->b.uploader, 0, list_size,
+	u_upload_alloc(sctx->b.uploader, 0, list_size, 256,
 		       &desc->buffer_offset,
 		       (struct pipe_resource**)&desc->buffer, &ptr);
 	if (!desc->buffer)
@@ -391,7 +391,7 @@ static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
 	 * directly through a staging buffer and don't go through
 	 * the fine-grained upload path.
 	 */
-	u_upload_alloc(sctx->b.uploader, 0, count * 16, &desc->buffer_offset,
+	u_upload_alloc(sctx->b.uploader, 0, count * 16, 256, &desc->buffer_offset,
 		       (struct pipe_resource**)&desc->buffer, (void**)&ptr);
 	if (!desc->buffer)
 		return false;
@@ -465,7 +465,7 @@ void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuf
 {
 	void *tmp;
 
-	u_upload_alloc(sctx->b.uploader, 0, size, const_offset,
+	u_upload_alloc(sctx->b.uploader, 0, size, 256, const_offset,
 		       (struct pipe_resource**)rbuffer, &tmp);
 	if (rbuffer)
 		util_memcpy_cpu_to_le32(tmp, ptr, size);
@@ -1011,19 +1011,19 @@ void si_init_all_descriptors(struct si_context *sctx)
 
 	for (i = 0; i < SI_NUM_SHADERS; i++) {
 		si_init_buffer_resources(&sctx->const_buffers[i],
-					 SI_NUM_CONST_BUFFERS, SI_SGPR_CONST,
+					 SI_NUM_CONST_BUFFERS, SI_SGPR_CONST_BUFFERS,
 					 RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
 		si_init_buffer_resources(&sctx->rw_buffers[i],
 					 SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
 					 RADEON_USAGE_READWRITE, RADEON_PRIO_RINGS_STREAMOUT);
 
 		si_init_descriptors(&sctx->samplers[i].views.desc,
-				    SI_SGPR_RESOURCE, 8, SI_NUM_SAMPLER_VIEWS);
+				    SI_SGPR_SAMPLER_VIEWS, 8, SI_NUM_SAMPLER_VIEWS);
 		si_init_descriptors(&sctx->samplers[i].states.desc,
-				    SI_SGPR_SAMPLER, 4, SI_NUM_SAMPLER_STATES);
+				    SI_SGPR_SAMPLER_STATES, 4, SI_NUM_SAMPLER_STATES);
 	}
 
-	si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFER,
+	si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS,
 			    4, SI_NUM_VERTEX_BUFFERS);
 
 	/* Set pipe_context functions. */
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index ac13407e2a1..c2ca94339ac 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -340,6 +340,13 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
 	case PIPE_CAP_VERTEXID_NOBASE:
 	case PIPE_CAP_CLEAR_TEXTURE:
+	case PIPE_CAP_DRAW_PARAMETERS:
+	case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+	case PIPE_CAP_MULTI_DRAW_INDIRECT:
+	case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
+	case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+	case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+	case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
 		return 0;
 
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
@@ -512,6 +519,8 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu
 		return 1;
 	case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
 		return 32;
+	case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
+		return 0;
 	}
 	return 0;
 }
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 0e98784d51b..1db3e484915 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -82,11 +82,11 @@ struct si_shader_context
 	int param_es2gs_offset;
 	LLVMTargetMachineRef tm;
 	LLVMValueRef const_md;
-	LLVMValueRef const_resource[SI_NUM_CONST_BUFFERS];
+	LLVMValueRef const_buffers[SI_NUM_CONST_BUFFERS];
 	LLVMValueRef lds;
 	LLVMValueRef *constants[SI_NUM_CONST_BUFFERS];
-	LLVMValueRef resources[SI_NUM_SAMPLER_VIEWS];
-	LLVMValueRef samplers[SI_NUM_SAMPLER_STATES];
+	LLVMValueRef sampler_views[SI_NUM_SAMPLER_VIEWS];
+	LLVMValueRef sampler_states[SI_NUM_SAMPLER_STATES];
 	LLVMValueRef so_buffers[4];
 	LLVMValueRef esgs_ring;
 	LLVMValueRef gsvs_ring[4];
@@ -394,7 +394,7 @@ static void declare_input_vs(
 	LLVMValueRef input;
 
 	/* Load the T list */
-	t_list_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_VERTEX_BUFFER);
+	t_list_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_VERTEX_BUFFERS);
 
 	t_offset = lp_build_const_int32(gallivm, input_index);
 
@@ -1065,7 +1065,7 @@ static LLVMValueRef load_sample_position(struct radeon_llvm_context *radeon_bld,
 	struct lp_build_context *uint_bld = &radeon_bld->soa.bld_base.uint_bld;
 	struct gallivm_state *gallivm = &radeon_bld->gallivm;
 	LLVMBuilderRef builder = gallivm->builder;
-	LLVMValueRef desc = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST);
+	LLVMValueRef desc = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
 	LLVMValueRef buf_index = lp_build_const_int32(gallivm, SI_DRIVER_STATE_CONST_BUF);
 	LLVMValueRef resource = build_indexed_load_const(si_shader_ctx, desc, buf_index);
 
@@ -1233,13 +1233,13 @@ static LLVMValueRef fetch_constant(
 	}
 
 	if (reg->Register.Dimension && reg->Dimension.Indirect) {
-		LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST);
+		LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
 		LLVMValueRef index;
 		index = get_indirect_index(si_shader_ctx, &reg->DimIndirect,
 						   reg->Dimension.Index);
 		bufp = build_indexed_load_const(si_shader_ctx, ptr, index);
 	} else
-		bufp = si_shader_ctx->const_resource[buf];
+		bufp = si_shader_ctx->const_buffers[buf];
 
 	addr = si_shader_ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle];
 	addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
@@ -1260,7 +1260,7 @@ static LLVMValueRef fetch_constant(
 		addr2 = lp_build_add(&bld_base->uint_bld, addr2,
 				     lp_build_const_int32(base->gallivm, idx * 4));
 
-		result2 = buffer_load_const(base->gallivm->builder, si_shader_ctx->const_resource[buf],
+		result2 = buffer_load_const(base->gallivm->builder, si_shader_ctx->const_buffers[buf],
 				   addr2, bld_base->base.elem_type);
 
 		result = radeon_llvm_emit_fetch_double(bld_base,
@@ -1302,18 +1302,8 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
 	if (si_shader_ctx->type == TGSI_PROCESSOR_FRAGMENT) {
 		int cbuf = target - V_008DFC_SQ_EXP_MRT;
 
-		if (cbuf >= 0 && cbuf < 8) {
+		if (cbuf >= 0 && cbuf < 8)
 			compressed = (si_shader_ctx->shader->key.ps.export_16bpc >> cbuf) & 0x1;
-
-			if (compressed)
-				si_shader_ctx->shader->spi_shader_col_format |=
-					V_028714_SPI_SHADER_FP16_ABGR << (4 * cbuf);
-			else
-				si_shader_ctx->shader->spi_shader_col_format |=
-					V_028714_SPI_SHADER_32_ABGR << (4 * cbuf);
-
-			si_shader_ctx->shader->cb_shader_mask |= 0xf << (4 * cbuf);
-		}
 	}
 
 	/* Set COMPR flag */
@@ -1333,34 +1323,19 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
 						    LLVMInt32TypeInContext(base->gallivm->context),
 						    pack_args, 2,
 						    LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
-			args[chan + 7] = args[chan + 5] =
+			args[chan + 5] =
 				LLVMBuildBitCast(base->gallivm->builder,
 						 packed,
 						 LLVMFloatTypeInContext(base->gallivm->context),
 						 "");
+			args[chan + 7] = base->undef;
 		}
 	} else
 		memcpy(&args[5], values, sizeof(values[0]) * 4);
 }
 
-/* Load from output pointers and initialize arguments for the shader export intrinsic */
-static void si_llvm_init_export_args_load(struct lp_build_tgsi_context *bld_base,
-					  LLVMValueRef *out_ptr,
-					  unsigned target,
-					  LLVMValueRef *args)
-{
-	struct gallivm_state *gallivm = bld_base->base.gallivm;
-	LLVMValueRef values[4];
-	int i;
-
-	for (i = 0; i < 4; i++)
-		values[i] = LLVMBuildLoad(gallivm->builder, out_ptr[i], "");
-
-	si_llvm_init_export_args(bld_base, values, target, args);
-}
-
 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
-			  LLVMValueRef alpha_ptr)
+			  LLVMValueRef alpha)
 {
 	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
@@ -1372,8 +1347,7 @@ static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
 		LLVMValueRef alpha_pass =
 			lp_build_cmp(&bld_base->base,
 				     si_shader_ctx->shader->key.ps.alpha_func,
-				     LLVMBuildLoad(gallivm->builder, alpha_ptr, ""),
-				     alpha_ref);
+				     alpha, alpha_ref);
 		LLVMValueRef arg =
 			lp_build_select(&bld_base->base,
 					alpha_pass,
@@ -1390,16 +1364,14 @@ static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
 				LLVMVoidTypeInContext(gallivm->context),
 				NULL, 0, 0);
 	}
-
-	si_shader_ctx->shader->db_shader_control |= S_02880C_KILL_ENABLE(1);
 }
 
-static void si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
-					  LLVMValueRef alpha_ptr)
+static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
+						  LLVMValueRef alpha)
 {
 	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
-	LLVMValueRef coverage, alpha;
+	LLVMValueRef coverage;
 
 	/* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
 	coverage = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
@@ -1417,9 +1389,7 @@ static void si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base
 				 lp_build_const_float(gallivm,
 					1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
 
-	alpha = LLVMBuildLoad(gallivm->builder, alpha_ptr, "");
-	alpha = LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
-	LLVMBuildStore(gallivm->builder, alpha, alpha_ptr);
+	return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
 }
 
 static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context * bld_base,
@@ -1432,7 +1402,7 @@ static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context * bld_base,
 	unsigned chan;
 	unsigned const_chan;
 	LLVMValueRef base_elt;
-	LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST);
+	LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
 	LLVMValueRef constbuf_index = lp_build_const_int32(base->gallivm, SI_DRIVER_STATE_CONST_BUF);
 	LLVMValueRef const_resource = build_indexed_load_const(si_shader_ctx, ptr, constbuf_index);
 
@@ -2112,197 +2082,193 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context * bld_base)
 	FREE(outputs);
 }
 
-static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
+static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
+			   LLVMValueRef depth, LLVMValueRef stencil,
+			   LLVMValueRef samplemask)
 {
-	struct si_shader_context * si_shader_ctx = si_shader_context(bld_base);
-	struct si_shader * shader = si_shader_ctx->shader;
-	struct lp_build_context * base = &bld_base->base;
-	struct lp_build_context * uint = &bld_base->uint_bld;
-	struct tgsi_shader_info *info = &shader->selector->info;
-	LLVMBuilderRef builder = base->gallivm->builder;
+	struct si_screen *sscreen = si_shader_context(bld_base)->screen;
+	struct lp_build_context *base = &bld_base->base;
+	struct lp_build_context *uint = &bld_base->uint_bld;
 	LLVMValueRef args[9];
-	LLVMValueRef last_args[9] = { 0 };
-	int depth_index = -1, stencil_index = -1, samplemask_index = -1;
-	int i;
+	unsigned mask = 0;
 
-	for (i = 0; i < info->num_outputs; i++) {
-		unsigned semantic_name = info->output_semantic_name[i];
-		unsigned semantic_index = info->output_semantic_index[i];
-		unsigned target;
-		LLVMValueRef alpha_ptr;
+	assert(depth || stencil || samplemask);
 
-		/* Select the correct target */
-		switch (semantic_name) {
-		case TGSI_SEMANTIC_POSITION:
-			depth_index = i;
-			continue;
-		case TGSI_SEMANTIC_STENCIL:
-			stencil_index = i;
-			continue;
-		case TGSI_SEMANTIC_SAMPLEMASK:
-			samplemask_index = i;
-			continue;
-		case TGSI_SEMANTIC_COLOR:
-			target = V_008DFC_SQ_EXP_MRT + semantic_index;
-			alpha_ptr = si_shader_ctx->radeon_bld.soa.outputs[i][3];
+	args[1] = uint->one; /* whether the EXEC mask is valid */
+	args[2] = uint->one; /* DONE bit */
 
-			if (si_shader_ctx->shader->key.ps.clamp_color) {
-				for (int j = 0; j < 4; j++) {
-					LLVMValueRef ptr = si_shader_ctx->radeon_bld.soa.outputs[i][j];
-					LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
+	/* Specify the target we are exporting */
+	args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRTZ);
 
-					result = radeon_llvm_saturate(bld_base, result);
-					LLVMBuildStore(builder, result, ptr);
-				}
-			}
+	args[4] = uint->zero; /* COMP flag */
+	args[5] = base->undef; /* R, depth */
+	args[6] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
+	args[7] = base->undef; /* B, sample mask */
+	args[8] = base->undef; /* A, alpha to mask */
 
-			if (si_shader_ctx->shader->key.ps.alpha_to_one)
-				LLVMBuildStore(base->gallivm->builder,
-					       base->one, alpha_ptr);
+	if (depth) {
+		args[5] = depth;
+		mask |= 0x1;
+	}
 
-			if (semantic_index == 0 &&
-			    si_shader_ctx->shader->key.ps.alpha_func != PIPE_FUNC_ALWAYS)
-				si_alpha_test(bld_base, alpha_ptr);
+	if (stencil) {
+		args[6] = stencil;
+		mask |= 0x2;
+	}
 
-			if (si_shader_ctx->shader->key.ps.poly_line_smoothing)
-				si_scale_alpha_by_sample_mask(bld_base, alpha_ptr);
+	if (samplemask) {
+		args[7] = samplemask;
+		mask |= 0x4;
+	}
 
-			break;
-		default:
-			target = 0;
-			fprintf(stderr,
-				"Warning: SI unhandled fs output type:%d\n",
-				semantic_name);
-		}
+	/* SI (except OLAND) has a bug that it only looks
+	 * at the X writemask component. */
+	if (sscreen->b.chip_class == SI &&
+	    sscreen->b.family != CHIP_OLAND)
+		mask |= 0x1;
 
-		si_llvm_init_export_args_load(bld_base,
-					      si_shader_ctx->radeon_bld.soa.outputs[i],
-					      target, args);
-
-		if (semantic_name == TGSI_SEMANTIC_COLOR) {
-			/* If there is an export instruction waiting to be emitted, do so now. */
-			if (last_args[0]) {
-				lp_build_intrinsic(base->gallivm->builder,
-						   "llvm.SI.export",
-						   LLVMVoidTypeInContext(base->gallivm->context),
-						   last_args, 9, 0);
-			}
+	/* Specify which components to enable */
+	args[0] = lp_build_const_int32(base->gallivm, mask);
 
-			/* This instruction will be emitted at the end of the shader. */
-			memcpy(last_args, args, sizeof(args));
-
-			/* Handle FS_COLOR0_WRITES_ALL_CBUFS. */
-			if (shader->selector->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
-			    semantic_index == 0 &&
-			    si_shader_ctx->shader->key.ps.last_cbuf > 0) {
-				for (int c = 1; c <= si_shader_ctx->shader->key.ps.last_cbuf; c++) {
-					si_llvm_init_export_args_load(bld_base,
-								      si_shader_ctx->radeon_bld.soa.outputs[i],
-								      V_008DFC_SQ_EXP_MRT + c, args);
-					lp_build_intrinsic(base->gallivm->builder,
-							   "llvm.SI.export",
-							   LLVMVoidTypeInContext(base->gallivm->context),
-							   args, 9, 0);
-				}
-			}
-		} else {
-			lp_build_intrinsic(base->gallivm->builder,
-					   "llvm.SI.export",
+	lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
+			   LLVMVoidTypeInContext(base->gallivm->context),
+			   args, 9, 0);
+}
+
+static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
+				LLVMValueRef *color, unsigned index,
+				bool is_last)
+{
+	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct lp_build_context *base = &bld_base->base;
+	LLVMValueRef args[9];
+	int i;
+
+	/* Clamp color */
+	if (si_shader_ctx->shader->key.ps.clamp_color)
+		for (i = 0; i < 4; i++)
+			color[i] = radeon_llvm_saturate(bld_base, color[i]);
+
+	/* Alpha to one */
+	if (si_shader_ctx->shader->key.ps.alpha_to_one)
+		color[3] = base->one;
+
+	/* Alpha test */
+	if (index == 0 &&
+	    si_shader_ctx->shader->key.ps.alpha_func != PIPE_FUNC_ALWAYS)
+		si_alpha_test(bld_base, color[3]);
+
+	/* Line & polygon smoothing */
+	if (si_shader_ctx->shader->key.ps.poly_line_smoothing)
+		color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3]);
+
+	/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
+	if (index == 0 &&
+	    si_shader_ctx->shader->key.ps.last_cbuf > 0) {
+		for (int c = 1; c <= si_shader_ctx->shader->key.ps.last_cbuf; c++) {
+			si_llvm_init_export_args(bld_base, color,
+						 V_008DFC_SQ_EXP_MRT + c, args);
+			lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
 					   LLVMVoidTypeInContext(base->gallivm->context),
 					   args, 9, 0);
 		}
 	}
 
-	if (depth_index >= 0 || stencil_index >= 0 || samplemask_index >= 0) {
-		LLVMValueRef out_ptr;
-		unsigned mask = 0;
-
-		/* Specify the target we are exporting */
-		args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRTZ);
-
-		args[5] = base->zero; /* R, depth */
-		args[6] = base->zero; /* G, stencil test value[0:7], stencil op value[8:15] */
-		args[7] = base->zero; /* B, sample mask */
-		args[8] = base->zero; /* A, alpha to mask */
-
-		if (depth_index >= 0) {
-			out_ptr = si_shader_ctx->radeon_bld.soa.outputs[depth_index][2];
-			args[5] = LLVMBuildLoad(base->gallivm->builder, out_ptr, "");
-			mask |= 0x1;
-			si_shader_ctx->shader->db_shader_control |= S_02880C_Z_EXPORT_ENABLE(1);
-		}
-
-		if (stencil_index >= 0) {
-			out_ptr = si_shader_ctx->radeon_bld.soa.outputs[stencil_index][1];
-			args[6] = LLVMBuildLoad(base->gallivm->builder, out_ptr, "");
-			mask |= 0x2;
-			si_shader_ctx->shader->db_shader_control |=
-				S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(1);
-		}
-
-		if (samplemask_index >= 0) {
-			out_ptr = si_shader_ctx->radeon_bld.soa.outputs[samplemask_index][0];
-			args[7] = LLVMBuildLoad(base->gallivm->builder, out_ptr, "");
-			mask |= 0x4;
-			si_shader_ctx->shader->db_shader_control |= S_02880C_MASK_EXPORT_ENABLE(1);
-		}
-
-		/* SI (except OLAND) has a bug that it only looks
-		 * at the X writemask component. */
-		if (si_shader_ctx->screen->b.chip_class == SI &&
-		    si_shader_ctx->screen->b.family != CHIP_OLAND)
-			mask |= 0x1;
+	/* Export */
+	si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
+				 args);
+	if (is_last) {
+		args[1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
+		args[2] = bld_base->uint_bld.one; /* DONE bit */
+	}
+	lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
+			   LLVMVoidTypeInContext(base->gallivm->context),
+			   args, 9, 0);
+}
 
-		if (samplemask_index >= 0)
-			si_shader_ctx->shader->spi_shader_z_format = V_028710_SPI_SHADER_32_ABGR;
-		else if (stencil_index >= 0)
-			si_shader_ctx->shader->spi_shader_z_format = V_028710_SPI_SHADER_32_GR;
-		else
-			si_shader_ctx->shader->spi_shader_z_format = V_028710_SPI_SHADER_32_R;
+static void si_export_null(struct lp_build_tgsi_context *bld_base)
+{
+	struct lp_build_context *base = &bld_base->base;
+	struct lp_build_context *uint = &bld_base->uint_bld;
+	LLVMValueRef args[9];
 
-		/* Specify which components to enable */
-		args[0] = lp_build_const_int32(base->gallivm, mask);
+	args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
+	args[1] = uint->one; /* whether the EXEC mask is valid */
+	args[2] = uint->one; /* DONE bit */
+	args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
+	args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
+	args[5] = uint->undef; /* R */
+	args[6] = uint->undef; /* G */
+	args[7] = uint->undef; /* B */
+	args[8] = uint->undef; /* A */
+
+	lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
+			   LLVMVoidTypeInContext(base->gallivm->context),
+			   args, 9, 0);
+}
 
-		args[1] =
-		args[2] =
-		args[4] = uint->zero;
+static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
+{
+	struct si_shader_context * si_shader_ctx = si_shader_context(bld_base);
+	struct si_shader * shader = si_shader_ctx->shader;
+	struct lp_build_context * base = &bld_base->base;
+	struct tgsi_shader_info *info = &shader->selector->info;
+	LLVMBuilderRef builder = base->gallivm->builder;
+	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
+	int last_color_export = -1;
+	int i;
 
-		if (last_args[0])
-			lp_build_intrinsic(base->gallivm->builder,
-					   "llvm.SI.export",
-					   LLVMVoidTypeInContext(base->gallivm->context),
-					   args, 9, 0);
-		else
-			memcpy(last_args, args, sizeof(args));
+	/* If there are no outputs, add a dummy export. */
+	if (!info->num_outputs) {
+		si_export_null(bld_base);
+		return;
 	}
 
-	if (!last_args[0]) {
-		/* Specify which components to enable */
-		last_args[0] = lp_build_const_int32(base->gallivm, 0x0);
+	/* Determine the last export. If MRTZ is present, it's always last.
+	 * Otherwise, find the last color export.
+	 */
+	if (!info->writes_z && !info->writes_stencil && !info->writes_samplemask)
+		for (i = 0; i < info->num_outputs; i++)
+			if (info->output_semantic_name[i] == TGSI_SEMANTIC_COLOR)
+				last_color_export = i;
 
-		/* Specify the target we are exporting */
-		last_args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRT);
+	for (i = 0; i < info->num_outputs; i++) {
+		unsigned semantic_name = info->output_semantic_name[i];
+		unsigned semantic_index = info->output_semantic_index[i];
+		unsigned j;
+		LLVMValueRef color[4] = {};
 
-		/* Set COMPR flag to zero to export data as 32-bit */
-		last_args[4] = uint->zero;
+		/* Select the correct target */
+		switch (semantic_name) {
+		case TGSI_SEMANTIC_POSITION:
+			depth = LLVMBuildLoad(builder,
+					      si_shader_ctx->radeon_bld.soa.outputs[i][2], "");
+			break;
+		case TGSI_SEMANTIC_STENCIL:
+			stencil = LLVMBuildLoad(builder,
+						si_shader_ctx->radeon_bld.soa.outputs[i][1], "");
+			break;
+		case TGSI_SEMANTIC_SAMPLEMASK:
+			samplemask = LLVMBuildLoad(builder,
+						   si_shader_ctx->radeon_bld.soa.outputs[i][0], "");
+			break;
+		case TGSI_SEMANTIC_COLOR:
+			for (j = 0; j < 4; j++)
+				color[j] = LLVMBuildLoad(builder,
+							 si_shader_ctx->radeon_bld.soa.outputs[i][j], "");
 
-		/* dummy bits */
-		last_args[5]= uint->zero;
-		last_args[6]= uint->zero;
-		last_args[7]= uint->zero;
-		last_args[8]= uint->zero;
+			si_export_mrt_color(bld_base, color, semantic_index,
+					    last_color_export == i);
+			break;
+		default:
+			fprintf(stderr,
+				"Warning: SI unhandled fs output type:%d\n",
+				semantic_name);
+		}
 	}
 
-	/* Specify whether the EXEC mask represents the valid mask */
-	last_args[1] = uint->one;
-
-	/* Specify that this is the last export */
-	last_args[2] = lp_build_const_int32(base->gallivm, 1);
-
-	lp_build_intrinsic(base->gallivm->builder,
-			   "llvm.SI.export",
-			   LLVMVoidTypeInContext(base->gallivm->context),
-			   last_args, 9, 0);
+	if (depth || stencil || samplemask)
+		si_export_mrt_z(bld_base, depth, stencil, samplemask);
 }
 
 static void build_tex_intrinsic(const struct lp_build_tgsi_action * action,
@@ -2390,10 +2356,10 @@ static void tex_fetch_ptrs(
 
 		ind_index = get_indirect_index(si_shader_ctx, &reg->Indirect, reg->Register.Index);
 
-		*res_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_RESOURCE);
+		*res_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER_VIEWS);
 		*res_ptr = build_indexed_load_const(si_shader_ctx, *res_ptr, ind_index);
 
-		*samp_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER);
+		*samp_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER_STATES);
 		*samp_ptr = build_indexed_load_const(si_shader_ctx, *samp_ptr, ind_index);
 
 		if (target == TGSI_TEXTURE_2D_MSAA ||
@@ -2401,13 +2367,13 @@ static void tex_fetch_ptrs(
 			ind_index = LLVMBuildAdd(gallivm->builder, ind_index,
 						 lp_build_const_int32(gallivm,
 								      SI_FMASK_TEX_OFFSET), "");
-			*fmask_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_RESOURCE);
+			*fmask_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER_VIEWS);
 			*fmask_ptr = build_indexed_load_const(si_shader_ctx, *fmask_ptr, ind_index);
 		}
 	} else {
-		*res_ptr = si_shader_ctx->resources[sampler_index];
-		*samp_ptr = si_shader_ctx->samplers[sampler_index];
-		*fmask_ptr = si_shader_ctx->resources[SI_FMASK_TEX_OFFSET + sampler_index];
+		*res_ptr = si_shader_ctx->sampler_views[sampler_index];
+		*samp_ptr = si_shader_ctx->sampler_states[sampler_index];
+		*fmask_ptr = si_shader_ctx->sampler_views[SI_FMASK_TEX_OFFSET + sampler_index];
 	}
 }
 
@@ -2487,7 +2453,7 @@ static void tex_fetch_args(
 		emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
 		emit_data->args[0] = res;
 		emit_data->args[1] = bld_base->uint_bld.zero;
-		emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0);
+		emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
 		emit_data->arg_count = 3;
 		return;
 	}
@@ -2536,12 +2502,12 @@ static void tex_fetch_args(
 	if (opcode == TGSI_OPCODE_TXB)
 		address[count++] = coords[3];
 	if (opcode == TGSI_OPCODE_TXB2)
-		address[count++] = lp_build_emit_fetch(bld_base, inst, 1, 0);
+		address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
 
 	/* Pack depth comparison value */
 	if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
 		if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
-			address[count++] = lp_build_emit_fetch(bld_base, inst, 1, 0);
+			address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
 		} else {
 			assert(ref_pos >= 0);
 			address[count++] = coords[ref_pos];
@@ -2612,7 +2578,7 @@ static void tex_fetch_args(
 	if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXF)
 		address[count++] = coords[3];
 	else if (opcode == TGSI_OPCODE_TXL2)
-		address[count++] = lp_build_emit_fetch(bld_base, inst, 1, 0);
+		address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
 
 	if (count > 16) {
 		assert(!"Cannot handle more than 16 texture address parameters");
@@ -3105,10 +3071,10 @@ static void interp_fetch_args(
 		/* offset is in second src, first two channels */
 		emit_data->args[0] = lp_build_emit_fetch(bld_base,
 							 emit_data->inst, 1,
-							 0);
+							 TGSI_CHAN_X);
 		emit_data->args[1] = lp_build_emit_fetch(bld_base,
 							 emit_data->inst, 1,
-							 1);
+							 TGSI_CHAN_Y);
 		emit_data->arg_count = 2;
 	} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
 		LLVMValueRef sample_position;
@@ -3119,7 +3085,7 @@ static void interp_fetch_args(
 		 * and place into first two channels.
 		 */
 		sample_id = lp_build_emit_fetch(bld_base,
-						emit_data->inst, 1, 0);
+						emit_data->inst, 1, TGSI_CHAN_X);
 		sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
 					     LLVMInt32TypeInContext(gallivm->context),
 					     "");
@@ -3432,15 +3398,15 @@ static void create_function(struct si_shader_context *si_shader_ctx)
 	v16i8 = LLVMVectorType(i8, 16);
 
 	params[SI_PARAM_RW_BUFFERS] = const_array(v16i8, SI_NUM_RW_BUFFERS);
-	params[SI_PARAM_CONST] = const_array(v16i8, SI_NUM_CONST_BUFFERS);
-	params[SI_PARAM_SAMPLER] = const_array(v4i32, SI_NUM_SAMPLER_STATES);
-	params[SI_PARAM_RESOURCE] = const_array(v8i32, SI_NUM_SAMPLER_VIEWS);
-	last_array_pointer = SI_PARAM_RESOURCE;
+	params[SI_PARAM_CONST_BUFFERS] = const_array(v16i8, SI_NUM_CONST_BUFFERS);
+	params[SI_PARAM_SAMPLER_STATES] = const_array(v4i32, SI_NUM_SAMPLER_STATES);
+	params[SI_PARAM_SAMPLER_VIEWS] = const_array(v8i32, SI_NUM_SAMPLER_VIEWS);
+	last_array_pointer = SI_PARAM_SAMPLER_VIEWS;
 
 	switch (si_shader_ctx->type) {
 	case TGSI_PROCESSOR_VERTEX:
-		params[SI_PARAM_VERTEX_BUFFER] = const_array(v16i8, SI_NUM_VERTEX_BUFFERS);
-		last_array_pointer = SI_PARAM_VERTEX_BUFFER;
+		params[SI_PARAM_VERTEX_BUFFERS] = const_array(v16i8, SI_NUM_VERTEX_BUFFERS);
+		last_array_pointer = SI_PARAM_VERTEX_BUFFERS;
 		params[SI_PARAM_BASE_VERTEX] = i32;
 		params[SI_PARAM_START_INSTANCE] = i32;
 		num_params = SI_PARAM_START_INSTANCE+1;
@@ -3452,8 +3418,8 @@ static void create_function(struct si_shader_context *si_shader_ctx)
 			num_params = SI_PARAM_LS_OUT_LAYOUT+1;
 		} else {
 			if (shader->is_gs_copy_shader) {
-				last_array_pointer = SI_PARAM_CONST;
-				num_params = SI_PARAM_CONST+1;
+				last_array_pointer = SI_PARAM_CONST_BUFFERS;
+				num_params = SI_PARAM_CONST_BUFFERS+1;
 			} else {
 				params[SI_PARAM_VS_STATE_BITS] = i32;
 				num_params = SI_PARAM_VS_STATE_BITS+1;
@@ -3610,7 +3576,7 @@ static void preload_constants(struct si_shader_context *si_shader_ctx)
 	struct gallivm_state * gallivm = bld_base->base.gallivm;
 	const struct tgsi_shader_info * info = bld_base->info;
 	unsigned buf;
-	LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST);
+	LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
 
 	for (buf = 0; buf < SI_NUM_CONST_BUFFERS; buf++) {
 		unsigned i, num_const = info->const_file_max[buf] + 1;
@@ -3622,14 +3588,14 @@ static void preload_constants(struct si_shader_context *si_shader_ctx)
 		si_shader_ctx->constants[buf] = CALLOC(num_const * 4, sizeof(LLVMValueRef));
 
 		/* Load the resource descriptor */
-		si_shader_ctx->const_resource[buf] =
+		si_shader_ctx->const_buffers[buf] =
 			build_indexed_load_const(si_shader_ctx, ptr, lp_build_const_int32(gallivm, buf));
 
 		/* Load the constants, we rely on the code sinking to do the rest */
 		for (i = 0; i < num_const * 4; ++i) {
 			si_shader_ctx->constants[buf][i] =
 				buffer_load_const(gallivm->builder,
-					si_shader_ctx->const_resource[buf],
+					si_shader_ctx->const_buffers[buf],
 					lp_build_const_int32(gallivm, i * 4),
 					bld_base->base.elem_type);
 		}
@@ -3650,23 +3616,23 @@ static void preload_samplers(struct si_shader_context *si_shader_ctx)
 	if (num_samplers == 0)
 		return;
 
-	res_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_RESOURCE);
-	samp_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER);
+	res_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER_VIEWS);
+	samp_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER_STATES);
 
 	/* Load the resources and samplers, we rely on the code sinking to do the rest */
 	for (i = 0; i < num_samplers; ++i) {
 		/* Resource */
 		offset = lp_build_const_int32(gallivm, i);
-		si_shader_ctx->resources[i] = build_indexed_load_const(si_shader_ctx, res_ptr, offset);
+		si_shader_ctx->sampler_views[i] = build_indexed_load_const(si_shader_ctx, res_ptr, offset);
 
 		/* Sampler */
 		offset = lp_build_const_int32(gallivm, i);
-		si_shader_ctx->samplers[i] = build_indexed_load_const(si_shader_ctx, samp_ptr, offset);
+		si_shader_ctx->sampler_states[i] = build_indexed_load_const(si_shader_ctx, samp_ptr, offset);
 
 		/* FMASK resource */
 		if (info->is_msaa_sampler[i]) {
 			offset = lp_build_const_int32(gallivm, SI_FMASK_TEX_OFFSET + i);
-			si_shader_ctx->resources[SI_FMASK_TEX_OFFSET + i] =
+			si_shader_ctx->sampler_views[SI_FMASK_TEX_OFFSET + i] =
 				build_indexed_load_const(si_shader_ctx, res_ptr, offset);
 		}
 	}
@@ -3741,20 +3707,19 @@ static void preload_ring_buffers(struct si_shader_context *si_shader_ctx)
 	}
 }
 
-void si_shader_binary_read_config(const struct si_screen *sscreen,
-				struct si_shader *shader,
-				unsigned symbol_offset)
+void si_shader_binary_read_config(struct radeon_shader_binary *binary,
+				  struct si_shader_config *conf,
+				  unsigned symbol_offset)
 {
 	unsigned i;
 	const unsigned char *config =
-		radeon_shader_binary_config_start(&shader->binary,
-						symbol_offset);
+		radeon_shader_binary_config_start(binary, symbol_offset);
 
 	/* XXX: We may be able to emit some of these values directly rather than
 	 * extracting fields to be emitted later.
 	 */
 
-	for (i = 0; i < shader->binary.config_size_per_symbol; i+= 8) {
+	for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
 		unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
 		unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
 		switch (reg) {
@@ -3762,25 +3727,25 @@ void si_shader_binary_read_config(const struct si_screen *sscreen,
 		case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
 		case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
 		case R_00B848_COMPUTE_PGM_RSRC1:
-			shader->num_sgprs = MAX2(shader->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
-			shader->num_vgprs = MAX2(shader->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
-			shader->float_mode =  G_00B028_FLOAT_MODE(value);
-			shader->rsrc1 = value;
+			conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
+			conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
+			conf->float_mode =  G_00B028_FLOAT_MODE(value);
+			conf->rsrc1 = value;
 			break;
 		case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
-			shader->lds_size = MAX2(shader->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
+			conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
 			break;
 		case R_00B84C_COMPUTE_PGM_RSRC2:
-			shader->lds_size = MAX2(shader->lds_size, G_00B84C_LDS_SIZE(value));
-			shader->rsrc2 = value;
+			conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
+			conf->rsrc2 = value;
 			break;
 		case R_0286CC_SPI_PS_INPUT_ENA:
-			shader->spi_ps_input_ena = value;
+			conf->spi_ps_input_ena = value;
 			break;
 		case R_0286E8_SPI_TMPRING_SIZE:
 		case R_00B860_COMPUTE_TMPRING_SIZE:
 			/* WAVESIZE is in units of 256 dwords. */
-			shader->scratch_bytes_per_wave =
+			conf->scratch_bytes_per_wave =
 				G_00B860_WAVESIZE(value) * 256 * 4 * 1;
 			break;
 		default:
@@ -3799,7 +3764,7 @@ void si_shader_apply_scratch_relocs(struct si_context *sctx,
 	uint32_t scratch_rsrc_dword0 = scratch_va;
 	uint32_t scratch_rsrc_dword1 =
 		S_008F04_BASE_ADDRESS_HI(scratch_va >> 32)
-		|  S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64);
+		|  S_008F04_STRIDE(shader->config.scratch_bytes_per_wave / 64);
 
 	for (i = 0 ; i < shader->binary.reloc_count; i++) {
 		const struct radeon_shader_reloc *reloc =
@@ -3840,80 +3805,124 @@ int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
 	return 0;
 }
 
-int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader)
+static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary,
+				       struct pipe_debug_callback *debug)
 {
-	const struct radeon_shader_binary *binary = &shader->binary;
-	unsigned i;
-	int r;
-	bool dump  = r600_can_dump_shader(&sscreen->b,
-		shader->selector ? shader->selector->tokens : NULL);
+	char *line, *p;
+	unsigned i, count;
+
+	if (binary->disasm_string) {
+		fprintf(stderr, "\nShader Disassembly:\n\n");
+		fprintf(stderr, "%s\n", binary->disasm_string);
+
+		if (debug && debug->debug_message) {
+			/* Very long debug messages are cut off, so send the
+			 * disassembly one line at a time. This causes more
+			 * overhead, but on the plus side it simplifies
+			 * parsing of resulting logs.
+			 */
+			pipe_debug_message(debug, SHADER_INFO,
+					   "Shader Disassembly Begin");
 
-	si_shader_binary_read_config(sscreen, shader, 0);
-	r = si_shader_binary_upload(sscreen, shader);
-	if (r)
-		return r;
-
-	if (dump) {
-		if (!(sscreen->b.debug_flags & DBG_NO_ASM)) {
-			if (binary->disasm_string) {
-				fprintf(stderr, "\nShader Disassembly:\n\n");
-				fprintf(stderr, "%s\n", binary->disasm_string);
-			} else {
-				fprintf(stderr, "SI CODE:\n");
-				for (i = 0; i < binary->code_size; i+=4 ) {
-					fprintf(stderr, "@0x%x: %02x%02x%02x%02x\n", i, binary->code[i + 3],
-					binary->code[i + 2], binary->code[i + 1],
-					binary->code[i]);
+			line = binary->disasm_string;
+			while (*line) {
+				p = strchrnul(line, '\n');
+				count = p - line;
+
+				if (count) {
+					pipe_debug_message(debug, SHADER_INFO,
+							   "%.*s", count, line);
 				}
+
+				if (!*p)
+					break;
+				line = p + 1;
 			}
+
+			pipe_debug_message(debug, SHADER_INFO,
+					   "Shader Disassembly End");
+		}
+	} else {
+		fprintf(stderr, "SI CODE:\n");
+		for (i = 0; i < binary->code_size; i += 4) {
+			fprintf(stderr, "@0x%x: %02x%02x%02x%02x\n", i,
+				binary->code[i + 3], binary->code[i + 2],
+				binary->code[i + 1], binary->code[i]);
 		}
+	}
+}
 
+static void si_shader_dump_stats(struct si_screen *sscreen,
+			         struct si_shader_config *conf,
+				 unsigned code_size,
+			         struct pipe_debug_callback *debug,
+			         unsigned processor)
+{
+	if (r600_can_dump_shader(&sscreen->b, processor)) {
 		fprintf(stderr, "*** SHADER STATS ***\n"
 			"SGPRS: %d\nVGPRS: %d\nCode Size: %d bytes\nLDS: %d blocks\n"
 			"Scratch: %d bytes per wave\n********************\n",
-			shader->num_sgprs, shader->num_vgprs, binary->code_size,
-			shader->lds_size, shader->scratch_bytes_per_wave);
+			conf->num_sgprs, conf->num_vgprs, code_size,
+			conf->lds_size, conf->scratch_bytes_per_wave);
 	}
-	return 0;
+
+	pipe_debug_message(debug, SHADER_INFO,
+			   "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d LDS: %d Scratch: %d",
+			   conf->num_sgprs, conf->num_vgprs, code_size,
+			   conf->lds_size, conf->scratch_bytes_per_wave);
+}
+
+void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
+		    struct pipe_debug_callback *debug, unsigned processor)
+{
+	if (r600_can_dump_shader(&sscreen->b, processor))
+		if (!(sscreen->b.debug_flags & DBG_NO_ASM))
+			si_shader_dump_disassembly(&shader->binary, debug);
+
+	si_shader_dump_stats(sscreen, &shader->config,
+			     shader->binary.code_size, debug, processor);
 }
 
-int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
-		    LLVMTargetMachineRef tm, LLVMModuleRef mod)
+int si_compile_llvm(struct si_screen *sscreen,
+		    struct radeon_shader_binary *binary,
+		    struct si_shader_config *conf,
+		    LLVMTargetMachineRef tm,
+		    LLVMModuleRef mod,
+		    struct pipe_debug_callback *debug,
+		    unsigned processor)
 {
 	int r = 0;
-	bool dump_asm = r600_can_dump_shader(&sscreen->b,
-				shader->selector ? shader->selector->tokens : NULL);
-	bool dump_ir = dump_asm && !(sscreen->b.debug_flags & DBG_NO_IR);
 	unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);
 
-	if (dump_ir || dump_asm)
+	if (r600_can_dump_shader(&sscreen->b, processor)) {
 		fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
 
-	if (!si_replace_shader(count, &shader->binary)) {
-		r = radeon_llvm_compile(mod, &shader->binary,
-			r600_get_llvm_processor_name(sscreen->b.family), dump_ir, dump_asm, tm);
+		if (!(sscreen->b.debug_flags & DBG_NO_IR))
+			LLVMDumpModule(mod);
+	}
+
+	if (!si_replace_shader(count, binary)) {
+		r = radeon_llvm_compile(mod, binary,
+			r600_get_llvm_processor_name(sscreen->b.family), tm,
+			debug);
 		if (r)
 			return r;
 	}
 
-	r = si_shader_binary_read(sscreen, shader);
+	si_shader_binary_read_config(binary, conf, 0);
 
-	FREE(shader->binary.config);
-	FREE(shader->binary.rodata);
-	FREE(shader->binary.global_symbol_offsets);
-	if (shader->scratch_bytes_per_wave == 0) {
-		FREE(shader->binary.code);
-		FREE(shader->binary.relocs);
-		memset(&shader->binary, 0,
-		       offsetof(struct radeon_shader_binary, disasm_string));
-	}
+	FREE(binary->config);
+	FREE(binary->global_symbol_offsets);
+	binary->config = NULL;
+	binary->global_symbol_offsets = NULL;
 	return r;
 }
 
 /* Generate code for the hardware VS shader stage to go with a geometry shader */
 static int si_generate_gs_copy_shader(struct si_screen *sscreen,
 				      struct si_shader_context *si_shader_ctx,
-				      struct si_shader *gs, bool dump)
+				      struct si_shader *gs, bool dump,
+				      struct pipe_debug_callback *debug)
 {
 	struct gallivm_state *gallivm = &si_shader_ctx->radeon_bld.gallivm;
 	struct lp_build_tgsi_context *bld_base = &si_shader_ctx->radeon_bld.soa.bld_base;
@@ -3979,8 +3988,15 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen,
 	if (dump)
 		fprintf(stderr, "Copy Vertex Shader for Geometry Shader:\n\n");
 
-	r = si_compile_llvm(sscreen, si_shader_ctx->shader,
-			    si_shader_ctx->tm, bld_base->base.gallivm->module);
+	r = si_compile_llvm(sscreen, &si_shader_ctx->shader->binary,
+			    &si_shader_ctx->shader->config, si_shader_ctx->tm,
+			    bld_base->base.gallivm->module,
+			    debug, TGSI_PROCESSOR_GEOMETRY);
+	if (!r) {
+		si_shader_dump(sscreen, si_shader_ctx->shader, debug,
+			       TGSI_PROCESSOR_GEOMETRY);
+		r = si_shader_binary_upload(sscreen, si_shader_ctx->shader);
+	}
 
 	radeon_llvm_dispose(&si_shader_ctx->radeon_bld);
 
@@ -4034,7 +4050,8 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
 }
 
 int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
-		     struct si_shader *shader)
+		     struct si_shader *shader,
+		     struct pipe_debug_callback *debug)
 {
 	struct si_shader_selector *sel = shader->selector;
 	struct tgsi_token *tokens = sel->tokens;
@@ -4045,11 +4062,12 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 	int r = 0;
 	bool poly_stipple = sel->type == PIPE_SHADER_FRAGMENT &&
 			    shader->key.ps.poly_stipple;
-	bool dump = r600_can_dump_shader(&sscreen->b, sel->tokens);
+	bool dump = r600_can_dump_shader(&sscreen->b, sel->info.processor);
 
 	if (poly_stipple) {
 		tokens = util_pstipple_create_fragment_shader(tokens, NULL,
-						SI_POLY_STIPPLE_SAMPLER);
+						SI_POLY_STIPPLE_SAMPLER,
+						TGSI_FILE_INPUT);
 		tgsi_scan_shader(tokens, &stipple_shader_info);
 	}
 
@@ -4070,9 +4088,6 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 	if (sel->type != PIPE_SHADER_COMPUTE)
 		shader->dx10_clamp_mode = true;
 
-	if (sel->info.uses_kill)
-		shader->db_shader_control |= S_02880C_KILL_ENABLE(1);
-
 	shader->uses_instanceid = sel->info.uses_instanceid;
 	bld_base->info = poly_stipple ? &stipple_shader_info : &sel->info;
 	bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
@@ -4147,17 +4162,6 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 	case TGSI_PROCESSOR_FRAGMENT:
 		si_shader_ctx.radeon_bld.load_input = declare_input_fs;
 		bld_base->emit_epilogue = si_llvm_emit_fs_epilogue;
-
-		switch (sel->info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT]) {
-		case TGSI_FS_DEPTH_LAYOUT_GREATER:
-			shader->db_shader_control |=
-				S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_GREATER_THAN_Z);
-			break;
-		case TGSI_FS_DEPTH_LAYOUT_LESS:
-			shader->db_shader_control |=
-				S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_LESS_THAN_Z);
-			break;
-		}
 		break;
 	default:
 		assert(!"Unsupported shader type");
@@ -4188,12 +4192,21 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 	radeon_llvm_finalize_module(&si_shader_ctx.radeon_bld);
 
 	mod = bld_base->base.gallivm->module;
-	r = si_compile_llvm(sscreen, shader, tm, mod);
+	r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
+			    mod, debug, si_shader_ctx.type);
 	if (r) {
 		fprintf(stderr, "LLVM failed to compile shader\n");
 		goto out;
 	}
 
+	si_shader_dump(sscreen, shader, debug, si_shader_ctx.type);
+
+	r = si_shader_binary_upload(sscreen, shader);
+	if (r) {
+		fprintf(stderr, "LLVM failed to upload shader\n");
+		goto out;
+	}
+
 	radeon_llvm_dispose(&si_shader_ctx.radeon_bld);
 
 	if (si_shader_ctx.type == TGSI_PROCESSOR_GEOMETRY) {
@@ -4202,7 +4215,7 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 		shader->gs_copy_shader->key = shader->key;
 		si_shader_ctx.shader = shader->gs_copy_shader;
 		if ((r = si_generate_gs_copy_shader(sscreen, &si_shader_ctx,
-						    shader, dump))) {
+						    shader, dump, debug))) {
 			free(shader->gs_copy_shader);
 			shader->gs_copy_shader = NULL;
 			goto out;
@@ -4217,6 +4230,14 @@ out:
 	return r;
 }
 
+void si_shader_destroy_binary(struct radeon_shader_binary *binary)
+{
+	FREE(binary->code);
+	FREE(binary->rodata);
+	FREE(binary->relocs);
+	FREE(binary->disasm_string);
+}
+
 void si_shader_destroy(struct si_shader *shader)
 {
 	if (shader->gs_copy_shader) {
@@ -4228,8 +4249,5 @@ void si_shader_destroy(struct si_shader *shader)
 		r600_resource_reference(&shader->scratch_bo, NULL);
 
 	r600_resource_reference(&shader->bo, NULL);
-
-	FREE(shader->binary.code);
-	FREE(shader->binary.relocs);
-	FREE(shader->binary.disasm_string);
+	si_shader_destroy_binary(&shader->binary);
 }
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index b0c8680ecb3..1635358d505 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -76,10 +76,10 @@ struct radeon_shader_binary;
 struct radeon_shader_reloc;
 
 #define SI_SGPR_RW_BUFFERS	0  /* rings (& stream-out, VS only) */
-#define SI_SGPR_CONST		2
-#define SI_SGPR_SAMPLER		4
-#define SI_SGPR_RESOURCE	6
-#define SI_SGPR_VERTEX_BUFFER	8  /* VS only */
+#define SI_SGPR_CONST_BUFFERS	2
+#define SI_SGPR_SAMPLER_STATES	4
+#define SI_SGPR_SAMPLER_VIEWS	6
+#define SI_SGPR_VERTEX_BUFFERS	8  /* VS only */
 #define SI_SGPR_BASE_VERTEX	10 /* VS only */
 #define SI_SGPR_START_INSTANCE	11 /* VS only */
 #define SI_SGPR_VS_STATE_BITS	12 /* VS(VS) only */
@@ -101,12 +101,12 @@ struct radeon_shader_reloc;
 
 /* LLVM function parameter indices */
 #define SI_PARAM_RW_BUFFERS	0
-#define SI_PARAM_CONST		1
-#define SI_PARAM_SAMPLER	2
-#define SI_PARAM_RESOURCE	3
+#define SI_PARAM_CONST_BUFFERS	1
+#define SI_PARAM_SAMPLER_STATES	2
+#define SI_PARAM_SAMPLER_VIEWS	3
 
 /* VS only parameters */
-#define SI_PARAM_VERTEX_BUFFER	4
+#define SI_PARAM_VERTEX_BUFFERS	4
 #define SI_PARAM_BASE_VERTEX	5
 #define SI_PARAM_START_INSTANCE	6
 /* [0] = clamp vertex color */
@@ -201,6 +201,7 @@ struct si_shader_selector {
 	bool		forces_persample_interp_for_persp;
 	bool		forces_persample_interp_for_linear;
 
+	/* GS parameters. */
 	unsigned	esgs_itemsize;
 	unsigned	gs_input_verts_per_prim;
 	unsigned	gs_output_prim;
@@ -210,6 +211,9 @@ struct si_shader_selector {
 	unsigned	gsvs_vertex_size;
 	unsigned	max_gsvs_emit_size;
 
+	/* PS parameters. */
+	unsigned	db_shader_control;
+
 	/* masks of "get_unique_index" bits */
 	uint64_t	outputs_written;
 	uint32_t	patch_outputs_written;
@@ -258,6 +262,17 @@ union si_shader_key {
 	} tes; /* tessellation evaluation shader */
 };
 
+struct si_shader_config {
+	unsigned			num_sgprs;
+	unsigned			num_vgprs;
+	unsigned			lds_size;
+	unsigned			spi_ps_input_ena;
+	unsigned			float_mode;
+	unsigned			scratch_bytes_per_wave;
+	unsigned			rsrc1;
+	unsigned			rsrc2;
+};
+
 struct si_shader {
 	struct si_shader_selector	*selector;
 	struct si_shader		*next_variant;
@@ -266,18 +281,9 @@ struct si_shader {
 	struct si_pm4_state		*pm4;
 	struct r600_resource		*bo;
 	struct r600_resource		*scratch_bo;
-	struct radeon_shader_binary	binary;
-	unsigned			num_sgprs;
-	unsigned			num_vgprs;
-	unsigned			lds_size;
-	unsigned			spi_ps_input_ena;
-	unsigned			float_mode;
-	unsigned			scratch_bytes_per_wave;
-	unsigned			spi_shader_col_format;
-	unsigned			spi_shader_z_format;
-	unsigned			db_shader_control;
-	unsigned			cb_shader_mask;
 	union si_shader_key		key;
+	struct radeon_shader_binary	binary;
+	struct si_shader_config		config;
 
 	unsigned		nparam;
 	unsigned		vs_output_param_offset[PIPE_MAX_SHADER_OUTPUTS];
@@ -288,9 +294,6 @@ struct si_shader {
 	unsigned		nr_param_exports;
 	bool			is_gs_copy_shader;
 	bool			dx10_clamp_mode; /* convert NaNs to 0 */
-
-	unsigned		rsrc1;
-	unsigned		rsrc2;
 };
 
 static inline struct tgsi_shader_info *si_get_vs_info(struct si_context *sctx)
@@ -327,19 +330,27 @@ static inline bool si_vs_exports_prim_id(struct si_shader *shader)
 
 /* radeonsi_shader.c */
 int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
-		     struct si_shader *shader);
+		     struct si_shader *shader,
+		     struct pipe_debug_callback *debug);
 void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f);
-int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
-		    LLVMTargetMachineRef tm, LLVMModuleRef mod);
+int si_compile_llvm(struct si_screen *sscreen,
+		    struct radeon_shader_binary *binary,
+		    struct si_shader_config *conf,
+		    LLVMTargetMachineRef tm,
+		    LLVMModuleRef mod,
+		    struct pipe_debug_callback *debug,
+		    unsigned processor);
 void si_shader_destroy(struct si_shader *shader);
+void si_shader_destroy_binary(struct radeon_shader_binary *binary);
 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index);
 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader);
-int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader);
+void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
+		    struct pipe_debug_callback *debug, unsigned processor);
 void si_shader_apply_scratch_relocs(struct si_context *sctx,
 			struct si_shader *shader,
 			uint64_t scratch_va);
-void si_shader_binary_read_config(const struct si_screen *sscreen,
-				struct si_shader *shader,
-				unsigned symbol_offset);
+void si_shader_binary_read_config(struct radeon_shader_binary *binary,
+				  struct si_shader_config *conf,
+				  unsigned symbol_offset);
 
 #endif
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index e5500111f43..91ccd073267 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -163,7 +163,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 	perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size;
 
 	lds_size = output_patch0_offset + output_patch_size * *num_patches;
-	ls_rsrc2 = ls->current->rsrc2;
+	ls_rsrc2 = ls->current->config.rsrc2;
 
 	if (sctx->b.chip_class >= CIK) {
 		assert(lds_size <= 65536);
@@ -178,7 +178,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 	if (sctx->b.chip_class == CIK && sctx->b.family != CHIP_HAWAII)
 		radeon_set_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, ls_rsrc2);
 	radeon_set_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2);
-	radeon_emit(cs, ls->current->rsrc1);
+	radeon_emit(cs, ls->current->config.rsrc1);
 	radeon_emit(cs, ls_rsrc2);
 
 	/* Compute userdata SGPRs. */
@@ -818,7 +818,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 			si_get_draw_start_count(sctx, info, &start, &count);
 			start_offset = start * ib.index_size;
 
-			u_upload_alloc(sctx->b.uploader, start_offset, count * 2,
+			u_upload_alloc(sctx->b.uploader, start_offset, count * 2, 256,
 				       &out_offset, &out_buffer, &ptr);
 			if (!out_buffer) {
 				pipe_resource_reference(&ib.buffer, NULL);
@@ -842,7 +842,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 			start_offset = start * ib.index_size;
 
 			u_upload_data(sctx->b.uploader, start_offset, count * ib.index_size,
-				      (char*)ib.user_buffer + start_offset,
+				      256, (char*)ib.user_buffer + start_offset,
 				      &ib.offset, &ib.buffer);
 			if (!ib.buffer)
 				return;
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 8700590435f..64adf699604 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -111,7 +111,7 @@ static void si_shader_ls(struct si_shader *shader)
 	vgpr_comp_cnt = shader->uses_instanceid ? 3 : 1;
 
 	num_user_sgprs = SI_LS_NUM_USER_SGPR;
-	num_sgprs = shader->num_sgprs;
+	num_sgprs = shader->config.num_sgprs;
 	if (num_user_sgprs > num_sgprs) {
 		/* Last 2 reserved SGPRs are used for VCC */
 		num_sgprs = num_user_sgprs + 2;
@@ -121,12 +121,12 @@ static void si_shader_ls(struct si_shader *shader)
 	si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
 	si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, va >> 40);
 
-	shader->rsrc1 = S_00B528_VGPRS((shader->num_vgprs - 1) / 4) |
+	shader->config.rsrc1 = S_00B528_VGPRS((shader->config.num_vgprs - 1) / 4) |
 			   S_00B528_SGPRS((num_sgprs - 1) / 8) |
 		           S_00B528_VGPR_COMP_CNT(vgpr_comp_cnt) |
 			   S_00B528_DX10_CLAMP(shader->dx10_clamp_mode);
-	shader->rsrc2 = S_00B52C_USER_SGPR(num_user_sgprs) |
-			   S_00B52C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0);
+	shader->config.rsrc2 = S_00B52C_USER_SGPR(num_user_sgprs) |
+			   S_00B52C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
 }
 
 static void si_shader_hs(struct si_shader *shader)
@@ -143,7 +143,7 @@ static void si_shader_hs(struct si_shader *shader)
 	si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER);
 
 	num_user_sgprs = SI_TCS_NUM_USER_SGPR;
-	num_sgprs = shader->num_sgprs;
+	num_sgprs = shader->config.num_sgprs;
 	/* One SGPR after user SGPRs is pre-loaded with tessellation factor
 	 * buffer offset. */
 	if ((num_user_sgprs + 1) > num_sgprs) {
@@ -155,12 +155,12 @@ static void si_shader_hs(struct si_shader *shader)
 	si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_LO_HS, va >> 8);
 	si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS, va >> 40);
 	si_pm4_set_reg(pm4, R_00B428_SPI_SHADER_PGM_RSRC1_HS,
-		       S_00B428_VGPRS((shader->num_vgprs - 1) / 4) |
+		       S_00B428_VGPRS((shader->config.num_vgprs - 1) / 4) |
 		       S_00B428_SGPRS((num_sgprs - 1) / 8) |
 		       S_00B428_DX10_CLAMP(shader->dx10_clamp_mode));
 	si_pm4_set_reg(pm4, R_00B42C_SPI_SHADER_PGM_RSRC2_HS,
 		       S_00B42C_USER_SGPR(num_user_sgprs) |
-		       S_00B42C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
+		       S_00B42C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
 }
 
 static void si_shader_es(struct si_shader *shader)
@@ -187,7 +187,7 @@ static void si_shader_es(struct si_shader *shader)
 	} else
 		unreachable("invalid shader selector type");
 
-	num_sgprs = shader->num_sgprs;
+	num_sgprs = shader->config.num_sgprs;
 	/* One SGPR after user SGPRs is pre-loaded with es2gs_offset */
 	if ((num_user_sgprs + 1) > num_sgprs) {
 		/* Last 2 reserved SGPRs are used for VCC */
@@ -200,13 +200,13 @@ static void si_shader_es(struct si_shader *shader)
 	si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
 	si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, va >> 40);
 	si_pm4_set_reg(pm4, R_00B328_SPI_SHADER_PGM_RSRC1_ES,
-		       S_00B328_VGPRS((shader->num_vgprs - 1) / 4) |
+		       S_00B328_VGPRS((shader->config.num_vgprs - 1) / 4) |
 		       S_00B328_SGPRS((num_sgprs - 1) / 8) |
 		       S_00B328_VGPR_COMP_CNT(vgpr_comp_cnt) |
 		       S_00B328_DX10_CLAMP(shader->dx10_clamp_mode));
 	si_pm4_set_reg(pm4, R_00B32C_SPI_SHADER_PGM_RSRC2_ES,
 		       S_00B32C_USER_SGPR(num_user_sgprs) |
-		       S_00B32C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
+		       S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
 
 	if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
 		si_set_tesseval_regs(shader, pm4);
@@ -272,7 +272,7 @@ static void si_shader_gs(struct si_shader *shader)
 	si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, va >> 40);
 
 	num_user_sgprs = SI_GS_NUM_USER_SGPR;
-	num_sgprs = shader->num_sgprs;
+	num_sgprs = shader->config.num_sgprs;
 	/* Two SGPRs after user SGPRs are pre-loaded with gs2vs_offset, gs_wave_id */
 	if ((num_user_sgprs + 2) > num_sgprs) {
 		/* Last 2 reserved SGPRs are used for VCC */
@@ -281,12 +281,12 @@ static void si_shader_gs(struct si_shader *shader)
 	assert(num_sgprs <= 104);
 
 	si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
-		       S_00B228_VGPRS((shader->num_vgprs - 1) / 4) |
+		       S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) |
 		       S_00B228_SGPRS((num_sgprs - 1) / 8) |
 		       S_00B228_DX10_CLAMP(shader->dx10_clamp_mode));
 	si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
 		       S_00B22C_USER_SGPR(num_user_sgprs) |
-		       S_00B22C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
+		       S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
 }
 
 static void si_shader_vs(struct si_shader *shader)
@@ -329,7 +329,7 @@ static void si_shader_vs(struct si_shader *shader)
 	} else
 		unreachable("invalid shader selector type");
 
-	num_sgprs = shader->num_sgprs;
+	num_sgprs = shader->config.num_sgprs;
 	if (num_user_sgprs > num_sgprs) {
 		/* Last 2 reserved SGPRs are used for VCC */
 		num_sgprs = num_user_sgprs + 2;
@@ -356,7 +356,7 @@ static void si_shader_vs(struct si_shader *shader)
 	si_pm4_set_reg(pm4, R_00B120_SPI_SHADER_PGM_LO_VS, va >> 8);
 	si_pm4_set_reg(pm4, R_00B124_SPI_SHADER_PGM_HI_VS, va >> 40);
 	si_pm4_set_reg(pm4, R_00B128_SPI_SHADER_PGM_RSRC1_VS,
-		       S_00B128_VGPRS((shader->num_vgprs - 1) / 4) |
+		       S_00B128_VGPRS((shader->config.num_vgprs - 1) / 4) |
 		       S_00B128_SGPRS((num_sgprs - 1) / 8) |
 		       S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt) |
 		       S_00B128_DX10_CLAMP(shader->dx10_clamp_mode));
@@ -367,7 +367,7 @@ static void si_shader_vs(struct si_shader *shader)
 		       S_00B12C_SO_BASE2_EN(!!shader->selector->so.stride[2]) |
 		       S_00B12C_SO_BASE3_EN(!!shader->selector->so.stride[3]) |
 		       S_00B12C_SO_EN(!!shader->selector->so.num_outputs) |
-		       S_00B12C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
+		       S_00B12C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
 	if (window_space)
 		si_pm4_set_reg(pm4, R_028818_PA_CL_VTE_CNTL,
 			       S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1));
@@ -387,6 +387,8 @@ static void si_shader_ps(struct si_shader *shader)
 	struct tgsi_shader_info *info = &shader->selector->info;
 	struct si_pm4_state *pm4;
 	unsigned i, spi_ps_in_control;
+	unsigned spi_shader_col_format = 0, cb_shader_mask = 0;
+	unsigned colors_written, export_16bpc;
 	unsigned num_sgprs, num_user_sgprs;
 	unsigned spi_baryc_cntl = 0;
 	uint64_t va;
@@ -422,19 +424,43 @@ static void si_shader_ps(struct si_shader *shader)
 		}
 	}
 
-	has_centroid = G_0286CC_PERSP_CENTROID_ENA(shader->spi_ps_input_ena) ||
-		       G_0286CC_LINEAR_CENTROID_ENA(shader->spi_ps_input_ena);
+	/* Find out what SPI_SHADER_COL_FORMAT and CB_SHADER_MASK should be. */
+	colors_written = info->colors_written;
+	export_16bpc = shader->key.ps.export_16bpc;
+
+	if (info->colors_written == 0x1 &&
+	    info->properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS]) {
+		colors_written |= (1 << (shader->key.ps.last_cbuf + 1)) - 1;
+	}
+
+	while (colors_written) {
+		i = u_bit_scan(&colors_written);
+		if (export_16bpc & (1 << i))
+			spi_shader_col_format |= V_028714_SPI_SHADER_FP16_ABGR << (4 * i);
+		else
+			spi_shader_col_format |= V_028714_SPI_SHADER_32_ABGR << (4 * i);
+		cb_shader_mask |= 0xf << (4 * i);
+	}
+
+	/* Set interpolation controls. */
+	has_centroid = G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena) ||
+		       G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena);
 
 	spi_ps_in_control = S_0286D8_NUM_INTERP(shader->nparam) |
 			    S_0286D8_BC_OPTIMIZE_DISABLE(has_centroid);
 
+	/* Set registers. */
 	si_pm4_set_reg(pm4, R_0286E0_SPI_BARYC_CNTL, spi_baryc_cntl);
 	si_pm4_set_reg(pm4, R_0286D8_SPI_PS_IN_CONTROL, spi_ps_in_control);
 
-	si_pm4_set_reg(pm4, R_028710_SPI_SHADER_Z_FORMAT, shader->spi_shader_z_format);
-	si_pm4_set_reg(pm4, R_028714_SPI_SHADER_COL_FORMAT,
-		       shader->spi_shader_col_format);
-	si_pm4_set_reg(pm4, R_02823C_CB_SHADER_MASK, shader->cb_shader_mask);
+	si_pm4_set_reg(pm4, R_028710_SPI_SHADER_Z_FORMAT,
+		       info->writes_samplemask ? V_028710_SPI_SHADER_32_ABGR :
+		       info->writes_stencil ? V_028710_SPI_SHADER_32_GR :
+		       info->writes_z ? V_028710_SPI_SHADER_32_R :
+		       V_028710_SPI_SHADER_ZERO);
+
+	si_pm4_set_reg(pm4, R_028714_SPI_SHADER_COL_FORMAT, spi_shader_col_format);
+	si_pm4_set_reg(pm4, R_02823C_CB_SHADER_MASK, cb_shader_mask);
 
 	va = shader->bo->gpu_address;
 	si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER);
@@ -442,7 +468,7 @@ static void si_shader_ps(struct si_shader *shader)
 	si_pm4_set_reg(pm4, R_00B024_SPI_SHADER_PGM_HI_PS, va >> 40);
 
 	num_user_sgprs = SI_PS_NUM_USER_SGPR;
-	num_sgprs = shader->num_sgprs;
+	num_sgprs = shader->config.num_sgprs;
 	/* One SGPR after user SGPRs is pre-loaded with {prim_mask, lds_offset} */
 	if ((num_user_sgprs + 1) > num_sgprs) {
 		/* Last 2 reserved SGPRs are used for VCC */
@@ -451,13 +477,13 @@ static void si_shader_ps(struct si_shader *shader)
 	assert(num_sgprs <= 104);
 
 	si_pm4_set_reg(pm4, R_00B028_SPI_SHADER_PGM_RSRC1_PS,
-		       S_00B028_VGPRS((shader->num_vgprs - 1) / 4) |
+		       S_00B028_VGPRS((shader->config.num_vgprs - 1) / 4) |
 		       S_00B028_SGPRS((num_sgprs - 1) / 8) |
 		       S_00B028_DX10_CLAMP(shader->dx10_clamp_mode));
 	si_pm4_set_reg(pm4, R_00B02C_SPI_SHADER_PGM_RSRC2_PS,
-		       S_00B02C_EXTRA_LDS_SIZE(shader->lds_size) |
+		       S_00B02C_EXTRA_LDS_SIZE(shader->config.lds_size) |
 		       S_00B02C_USER_SGPR(num_user_sgprs) |
-		       S_00B32C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
+		       S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
 }
 
 static void si_shader_init_pm4_state(struct si_shader *shader)
@@ -496,6 +522,16 @@ static void si_shader_init_pm4_state(struct si_shader *shader)
 	}
 }
 
+static unsigned si_get_alpha_test_func(struct si_context *sctx)
+{
+	/* Alpha-test should be disabled if colorbuffer 0 is integer. */
+	if (sctx->queued.named.dsa &&
+	    !sctx->framebuffer.cb0_is_integer)
+		return sctx->queued.named.dsa->alpha_func;
+
+	return PIPE_FUNC_ALWAYS;
+}
+
 /* Compute the key for the hw shader variant */
 static inline void si_shader_selector_key(struct pipe_context *ctx,
 					  struct si_shader_selector *sel,
@@ -537,8 +573,10 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 	case PIPE_SHADER_FRAGMENT: {
 		struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
 
-		if (sel->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS])
+		if (sel->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
+		    sel->info.colors_written == 0x1)
 			key->ps.last_cbuf = MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1;
+
 		key->ps.export_16bpc = sctx->framebuffer.export_16bpc;
 
 		if (rs) {
@@ -562,11 +600,7 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 			key->ps.clamp_color = rs->clamp_fragment_color;
 		}
 
-		key->ps.alpha_func = PIPE_FUNC_ALWAYS;
-		/* Alpha-test should be disabled if colorbuffer 0 is integer. */
-		if (sctx->queued.named.dsa &&
-		    !sctx->framebuffer.cb0_is_integer)
-			key->ps.alpha_func = sctx->queued.named.dsa->alpha_func;
+		key->ps.alpha_func = si_get_alpha_test_func(sctx);
 		break;
 	}
 	default:
@@ -616,7 +650,7 @@ static int si_shader_select(struct pipe_context *ctx,
 	shader->selector = sel;
 	shader->key = key;
 
-	r = si_shader_create(sctx->screen, sctx->tm, shader);
+	r = si_shader_create(sctx->screen, sctx->tm, shader, &sctx->b.debug);
 	if (unlikely(r)) {
 		R600_ERR("Failed to build shader variant (type=%u) %d\n",
 			 sel->type, r);
@@ -731,6 +765,25 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
 		break;
 	}
 
+	/* DB_SHADER_CONTROL */
+	sel->db_shader_control =
+		S_02880C_Z_EXPORT_ENABLE(sel->info.writes_z) |
+		S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(sel->info.writes_stencil) |
+		S_02880C_MASK_EXPORT_ENABLE(sel->info.writes_samplemask) |
+		S_02880C_KILL_ENABLE(sel->info.uses_kill);
+
+	switch (sel->info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT]) {
+	case TGSI_FS_DEPTH_LAYOUT_GREATER:
+		sel->db_shader_control |=
+			S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_GREATER_THAN_Z);
+		break;
+	case TGSI_FS_DEPTH_LAYOUT_LESS:
+		sel->db_shader_control |=
+			S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_LESS_THAN_Z);
+		break;
+	}
+
+	/* Pre-compilation. */
 	if (sscreen->b.debug_flags & DBG_PRECOMPILE) {
 		struct si_shader_ctx_state state = {sel};
 
@@ -987,7 +1040,7 @@ static void si_emit_spi_ps_input(struct si_context *sctx, struct r600_atom *atom
 	if (!ps)
 		return;
 
-	input_ena = ps->spi_ps_input_ena;
+	input_ena = ps->config.spi_ps_input_ena;
 
 	/* we need to enable at least one of them, otherwise we hang the GPU */
 	assert(G_0286CC_PERSP_SAMPLE_ENA(input_ena) ||
@@ -1216,7 +1269,7 @@ static int si_update_scratch_buffer(struct si_context *sctx,
 		return 0;
 
 	/* This shader doesn't need a scratch buffer */
-	if (shader->scratch_bytes_per_wave == 0)
+	if (shader->config.scratch_bytes_per_wave == 0)
 		return 0;
 
 	/* This shader is already configured to use the current
@@ -1248,7 +1301,7 @@ static unsigned si_get_current_scratch_buffer_size(struct si_context *sctx)
 
 static unsigned si_get_scratch_buffer_bytes_per_wave(struct si_shader *shader)
 {
-	return shader ? shader->scratch_bytes_per_wave : 0;
+	return shader ? shader->config.scratch_bytes_per_wave : 0;
 }
 
 static unsigned si_get_max_scratch_bytes_per_wave(struct si_context *sctx)
@@ -1549,6 +1602,10 @@ bool si_update_shaders(struct si_context *sctx)
 	si_update_vgt_shader_config(sctx);
 
 	if (sctx->ps_shader.cso) {
+		unsigned db_shader_control =
+			sctx->ps_shader.cso->db_shader_control |
+			S_02880C_KILL_ENABLE(si_get_alpha_test_func(sctx) != PIPE_FUNC_ALWAYS);
+
 		r = si_shader_select(ctx, &sctx->ps_shader);
 		if (r)
 			return false;
@@ -1568,8 +1625,8 @@ bool si_update_shaders(struct si_context *sctx)
 			si_mark_atom_dirty(sctx, &sctx->spi_ps_input);
 		}
 
-		if (sctx->ps_db_shader_control != sctx->ps_shader.current->db_shader_control) {
-			sctx->ps_db_shader_control = sctx->ps_shader.current->db_shader_control;
+		if (sctx->ps_db_shader_control != db_shader_control) {
+			sctx->ps_db_shader_control = db_shader_control;
 			si_mark_atom_dirty(sctx, &sctx->db_render_state);
 		}
 
diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h
index 8e5e24217a5..d5c4aaae638 100644
--- a/src/gallium/drivers/softpipe/sp_context.h
+++ b/src/gallium/drivers/softpipe/sp_context.h
@@ -37,6 +37,7 @@
 #include "draw/draw_vertex.h"
 
 #include "sp_quad_pipe.h"
+#include "sp_setup.h"
 
 
 /** Do polygon stipple in the draw module? */
@@ -117,17 +118,17 @@ struct softpipe_context {
    unsigned const_buffer_size[PIPE_SHADER_TYPES][PIPE_MAX_CONSTANT_BUFFERS];
 
    /** Vertex format */
+   struct sp_setup_info setup_info;
    struct vertex_info vertex_info;
-   struct vertex_info vertex_info_vbuf;
 
    /** Which vertex shader output slot contains point size */
-   int psize_slot;
+   int8_t psize_slot;
 
    /** Which vertex shader output slot contains viewport index */
-   int viewport_index_slot;
+   int8_t viewport_index_slot;
 
    /** Which vertex shader output slot contains layer */
-   int layer_slot;
+   int8_t layer_slot;
 
    /** The reduced version of the primitive supplied by the state tracker */
    unsigned reduced_api_prim;
diff --git a/src/gallium/drivers/softpipe/sp_prim_vbuf.c b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
index f8a3eacdb37..95d1ac1514f 100644
--- a/src/gallium/drivers/softpipe/sp_prim_vbuf.c
+++ b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
@@ -161,7 +161,7 @@ sp_vbuf_draw_elements(struct vbuf_render *vbr, const ushort *indices, uint nr)
 {
    struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr);
    struct softpipe_context *softpipe = cvbr->softpipe;
-   const unsigned stride = softpipe->vertex_info_vbuf.size * sizeof(float);
+   const unsigned stride = softpipe->vertex_info.size * sizeof(float);
    const void *vertex_buffer = cvbr->vertex_buffer;
    struct setup_context *setup = cvbr->setup;
    const boolean flatshade_first = softpipe->rasterizer->flatshade_first;
@@ -358,7 +358,7 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
    struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr);
    struct softpipe_context *softpipe = cvbr->softpipe;
    struct setup_context *setup = cvbr->setup;
-   const unsigned stride = softpipe->vertex_info_vbuf.size * sizeof(float);
+   const unsigned stride = softpipe->vertex_info.size * sizeof(float);
    const void *vertex_buffer =
       (void *) get_vert(cvbr->vertex_buffer, start, stride);
    const boolean flatshade_first = softpipe->rasterizer->flatshade_first;
diff --git a/src/gallium/drivers/softpipe/sp_query.c b/src/gallium/drivers/softpipe/sp_query.c
index 76105b4c0ec..c28d28d5f5d 100644
--- a/src/gallium/drivers/softpipe/sp_query.c
+++ b/src/gallium/drivers/softpipe/sp_query.c
@@ -223,7 +223,7 @@ softpipe_get_query_result(struct pipe_context *pipe,
       break;
    case PIPE_QUERY_PIPELINE_STATISTICS:
       memcpy(vresult, &sq->stats,
-             sizeof(struct pipe_query_data_pipeline_statistics));;
+             sizeof(struct pipe_query_data_pipeline_statistics));
       break;
    case PIPE_QUERY_GPU_FINISHED:
       vresult->b = TRUE;
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index 9939720e259..29e392b94e8 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -251,6 +251,13 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_CLEAR_TEXTURE:
+   case PIPE_CAP_DRAW_PARAMETERS:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
+   case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
       return 0;
    }
    /* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/softpipe/sp_setup.c b/src/gallium/drivers/softpipe/sp_setup.c
index ac2d97825ce..ffe49260b9a 100644
--- a/src/gallium/drivers/softpipe/sp_setup.c
+++ b/src/gallium/drivers/softpipe/sp_setup.c
@@ -38,7 +38,6 @@
 #include "sp_setup.h"
 #include "sp_state.h"
 #include "draw/draw_context.h"
-#include "draw/draw_vertex.h"
 #include "pipe/p_shader_tokens.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
@@ -599,10 +598,12 @@ setup_tri_coefficients(struct setup_context *setup)
 {
    struct softpipe_context *softpipe = setup->softpipe;
    const struct tgsi_shader_info *fsInfo = &setup->softpipe->fs_variant->info;
-   const struct vertex_info *vinfo = softpipe_get_vertex_info(softpipe);
+   const struct sp_setup_info *sinfo = &softpipe->setup_info;
    uint fragSlot;
    float v[3];
 
+   assert(sinfo->valid);
+
    /* z and w are done by linear interpolation:
     */
    v[0] = setup->vmin[0][2];
@@ -618,15 +619,16 @@ setup_tri_coefficients(struct setup_context *setup)
    /* setup interpolation for all the remaining attributes:
     */
    for (fragSlot = 0; fragSlot < fsInfo->num_inputs; fragSlot++) {
-      const uint vertSlot = vinfo->attrib[fragSlot].src_index;
+      const uint vertSlot = sinfo->attrib[fragSlot].src_index;
       uint j;
 
-      switch (vinfo->attrib[fragSlot].interp_mode) {
-      case INTERP_CONSTANT:
-         for (j = 0; j < TGSI_NUM_CHANNELS; j++)
+      switch (sinfo->attrib[fragSlot].interp) {
+      case SP_INTERP_CONSTANT:
+         for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
             const_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
+         }
          break;
-      case INTERP_LINEAR:
+      case SP_INTERP_LINEAR:
          for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
             tri_apply_cylindrical_wrap(setup->vmin[vertSlot][j],
                                        setup->vmid[vertSlot][j],
@@ -636,7 +638,7 @@ setup_tri_coefficients(struct setup_context *setup)
             tri_linear_coeff(setup, &setup->coef[fragSlot], j, v);
          }
          break;
-      case INTERP_PERSPECTIVE:
+      case SP_INTERP_PERSPECTIVE:
          for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
             tri_apply_cylindrical_wrap(setup->vmin[vertSlot][j],
                                        setup->vmid[vertSlot][j],
@@ -646,7 +648,7 @@ setup_tri_coefficients(struct setup_context *setup)
             tri_persp_coeff(setup, &setup->coef[fragSlot], j, v);
          }
          break;
-      case INTERP_POS:
+      case SP_INTERP_POS:
          setup_fragcoord_coeff(setup, fragSlot);
          break;
       default:
@@ -966,11 +968,13 @@ setup_line_coefficients(struct setup_context *setup,
 {
    struct softpipe_context *softpipe = setup->softpipe;
    const struct tgsi_shader_info *fsInfo = &setup->softpipe->fs_variant->info;
-   const struct vertex_info *vinfo = softpipe_get_vertex_info(softpipe);
+   const struct sp_setup_info *sinfo = &softpipe->setup_info;
    uint fragSlot;
    float area;
    float v[2];
 
+   assert(sinfo->valid);
+
    /* use setup->vmin, vmax to point to vertices */
    if (softpipe->rasterizer->flatshade_first)
       setup->vprovoke = v0;
@@ -1001,15 +1005,15 @@ setup_line_coefficients(struct setup_context *setup,
    /* setup interpolation for all the remaining attributes:
     */
    for (fragSlot = 0; fragSlot < fsInfo->num_inputs; fragSlot++) {
-      const uint vertSlot = vinfo->attrib[fragSlot].src_index;
+      const uint vertSlot = sinfo->attrib[fragSlot].src_index;
       uint j;
 
-      switch (vinfo->attrib[fragSlot].interp_mode) {
-      case INTERP_CONSTANT:
+      switch (sinfo->attrib[fragSlot].interp) {
+      case SP_INTERP_CONSTANT:
          for (j = 0; j < TGSI_NUM_CHANNELS; j++)
             const_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
          break;
-      case INTERP_LINEAR:
+      case SP_INTERP_LINEAR:
          for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
             line_apply_cylindrical_wrap(setup->vmin[vertSlot][j],
                                         setup->vmax[vertSlot][j],
@@ -1018,7 +1022,7 @@ setup_line_coefficients(struct setup_context *setup,
             line_linear_coeff(setup, &setup->coef[fragSlot], j, v);
          }
          break;
-      case INTERP_PERSPECTIVE:
+      case SP_INTERP_PERSPECTIVE:
          for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
             line_apply_cylindrical_wrap(setup->vmin[vertSlot][j],
                                         setup->vmax[vertSlot][j],
@@ -1027,7 +1031,7 @@ setup_line_coefficients(struct setup_context *setup,
             line_persp_coeff(setup, &setup->coef[fragSlot], j, v);
          }
          break;
-      case INTERP_POS:
+      case SP_INTERP_POS:
          setup_fragcoord_coeff(setup, fragSlot);
          break;
       default:
@@ -1236,7 +1240,7 @@ sp_setup_point(struct setup_context *setup,
    const boolean round = (boolean) setup->softpipe->rasterizer->point_smooth;
    const float x = v0[0][0];  /* Note: data[0] is always position */
    const float y = v0[0][1];
-   const struct vertex_info *vinfo = softpipe_get_vertex_info(softpipe);
+   const struct sp_setup_info *sinfo = &softpipe->setup_info;
    uint fragSlot;
    uint layer = 0;
    unsigned viewport_index = 0;
@@ -1245,6 +1249,8 @@ sp_setup_point(struct setup_context *setup,
    print_vertex(setup, v0);
 #endif
 
+   assert(sinfo->valid);
+
    if (setup->softpipe->no_rast || setup->softpipe->rasterizer->rasterizer_discard)
       return;
 
@@ -1285,22 +1291,22 @@ sp_setup_point(struct setup_context *setup,
    const_coeff(setup, &setup->posCoef, 0, 3);
 
    for (fragSlot = 0; fragSlot < fsInfo->num_inputs; fragSlot++) {
-      const uint vertSlot = vinfo->attrib[fragSlot].src_index;
+      const uint vertSlot = sinfo->attrib[fragSlot].src_index;
       uint j;
 
-      switch (vinfo->attrib[fragSlot].interp_mode) {
-      case INTERP_CONSTANT:
+      switch (sinfo->attrib[fragSlot].interp) {
+      case SP_INTERP_CONSTANT:
          /* fall-through */
-      case INTERP_LINEAR:
+      case SP_INTERP_LINEAR:
          for (j = 0; j < TGSI_NUM_CHANNELS; j++)
             const_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
          break;
-      case INTERP_PERSPECTIVE:
+      case SP_INTERP_PERSPECTIVE:
          for (j = 0; j < TGSI_NUM_CHANNELS; j++)
             point_persp_coeff(setup, setup->vprovoke,
                               &setup->coef[fragSlot], vertSlot, j);
          break;
-      case INTERP_POS:
+      case SP_INTERP_POS:
          setup_fragcoord_coeff(setup, fragSlot);
          break;
       default:
diff --git a/src/gallium/drivers/softpipe/sp_setup.h b/src/gallium/drivers/softpipe/sp_setup.h
index 191494acbb8..a54dc5dad0c 100644
--- a/src/gallium/drivers/softpipe/sp_setup.h
+++ b/src/gallium/drivers/softpipe/sp_setup.h
@@ -30,11 +30,30 @@
 struct setup_context;
 struct softpipe_context;
 
+/**
+ * Attribute interpolation mode
+ */
+enum sp_interp_mode {
+   SP_INTERP_POS,       /**< special case for frag position */
+   SP_INTERP_CONSTANT,
+   SP_INTERP_LINEAR,
+   SP_INTERP_PERSPECTIVE
+};
+
+
+struct sp_setup_info {
+   unsigned valid;
+   struct {
+      unsigned interp:8;      /**< SP_INTERP_X */
+      int src_index:8;
+   } attrib[PIPE_MAX_SHADER_OUTPUTS];
+};
+
 void 
-sp_setup_tri( struct setup_context *setup,
-	   const float (*v0)[4],
-	   const float (*v1)[4],
-	   const float (*v2)[4] );
+sp_setup_tri(struct setup_context *setup,
+             const float (*v0)[4],
+             const float (*v1)[4],
+             const float (*v2)[4]);
 
 void
 sp_setup_line(struct setup_context *setup,
diff --git a/src/gallium/drivers/softpipe/sp_state.h b/src/gallium/drivers/softpipe/sp_state.h
index c35534c931d..16a2897f526 100644
--- a/src/gallium/drivers/softpipe/sp_state.h
+++ b/src/gallium/drivers/softpipe/sp_state.h
@@ -175,9 +175,6 @@ softpipe_unmap_texture_surfaces(struct softpipe_context *sp);
 
 
 struct vertex_info *
-softpipe_get_vertex_info(struct softpipe_context *softpipe);
-
-struct vertex_info *
 softpipe_get_vbuf_vertex_info(struct softpipe_context *softpipe);
 
 
diff --git a/src/gallium/drivers/softpipe/sp_state_derived.c b/src/gallium/drivers/softpipe/sp_state_derived.c
index 7e998af1325..d4d03f1be50 100644
--- a/src/gallium/drivers/softpipe/sp_state_derived.c
+++ b/src/gallium/drivers/softpipe/sp_state_derived.c
@@ -48,7 +48,7 @@
 static void
 invalidate_vertex_layout(struct softpipe_context *softpipe)
 {
-   softpipe->vertex_info.num_attribs =  0;
+   softpipe->setup_info.valid =  0;
 }
 
 
@@ -57,50 +57,64 @@ invalidate_vertex_layout(struct softpipe_context *softpipe)
  * (simple float[][4]) used by the 'draw' module into vertices for
  * rasterization.
  *
- * This function validates the vertex layout and returns a pointer to a
- * vertex_info object.
+ * This function validates the vertex layout.
  */
-struct vertex_info *
-softpipe_get_vertex_info(struct softpipe_context *softpipe)
+static void
+softpipe_compute_vertex_info(struct softpipe_context *softpipe)
 {
-   struct vertex_info *vinfo = &softpipe->vertex_info;
-   int vs_index;
+   struct sp_setup_info *sinfo = &softpipe->setup_info;
 
-   if (vinfo->num_attribs == 0) {
-      /* compute vertex layout now */
+   if (sinfo->valid == 0) {
       const struct tgsi_shader_info *fsInfo = &softpipe->fs_variant->info;
-      struct vertex_info *vinfo_vbuf = &softpipe->vertex_info_vbuf;
-      const uint num = draw_num_shader_outputs(softpipe->draw);
+      struct vertex_info *vinfo = &softpipe->vertex_info;
       uint i;
-
-      /* Tell draw_vbuf to simply emit the whole post-xform vertex
-       * as-is.  No longer any need to try and emit draw vertex_header
-       * info.
+      int vs_index;
+      /*
+       * This doesn't quite work right (wrt face injection, prim id,
+       * wide points) - hit a couple assertions, misrenderings plus
+       * memory corruption. Albeit could fix (the former two) by calling
+       * this "more often" (rasterizer changes etc.). (The latter would
+       * need to be included in draw_prepare_shader_outputs, but it looks
+       * like that would potentially allocate quite some unused additional
+       * vertex outputs.)
+       * draw_prepare_shader_outputs(softpipe->draw);
        */
-      vinfo_vbuf->num_attribs = 0;
-      for (i = 0; i < num; i++) {
-	 draw_emit_vertex_attr(vinfo_vbuf, EMIT_4F, INTERP_PERSPECTIVE, i);
-      }
-      draw_compute_vertex_size(vinfo_vbuf);
 
       /*
-       * Loop over fragment shader inputs, searching for the matching output
-       * from the vertex shader.
+       * Those can't actually be 0 (because pos is always at 0).
+       * But use ints anyway to avoid confusion (in vs outputs, they
+       * can very well be at pos 0).
        */
+      softpipe->viewport_index_slot = -1;
+      softpipe->layer_slot = -1;
+      softpipe->psize_slot = -1;
+
       vinfo->num_attribs = 0;
+
+      /*
+       * Put position always first (setup needs it there).
+       */
+      vs_index = draw_find_shader_output(softpipe->draw,
+                                         TGSI_SEMANTIC_POSITION, 0);
+
+      draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
+
+      /*
+       * Match FS inputs against VS outputs, emitting the necessary
+       * attributes.
+       */
       for (i = 0; i < fsInfo->num_inputs; i++) {
-         int src;
-         enum interp_mode interp = INTERP_LINEAR;
+         enum sp_interp_mode interp = SP_INTERP_LINEAR;
 
          switch (fsInfo->input_interpolate[i]) {
          case TGSI_INTERPOLATE_CONSTANT:
-            interp = INTERP_CONSTANT;
+            interp = SP_INTERP_CONSTANT;
             break;
          case TGSI_INTERPOLATE_LINEAR:
-            interp = INTERP_LINEAR;
+            interp = SP_INTERP_LINEAR;
             break;
          case TGSI_INTERPOLATE_PERSPECTIVE:
-            interp = INTERP_PERSPECTIVE;
+            interp = SP_INTERP_PERSPECTIVE;
             break;
          case TGSI_INTERPOLATE_COLOR:
             assert(fsInfo->input_semantic_name[i] == TGSI_SEMANTIC_COLOR);
@@ -111,88 +125,121 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe)
 
          switch (fsInfo->input_semantic_name[i]) {
          case TGSI_SEMANTIC_POSITION:
-            interp = INTERP_POS;
+            interp = SP_INTERP_POS;
             break;
 
          case TGSI_SEMANTIC_COLOR:
             if (fsInfo->input_interpolate[i] == TGSI_INTERPOLATE_COLOR) {
                if (softpipe->rasterizer->flatshade)
-                  interp = INTERP_CONSTANT;
+                  interp = SP_INTERP_CONSTANT;
                else
-                  interp = INTERP_PERSPECTIVE;
+                  interp = SP_INTERP_PERSPECTIVE;
             }
             break;
          }
 
-         /* this includes texcoords and varying vars */
-         src = draw_find_shader_output(softpipe->draw,
-                                       fsInfo->input_semantic_name[i],
-                                       fsInfo->input_semantic_index[i]);
-	 if (fsInfo->input_semantic_name[i] == TGSI_SEMANTIC_COLOR && src == -1)
-	   /* try and find a bcolor */
-	   src = draw_find_shader_output(softpipe->draw,
-					 TGSI_SEMANTIC_BCOLOR, fsInfo->input_semantic_index[i]);
+         /*
+          * Search for each input in current vs output:
+          */
+         vs_index = draw_find_shader_output(softpipe->draw,
+                                            fsInfo->input_semantic_name[i],
+                                            fsInfo->input_semantic_index[i]);
+
+         if (fsInfo->input_semantic_name[i] == TGSI_SEMANTIC_COLOR &&
+             vs_index == -1) {
+            /*
+             * try and find a bcolor.
+             * Note that if there's both front and back color, draw will
+             * have copied back to front color already.
+             */
+            vs_index = draw_find_shader_output(softpipe->draw,
+                                               TGSI_SEMANTIC_BCOLOR,
+                                               fsInfo->input_semantic_index[i]);
+         }
 
-         draw_emit_vertex_attr(vinfo, EMIT_4F, interp, src);
+         sinfo->attrib[i].interp = interp;
+         /* extremely pointless index map */
+         sinfo->attrib[i].src_index = i + 1;
+         /*
+          * For vp index and layer, if the fs requires them but the vs doesn't
+          * provide them, draw (vbuf) will give us the required 0 (slot -1).
+          * (This means in this case we'll also use those slots in setup, which
+          * isn't necessary but they'll contain the correct (0) value.)
+          */
+         if (fsInfo->input_semantic_name[i] ==
+                    TGSI_SEMANTIC_VIEWPORT_INDEX) {
+            softpipe->viewport_index_slot = (int)vinfo->num_attribs;
+            draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
+         } else if (fsInfo->input_semantic_name[i] == TGSI_SEMANTIC_LAYER) {
+            softpipe->layer_slot = (int)vinfo->num_attribs;
+            draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
+            /*
+             * Note that we'd actually want to skip position (as we won't use
+             * the attribute in the fs) but can't. The reason is that we don't
+             * actually have a input/output map for setup (even though it looks
+             * like we do...). Could adjust for this though even without a map.
+             */
+         } else {
+            /*
+             * Note that we'd actually want to skip position (as we won't use
+             * the attribute in the fs) but can't. The reason is that we don't
+             * actually have a input/output map for setup (even though it looks
+             * like we do...). Could adjust for this though even without a map.
+             */
+            draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
+         }
       }
 
-      /* Figure out if we need pointsize as well. */
+      /* Figure out if we need pointsize as well.
+       */
       vs_index = draw_find_shader_output(softpipe->draw,
                                          TGSI_SEMANTIC_PSIZE, 0);
 
       if (vs_index >= 0) {
-         softpipe->psize_slot = vinfo->num_attribs;
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
+         softpipe->psize_slot = (int)vinfo->num_attribs;
+         draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
       }
 
-      /* Figure out if we need viewport index */
-      vs_index = draw_find_shader_output(softpipe->draw,
-                                         TGSI_SEMANTIC_VIEWPORT_INDEX,
-                                         0);
-      if (vs_index >= 0) {
-         softpipe->viewport_index_slot = vinfo->num_attribs;
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
-      } else {
-         softpipe->viewport_index_slot = 0;
+      /* Figure out if we need viewport index (if it wasn't already in fs input) */
+      if (softpipe->viewport_index_slot < 0) {
+         vs_index = draw_find_shader_output(softpipe->draw,
+                                            TGSI_SEMANTIC_VIEWPORT_INDEX,
+                                            0);
+         if (vs_index >= 0) {
+            softpipe->viewport_index_slot =(int)vinfo->num_attribs;
+            draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
+         }
       }
 
-      /* Figure out if we need layer */
-      vs_index = draw_find_shader_output(softpipe->draw,
-                                         TGSI_SEMANTIC_LAYER,
-                                         0);
-      if (vs_index >= 0) {
-         softpipe->layer_slot = vinfo->num_attribs;
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
-      } else {
-         softpipe->layer_slot = 0;
+      /* Figure out if we need layer (if it wasn't already in fs input) */
+      if (softpipe->layer_slot < 0) {
+         vs_index = draw_find_shader_output(softpipe->draw,
+                                            TGSI_SEMANTIC_LAYER,
+                                            0);
+         if (vs_index >= 0) {
+            softpipe->layer_slot = (int)vinfo->num_attribs;
+            draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
+         }
       }
 
       draw_compute_vertex_size(vinfo);
+      softpipe->setup_info.valid = 1;
    }
-
-   return vinfo;
+   return;
 }
 
 
 /**
  * Called from vbuf module.
  *
- * Note that there's actually two different vertex layouts in softpipe.
- *
- * The normal one is computed in softpipe_get_vertex_info() above and is
- * used by the point/line/tri "setup" code.
- *
- * The other one (this one) is only used by the vbuf module (which is
- * not normally used by default but used in testing).  For the vbuf module,
- * we basically want to pass-through the draw module's vertex layout as-is.
- * When the softpipe vbuf code begins drawing, the normal vertex layout
- * will come into play again.
+ * This will trigger validation of the vertex layout (and also compute
+ * the required information for setup).
  */
 struct vertex_info *
 softpipe_get_vbuf_vertex_info(struct softpipe_context *softpipe)
 {
-   (void) softpipe_get_vertex_info(softpipe);
-   return &softpipe->vertex_info_vbuf;
+   softpipe_compute_vertex_info(softpipe);
+   return &softpipe->vertex_info;
 }
 
 
diff --git a/src/gallium/drivers/softpipe/sp_state_shader.c b/src/gallium/drivers/softpipe/sp_state_shader.c
index dce0404de5b..f0d66a53ec6 100644
--- a/src/gallium/drivers/softpipe/sp_state_shader.c
+++ b/src/gallium/drivers/softpipe/sp_state_shader.c
@@ -64,7 +64,8 @@ create_fs_variant(struct softpipe_context *softpipe,
          /* get new shader that implements polygon stippling */
          var->tokens = 
             util_pstipple_create_fragment_shader(curfs->tokens,
-                                                 &var->stipple_sampler_unit, 0);
+                                                 &var->stipple_sampler_unit, 0,
+                                                 TGSI_FILE_INPUT);
       }
       else
 #endif
diff --git a/src/gallium/drivers/svga/svga_cmd.c b/src/gallium/drivers/svga/svga_cmd.c
index 10442cb46e7..e45b3e72aeb 100644
--- a/src/gallium/drivers/svga/svga_cmd.c
+++ b/src/gallium/drivers/svga/svga_cmd.c
@@ -337,7 +337,7 @@ SVGA3D_DefineSurface2D(struct svga_winsys_context *swc,    // IN
    mipSizes[0].height = height;
    mipSizes[0].depth = 1;
 
-   swc->commit(swc);;
+   swc->commit(swc);
 
    return PIPE_OK;
 }
@@ -372,7 +372,7 @@ SVGA3D_DestroySurface(struct svga_winsys_context *swc,
    
    swc->surface_relocation(swc, &cmd->sid, NULL, sid,
                            SVGA_RELOC_WRITE | SVGA_RELOC_INTERNAL);
-   swc->commit(swc);;
+   swc->commit(swc);
 
    return PIPE_OK;
 }
@@ -473,6 +473,7 @@ SVGA3D_SurfaceDMA(struct svga_winsys_context *swc,
    pSuffix->flags = flags;
 
    swc->commit(swc);
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
 
    return PIPE_OK;
 }
@@ -543,6 +544,7 @@ SVGA3D_BufferDMA(struct svga_winsys_context *swc,
    pSuffix->flags = flags;
 
    swc->commit(swc);
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
 
    return PIPE_OK;
 }
@@ -1016,7 +1018,7 @@ SVGA3D_BeginDrawPrimitives(struct svga_winsys_context *swc,
    *decls = declArray;
    *ranges = rangeArray;
 
-   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
 
    return PIPE_OK;
 }
@@ -1720,6 +1722,7 @@ SVGA3D_UpdateGBImage(struct svga_winsys_context *swc,
    cmd->box = *box;
 
    swc->commit(swc);
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
 
    return PIPE_OK;
 }
@@ -1746,6 +1749,7 @@ SVGA3D_UpdateGBSurface(struct svga_winsys_context *swc,
                            SVGA_RELOC_WRITE | SVGA_RELOC_INTERNAL);
 
    swc->commit(swc);
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
 
    return PIPE_OK;
 }
@@ -1775,6 +1779,7 @@ SVGA3D_ReadbackGBImage(struct svga_winsys_context *swc,
    cmd->image.mipmap = mipLevel;
 
    swc->commit(swc);
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
 
    return PIPE_OK;
 }
@@ -1801,6 +1806,7 @@ SVGA3D_ReadbackGBSurface(struct svga_winsys_context *swc,
                            SVGA_RELOC_READ | SVGA_RELOC_INTERNAL);
 
    swc->commit(swc);
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
 
    return PIPE_OK;
 }
@@ -1829,6 +1835,7 @@ SVGA3D_ReadbackGBImagePartial(struct svga_winsys_context *swc,
    cmd->invertBox = invertBox;
 
    swc->commit(swc);
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
 
    return PIPE_OK;
 }
diff --git a/src/gallium/drivers/svga/svga_cmd_vgpu10.c b/src/gallium/drivers/svga/svga_cmd_vgpu10.c
index 5c121089f91..4cd9d5b9d1e 100644
--- a/src/gallium/drivers/svga/svga_cmd_vgpu10.c
+++ b/src/gallium/drivers/svga/svga_cmd_vgpu10.c
@@ -535,7 +535,7 @@ SVGA3D_vgpu10_Draw(struct svga_winsys_context *swc,
 
    SVGA3D_COPY_BASIC_2(vertexCount, startVertexLocation);
 
-   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
    swc->commit(swc);
    return PIPE_OK;
 }
@@ -551,7 +551,7 @@ SVGA3D_vgpu10_DrawIndexed(struct svga_winsys_context *swc,
    SVGA3D_COPY_BASIC_3(indexCount, startIndexLocation,
                        baseVertexLocation);
 
-   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
    swc->commit(swc);
    return PIPE_OK;
 }
@@ -568,7 +568,7 @@ SVGA3D_vgpu10_DrawInstanced(struct svga_winsys_context *swc,
    SVGA3D_COPY_BASIC_4(vertexCountPerInstance, instanceCount,
                        startVertexLocation, startInstanceLocation);
 
-   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
    swc->commit(swc);
    return PIPE_OK;
 }
@@ -588,7 +588,7 @@ SVGA3D_vgpu10_DrawIndexedInstanced(struct svga_winsys_context *swc,
                        startInstanceLocation);
 
 
-   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
    swc->commit(swc);
    return PIPE_OK;
 }
@@ -598,7 +598,7 @@ SVGA3D_vgpu10_DrawAuto(struct svga_winsys_context *swc)
 {
    SVGA3D_CREATE_COMMAND(DrawAuto, DRAW_AUTO);
 
-   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
    swc->commit(swc);
    return PIPE_OK;
 }
diff --git a/src/gallium/drivers/svga/svga_context.c b/src/gallium/drivers/svga/svga_context.c
index d407785ddd9..b10eb45e548 100644
--- a/src/gallium/drivers/svga/svga_context.c
+++ b/src/gallium/drivers/svga/svga_context.c
@@ -46,7 +46,6 @@
 #include "svga_winsys.h"
 
 #define CONST0_UPLOAD_DEFAULT_SIZE 65536
-#define CONST0_UPLOAD_ALIGNMENT 256
 
 DEBUG_GET_ONCE_BOOL_OPTION(no_swtnl, "SVGA_NO_SWTNL", FALSE)
 DEBUG_GET_ONCE_BOOL_OPTION(force_swtnl, "SVGA_FORCE_SWTNL", FALSE);
@@ -220,8 +219,8 @@ struct pipe_context *svga_context_create(struct pipe_screen *screen,
 
    svga->const0_upload = u_upload_create(&svga->pipe,
                                          CONST0_UPLOAD_DEFAULT_SIZE,
-                                         CONST0_UPLOAD_ALIGNMENT,
-                                         PIPE_BIND_CONSTANT_BUFFER);
+                                         PIPE_BIND_CONSTANT_BUFFER,
+                                         PIPE_USAGE_STREAM);
    if (!svga->const0_upload)
       goto cleanup;
 
diff --git a/src/gallium/drivers/svga/svga_context.h b/src/gallium/drivers/svga/svga_context.h
index 78e346a92b9..e4f29b8497e 100644
--- a/src/gallium/drivers/svga/svga_context.h
+++ b/src/gallium/drivers/svga/svga_context.h
@@ -74,6 +74,8 @@
  */
 #define SVGA_MAX_CONST_BUF_SIZE (4096 * 4 * sizeof(int))
 
+#define CONST0_UPLOAD_ALIGNMENT 256
+
 struct draw_vertex_shader;
 struct draw_fragment_shader;
 struct svga_shader_variant;
@@ -312,7 +314,7 @@ struct svga_hw_view_state
    struct svga_sampler_view *v;
    unsigned min_lod;
    unsigned max_lod;
-   int dirty;
+   boolean dirty;
 };
 
 /* Updated by calling svga_update_state( SVGA_STATE_HW_DRAW )
@@ -343,6 +345,11 @@ struct svga_hw_draw_state
    SVGA3dElementLayoutId layout_id;
    SVGA3dPrimitiveType topology;
 
+   /** Vertex buffer state */
+   SVGA3dVertexBuffer vbuffers[PIPE_MAX_ATTRIBS];
+   struct svga_winsys_surface *vbuffer_handles[PIPE_MAX_ATTRIBS];
+   unsigned num_vbuffers;
+
    struct svga_winsys_surface *ib;  /**< index buffer for drawing */
    SVGA3dSurfaceFormat ib_format;
    unsigned ib_offset;
diff --git a/src/gallium/drivers/svga/svga_draw.c b/src/gallium/drivers/svga/svga_draw.c
index 2d3631d6f9c..80526ed4d15 100644
--- a/src/gallium/drivers/svga/svga_draw.c
+++ b/src/gallium/drivers/svga/svga_draw.c
@@ -517,11 +517,27 @@ draw_vgpu10(struct svga_hwtnl *hwtnl,
          buffers[i].offset = hwtnl->cmd.vbufs[i].buffer_offset;
       }
       if (vbuf_count > 0) {
-         ret = SVGA3D_vgpu10_SetVertexBuffers(svga->swc, vbuf_count,
-                                              0,    /* startBuffer */
-                                              buffers, vb_handle);
-         if (ret != PIPE_OK)
-            return ret;
+         /* If we haven't yet emitted a drawing command or if any
+          * vertex buffer state is changing, issue that state now.
+          */
+         if (((hwtnl->cmd.swc->hints & SVGA_HINT_FLAG_CAN_PRE_FLUSH) == 0) ||
+             vbuf_count != svga->state.hw_draw.num_vbuffers ||
+             memcmp(buffers, svga->state.hw_draw.vbuffers,
+                    vbuf_count * sizeof(buffers[0])) ||
+             memcmp(vb_handle, svga->state.hw_draw.vbuffer_handles,
+                    vbuf_count * sizeof(vb_handle[0]))) {
+            ret = SVGA3D_vgpu10_SetVertexBuffers(svga->swc, vbuf_count,
+                                                 0,    /* startBuffer */
+                                                 buffers, vb_handle);
+            if (ret != PIPE_OK)
+               return ret;
+
+            svga->state.hw_draw.num_vbuffers = vbuf_count;
+            memcpy(svga->state.hw_draw.vbuffers, buffers,
+                   vbuf_count * sizeof(buffers[0]));
+            memcpy(svga->state.hw_draw.vbuffer_handles, vb_handle,
+                   vbuf_count * sizeof(vb_handle[0]));
+         }
       }
    }
 
diff --git a/src/gallium/drivers/svga/svga_pipe_rasterizer.c b/src/gallium/drivers/svga/svga_pipe_rasterizer.c
index fa1744fc33e..8e0db539574 100644
--- a/src/gallium/drivers/svga/svga_pipe_rasterizer.c
+++ b/src/gallium/drivers/svga/svga_pipe_rasterizer.c
@@ -368,13 +368,16 @@ static void svga_bind_rasterizer_state( struct pipe_context *pipe,
    struct svga_context *svga = svga_context(pipe);
    struct svga_rasterizer_state *raster = (struct svga_rasterizer_state *)state;
 
+   if (!raster ||
+       !svga->curr.rast ||
+       raster->templ.poly_stipple_enable !=
+       svga->curr.rast->templ.poly_stipple_enable) {
+      svga->dirty |= SVGA_NEW_STIPPLE;
+   }
+
    svga->curr.rast = raster;
 
    svga->dirty |= SVGA_NEW_RAST;
-
-   if (raster && raster->templ.poly_stipple_enable) {
-      svga->dirty |= SVGA_NEW_STIPPLE;
-   }
 }
 
 static void
diff --git a/src/gallium/drivers/svga/svga_pipe_sampler.c b/src/gallium/drivers/svga/svga_pipe_sampler.c
index 95241176510..3e778f0a087 100644
--- a/src/gallium/drivers/svga/svga_pipe_sampler.c
+++ b/src/gallium/drivers/svga/svga_pipe_sampler.c
@@ -287,6 +287,7 @@ svga_bind_sampler_states(struct pipe_context *pipe,
 {
    struct svga_context *svga = svga_context(pipe);
    unsigned i;
+   boolean any_change = FALSE;
 
    assert(shader < PIPE_SHADER_TYPES);
    assert(start + num <= PIPE_MAX_SAMPLERS);
@@ -295,8 +296,15 @@ svga_bind_sampler_states(struct pipe_context *pipe,
    if (!svga_have_vgpu10(svga) && shader != PIPE_SHADER_FRAGMENT)
       return;
 
-   for (i = 0; i < num; i++)
+   for (i = 0; i < num; i++) {
+      if (svga->curr.sampler[shader][start + i] != samplers[i])
+         any_change = TRUE;
       svga->curr.sampler[shader][start + i] = samplers[i];
+   }
+
+   if (!any_change) {
+      return;
+   }
 
    /* find highest non-null sampler[] entry */
    {
@@ -405,6 +413,7 @@ svga_set_sampler_views(struct pipe_context *pipe,
    unsigned flag_1d = 0;
    unsigned flag_srgb = 0;
    uint i;
+   boolean any_change = FALSE;
 
    assert(shader < PIPE_SHADER_TYPES);
    assert(start + num <= Elements(svga->curr.sampler_views[shader]));
@@ -422,6 +431,7 @@ svga_set_sampler_views(struct pipe_context *pipe,
          pipe_sampler_view_release(pipe, &svga->curr.sampler_views[shader][start + i]);
          pipe_sampler_view_reference(&svga->curr.sampler_views[shader][start + i],
                                      views[i]);
+         any_change = TRUE;
       }
 
       if (!views[i])
@@ -434,6 +444,10 @@ svga_set_sampler_views(struct pipe_context *pipe,
          flag_1d |= 1 << (start + i);
    }
 
+   if (!any_change) {
+      return;
+   }
+
    /* find highest non-null sampler_views[] entry */
    {
       unsigned j = MAX2(svga->curr.num_sampler_views[shader], start + num);
diff --git a/src/gallium/drivers/svga/svga_resource_buffer_upload.c b/src/gallium/drivers/svga/svga_resource_buffer_upload.c
index 8c5cff5abc1..7f7ceab0aa5 100644
--- a/src/gallium/drivers/svga/svga_resource_buffer_upload.c
+++ b/src/gallium/drivers/svga/svga_resource_buffer_upload.c
@@ -221,7 +221,7 @@ svga_buffer_upload_gb_command(struct svga_context *svga,
    struct svga_3d_update_gb_image *whole_update_cmd = NULL;
    uint32 numBoxes = sbuf->map.num_ranges;
    struct pipe_resource *dummy;
-   unsigned int i;
+   unsigned i;
 
    assert(numBoxes);
    assert(sbuf->dma.updates == NULL);
@@ -308,6 +308,7 @@ svga_buffer_upload_gb_command(struct svga_context *svga,
    pipe_resource_reference(&dummy, &sbuf->b.b);
    SVGA_FIFOCommitAll(swc);
 
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
    sbuf->dma.flags.discard = FALSE;
 
    return PIPE_OK;
@@ -381,6 +382,7 @@ svga_buffer_upload_command(struct svga_context *svga,
 
    SVGA_FIFOCommitAll(swc);
 
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
    sbuf->dma.flags.discard = FALSE;
 
    return PIPE_OK;
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index fca501bc47d..0f41e4ea254 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -342,6 +342,8 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
    case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
    case PIPE_CAP_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
    case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
    case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
    case PIPE_CAP_SAMPLER_VIEW_TARGET:
@@ -349,6 +351,8 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_VERTEXID_NOBASE:
    case PIPE_CAP_POLYGON_OFFSET_CLAMP:
    case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
       return 0;
    case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
       return 64;
@@ -384,6 +388,9 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_CLEAR_TEXTURE:
+   case PIPE_CAP_DRAW_PARAMETERS:
+   case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
       return 0;
    }
 
@@ -457,6 +464,7 @@ vgpu9_get_shader_param(struct pipe_screen *screen, unsigned shader,
       case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
+      case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
          return 0;
       case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
          return 32;
@@ -515,6 +523,7 @@ vgpu9_get_shader_param(struct pipe_screen *screen, unsigned shader,
       case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
+      case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
          return 0;
       case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
          return 32;
@@ -606,6 +615,7 @@ vgpu10_get_shader_param(struct pipe_screen *screen, unsigned shader,
    case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
+   case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
       return 0;
    case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
       return 32;
diff --git a/src/gallium/drivers/svga/svga_state_constants.c b/src/gallium/drivers/svga/svga_state_constants.c
index 2cf41134bd6..8ab1693088a 100644
--- a/src/gallium/drivers/svga/svga_state_constants.c
+++ b/src/gallium/drivers/svga/svga_state_constants.c
@@ -613,7 +613,8 @@ emit_constbuf_vgpu10(struct svga_context *svga, unsigned shader)
     */
    new_buf_size = align(new_buf_size, 16);
 
-   u_upload_alloc(svga->const0_upload, 0, new_buf_size, &offset,
+   u_upload_alloc(svga->const0_upload, 0, new_buf_size,
+                  CONST0_UPLOAD_ALIGNMENT, &offset,
                   &dst_buffer, &dst_map);
    if (!dst_map) {
       if (src_map)
diff --git a/src/gallium/drivers/svga/svga_state_fs.c b/src/gallium/drivers/svga/svga_state_fs.c
index e392778c2fb..bac91669be1 100644
--- a/src/gallium/drivers/svga/svga_state_fs.c
+++ b/src/gallium/drivers/svga/svga_state_fs.c
@@ -452,6 +452,7 @@ struct svga_tracked_state svga_hw_fs =
     SVGA_NEW_TEXTURE_BINDING |
     SVGA_NEW_NEED_SWTNL |
     SVGA_NEW_RAST |
+    SVGA_NEW_STIPPLE |
     SVGA_NEW_REDUCED_PRIMITIVE |
     SVGA_NEW_SAMPLER |
     SVGA_NEW_FRAME_BUFFER |
diff --git a/src/gallium/drivers/svga/svga_state_vs.c b/src/gallium/drivers/svga/svga_state_vs.c
index 24574c1bf85..a103dab25fe 100644
--- a/src/gallium/drivers/svga/svga_state_vs.c
+++ b/src/gallium/drivers/svga/svga_state_vs.c
@@ -173,8 +173,11 @@ make_vs_key(struct svga_context *svga, struct svga_compile_key *key)
       return;
    }
 
+   /* SVGA_NEW_PRESCALE */
    key->vs.need_prescale = svga->state.hw_clear.prescale.enabled &&
                            (svga->curr.gs == NULL);
+
+   /* SVGA_NEW_RAST */
    key->vs.allow_psiz = svga->curr.rast->templ.point_size_per_vertex;
 
    /* SVGA_NEW_FS */
diff --git a/src/gallium/drivers/svga/svga_swtnl_state.c b/src/gallium/drivers/svga/svga_swtnl_state.c
index 79dc0bf580c..4d21f4f0e60 100644
--- a/src/gallium/drivers/svga/svga_swtnl_state.c
+++ b/src/gallium/drivers/svga/svga_swtnl_state.c
@@ -220,8 +220,6 @@ svga_swtnl_update_vdecl( struct svga_context *svga )
    struct draw_context *draw = svga->swtnl.draw;
    struct vertex_info *vinfo = &svga_render->vertex_info;
    SVGA3dVertexDecl vdecl[PIPE_MAX_ATTRIBS];
-   const enum interp_mode colorInterp =
-      svga->curr.rast->templ.flatshade ? INTERP_CONSTANT : INTERP_LINEAR;
    struct svga_fragment_shader *fs = svga->curr.fs;
    int offset = 0;
    int nr_decls = 0;
@@ -236,7 +234,7 @@ svga_swtnl_update_vdecl( struct svga_context *svga )
 
    /* always add position */
    src = draw_find_shader_output(draw, TGSI_SEMANTIC_POSITION, 0);
-   draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_LINEAR, src);
+   draw_emit_vertex_attr(vinfo, EMIT_4F, src);
    vinfo->attrib[0].emit = EMIT_4F;
    vdecl[0].array.offset = offset;
    vdecl[0].identity.method = SVGA3D_DECLMETHOD_DEFAULT;
@@ -257,14 +255,14 @@ svga_swtnl_update_vdecl( struct svga_context *svga )
 
       switch (sem_name) {
       case TGSI_SEMANTIC_COLOR:
-         draw_emit_vertex_attr(vinfo, EMIT_4F, colorInterp, src);
+         draw_emit_vertex_attr(vinfo, EMIT_4F, src);
          vdecl[nr_decls].identity.usage = SVGA3D_DECLUSAGE_COLOR;
          vdecl[nr_decls].identity.type = SVGA3D_DECLTYPE_FLOAT4;
          offset += 16;
          nr_decls++;
          break;
       case TGSI_SEMANTIC_GENERIC:
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
+         draw_emit_vertex_attr(vinfo, EMIT_4F, src);
          vdecl[nr_decls].identity.usage = SVGA3D_DECLUSAGE_TEXCOORD;
          vdecl[nr_decls].identity.type = SVGA3D_DECLTYPE_FLOAT4;
          vdecl[nr_decls].identity.usageIndex =
@@ -273,7 +271,7 @@ svga_swtnl_update_vdecl( struct svga_context *svga )
          nr_decls++;
          break;
       case TGSI_SEMANTIC_FOG:
-         draw_emit_vertex_attr(vinfo, EMIT_1F, INTERP_PERSPECTIVE, src);
+         draw_emit_vertex_attr(vinfo, EMIT_1F, src);
          vdecl[nr_decls].identity.usage = SVGA3D_DECLUSAGE_TEXCOORD;
          vdecl[nr_decls].identity.type = SVGA3D_DECLTYPE_FLOAT1;
          assert(vdecl[nr_decls].identity.usageIndex == 0);
diff --git a/src/gallium/drivers/svga/svga_tgsi_insn.c b/src/gallium/drivers/svga/svga_tgsi_insn.c
index dbb90f7654e..489e68f88e8 100644
--- a/src/gallium/drivers/svga/svga_tgsi_insn.c
+++ b/src/gallium/drivers/svga/svga_tgsi_insn.c
@@ -166,7 +166,7 @@ scalar(struct src_register src, unsigned comp)
 static boolean
 svga_arl_needs_adjustment( const struct svga_shader_emitter *emit )
 {
-   int i;
+   unsigned i;
 
    for (i = 0; i < emit->num_arl_consts; ++i) {
       if (emit->arl_consts[i].arl_num == emit->current_arl)
@@ -179,7 +179,7 @@ svga_arl_needs_adjustment( const struct svga_shader_emitter *emit )
 static int
 svga_arl_adjustment( const struct svga_shader_emitter *emit )
 {
-   int i;
+   unsigned i;
 
    for (i = 0; i < emit->num_arl_consts; ++i) {
       if (emit->arl_consts[i].arl_num == emit->current_arl)
@@ -1175,7 +1175,7 @@ emit_div(struct svga_shader_emitter *emit,
    const struct src_register src1 =
       translate_src_register(emit, &insn->Src[1] );
    SVGA3dShaderDestToken temp = get_temp( emit );
-   int i;
+   unsigned i;
 
    /* For each enabled element, perform a RCP instruction.  Note that
     * RCP is scalar in SVGA3D:
@@ -1822,7 +1822,7 @@ emit_tex_swizzle(struct svga_shader_emitter *emit,
    const unsigned swizzleIn[4] = {swizzle_x, swizzle_y, swizzle_z, swizzle_w};
    unsigned srcSwizzle[4];
    unsigned srcWritemask = 0x0, zeroWritemask = 0x0, oneWritemask = 0x0;
-   int i;
+   unsigned i;
 
    /* build writemasks and srcSwizzle terms */
    for (i = 0; i < 4; i++) {
@@ -3371,7 +3371,7 @@ emit_light_twoside(struct svga_shader_emitter *emit)
    struct src_register back[2];
    SVGA3dShaderDestToken color[2];
    int count = emit->internal_color_count;
-   int i;
+   unsigned i;
    SVGA3dShaderInstToken if_token;
 
    if (count == 0)
@@ -3698,7 +3698,7 @@ static boolean
 pre_parse_add_indirect( struct svga_shader_emitter *emit,
                         int num, int current_arl)
 {
-   int i;
+   unsigned i;
    assert(num < 0);
 
    for (i = 0; i < emit->num_arl_consts; ++i) {
@@ -3844,7 +3844,8 @@ svga_shader_emit_instructions(struct svga_shader_emitter *emit,
    if (emit->unit == PIPE_SHADER_FRAGMENT && emit->key.fs.pstipple) {
       unsigned unit;
 
-      new_tokens = util_pstipple_create_fragment_shader(tokens, &unit, 0);
+      new_tokens = util_pstipple_create_fragment_shader(tokens, &unit, 0,
+                                                        TGSI_FILE_INPUT);
 
       if (new_tokens) {
          /* Setup texture state for stipple */
diff --git a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
index c979f4a8a56..1223e446055 100644
--- a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
+++ b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
@@ -2298,11 +2298,13 @@ emit_vgpu10_declaration(struct svga_shader_emitter_v10 *emit,
       emit->num_samplers = MAX2(emit->num_samplers, decl->Range.Last + 1);
       return TRUE;
 
+#if 0
    case TGSI_FILE_RESOURCE:
       /*opcode0.opcodeType = VGPU10_OPCODE_DCL_RESOURCE;*/
       /* XXX more, VGPU10_RETURN_TYPE_FLOAT */
       assert(!"TGSI_FILE_RESOURCE not handled yet");
       return FALSE;
+#endif
 
    case TGSI_FILE_ADDRESS:
       emit->num_address_regs = MAX2(emit->num_address_regs,
@@ -6170,6 +6172,11 @@ emit_vertex_attrib_instructions(struct svga_shader_emitter_v10 *emit)
 
       while (adjust_mask) {
          unsigned index = u_bit_scan(&adjust_mask);
+
+         /* skip the instruction if this vertex attribute is not being used */
+         if (emit->info.input_usage_mask[index] == 0)
+            continue;
+
          unsigned tmp = emit->vs.adjusted_input[index];
          struct tgsi_full_src_register input_src =
             make_src_reg(TGSI_FILE_INPUT, index);
@@ -6604,7 +6611,8 @@ transform_fs_pstipple(struct svga_shader_emitter_v10 *emit,
       tgsi_dump(tokens,0);
    }
 
-   new_tokens = util_pstipple_create_fragment_shader(tokens, &unit, 0);
+   new_tokens = util_pstipple_create_fragment_shader(tokens, &unit, 0,
+                                                     TGSI_FILE_INPUT);
 
    emit->fs.pstipple_sampler_unit = unit;
 
diff --git a/src/gallium/drivers/svga/svga_winsys.h b/src/gallium/drivers/svga/svga_winsys.h
index 3129e46ed06..562c6690fc1 100644
--- a/src/gallium/drivers/svga/svga_winsys.h
+++ b/src/gallium/drivers/svga/svga_winsys.h
@@ -85,7 +85,7 @@ struct winsys_handle;
 #define SVGA_QUERY_FLAG_SET        (1 << 0)
 #define SVGA_QUERY_FLAG_REF        (1 << 1)
 
-#define SVGA_HINT_FLAG_DRAW_EMITTED (1 << 0)
+#define SVGA_HINT_FLAG_CAN_PRE_FLUSH (1 << 0)  /* Can preemptively flush */
 
 /** Opaque surface handle */
 struct svga_winsys_surface;
diff --git a/src/gallium/drivers/vc4/vc4_context.c b/src/gallium/drivers/vc4/vc4_context.c
index 312b006f96e..a0888f23265 100644
--- a/src/gallium/drivers/vc4/vc4_context.c
+++ b/src/gallium/drivers/vc4/vc4_context.c
@@ -254,8 +254,9 @@ vc4_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags)
         if (!vc4->primconvert)
                 goto fail;
 
-        vc4->uploader = u_upload_create(pctx, 16 * 1024, 4,
-                                        PIPE_BIND_INDEX_BUFFER);
+        vc4->uploader = u_upload_create(pctx, 16 * 1024,
+                                        PIPE_BIND_INDEX_BUFFER,
+                                        PIPE_USAGE_STREAM);
 
         vc4_debug |= saved_shaderdb_flag;
 
diff --git a/src/gallium/drivers/vc4/vc4_draw.c b/src/gallium/drivers/vc4/vc4_draw.c
index c00855698b8..9b0b540d3fc 100644
--- a/src/gallium/drivers/vc4/vc4_draw.c
+++ b/src/gallium/drivers/vc4/vc4_draw.c
@@ -319,7 +319,7 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
                         if (vc4->indexbuf.user_buffer) {
                                 prsc = NULL;
                                 u_upload_data(vc4->uploader, 0,
-                                              info->count * index_size,
+                                              info->count * index_size, 4,
                                               vc4->indexbuf.user_buffer,
                                               &offset, &prsc);
                         } else {
diff --git a/src/gallium/drivers/vc4/vc4_opt_algebraic.c b/src/gallium/drivers/vc4/vc4_opt_algebraic.c
index aea2b9dbe87..b8ce377ff6b 100644
--- a/src/gallium/drivers/vc4/vc4_opt_algebraic.c
+++ b/src/gallium/drivers/vc4/vc4_opt_algebraic.c
@@ -145,43 +145,6 @@ qir_opt_algebraic(struct vc4_compile *c)
 
         list_for_each_entry(struct qinst, inst, &c->instructions, link) {
                 switch (inst->op) {
-                case QOP_SEL_X_Y_ZS:
-                case QOP_SEL_X_Y_ZC:
-                case QOP_SEL_X_Y_NS:
-                case QOP_SEL_X_Y_NC:
-                case QOP_SEL_X_Y_CS:
-                case QOP_SEL_X_Y_CC:
-                        if (is_zero(c, inst->src[1])) {
-                                /* Replace references to a 0 uniform value
-                                 * with the SEL_X_0 equivalent.
-                                 */
-                                dump_from(c, inst);
-                                inst->op -= (QOP_SEL_X_Y_ZS - QOP_SEL_X_0_ZS);
-                                inst->src[1] = c->undef;
-                                progress = true;
-                                dump_to(c, inst);
-                                break;
-                        }
-
-                        if (is_zero(c, inst->src[0])) {
-                                /* Replace references to a 0 uniform value
-                                 * with the SEL_X_0 equivalent, flipping the
-                                 * condition being evaluated since the operand
-                                 * order is flipped.
-                                 */
-                                dump_from(c, inst);
-                                inst->op -= QOP_SEL_X_Y_ZS;
-                                inst->op ^= 1;
-                                inst->op += QOP_SEL_X_0_ZS;
-                                inst->src[0] = inst->src[1];
-                                inst->src[1] = c->undef;
-                                progress = true;
-                                dump_to(c, inst);
-                                break;
-                        }
-
-                        break;
-
                 case QOP_FMIN:
                         if (is_1f(c, inst->src[1]) &&
                             inst->src[0].pack >= QPU_UNPACK_8D_REP &&
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index da0d21111a0..3e402d048ba 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -89,7 +89,7 @@ indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr)
                 range->dst_offset = c->next_ubo_dst_offset;
                 c->next_ubo_dst_offset += range->size;
                 c->num_ubo_ranges++;
-        };
+        }
 
         offset -= range->src_offset;
 
@@ -204,27 +204,6 @@ ntq_get_alu_src(struct vc4_compile *c, nir_alu_instr *instr,
         return r;
 };
 
-static struct qreg
-get_swizzled_channel(struct vc4_compile *c,
-                     struct qreg *srcs, int swiz)
-{
-        switch (swiz) {
-        default:
-        case UTIL_FORMAT_SWIZZLE_NONE:
-                fprintf(stderr, "warning: unknown swizzle\n");
-                /* FALLTHROUGH */
-        case UTIL_FORMAT_SWIZZLE_0:
-                return qir_uniform_f(c, 0.0);
-        case UTIL_FORMAT_SWIZZLE_1:
-                return qir_uniform_f(c, 1.0);
-        case UTIL_FORMAT_SWIZZLE_X:
-        case UTIL_FORMAT_SWIZZLE_Y:
-        case UTIL_FORMAT_SWIZZLE_Z:
-        case UTIL_FORMAT_SWIZZLE_W:
-                return srcs[swiz];
-        }
-}
-
 static inline struct qreg
 qir_SAT(struct vc4_compile *c, struct qreg val)
 {
@@ -275,7 +254,7 @@ qir_srgb_decode(struct vc4_compile *c, struct qreg srgb)
                                    qir_uniform_f(c, 2.4));
 
         qir_SF(c, qir_FSUB(c, srgb, qir_uniform_f(c, 0.04045)));
-        return qir_SEL_X_Y_NS(c, low, high);
+        return qir_SEL(c, QPU_COND_NS, low, high);
 }
 
 static struct qreg
@@ -338,30 +317,20 @@ ntq_emit_txf(struct vc4_compile *c, nir_tex_instr *instr)
         struct qreg tex = qir_TEX_RESULT(c);
         c->num_texture_samples++;
 
-        struct qreg texture_output[4];
+        struct qreg *dest = ntq_get_dest(c, &instr->dest);
         enum pipe_format format = c->key->tex[unit].format;
         if (util_format_is_depth_or_stencil(format)) {
                 struct qreg scaled = ntq_scale_depth_texture(c, tex);
                 for (int i = 0; i < 4; i++)
-                        texture_output[i] = scaled;
+                        dest[i] = scaled;
         } else {
-                struct qreg tex_result_unpacked[4];
                 for (int i = 0; i < 4; i++)
-                        tex_result_unpacked[i] = qir_UNPACK_8_F(c, tex, i);
-
-                const uint8_t *format_swiz =
-                        vc4_get_format_swizzle(c->key->tex[unit].format);
-                for (int i = 0; i < 4; i++) {
-                        texture_output[i] =
-                                get_swizzled_channel(c, tex_result_unpacked,
-                                                     format_swiz[i]);
-                }
+                        dest[i] = qir_UNPACK_8_F(c, tex, i);
         }
 
-        struct qreg *dest = ntq_get_dest(c, &instr->dest);
         for (int i = 0; i < 4; i++) {
-                dest[i] = get_swizzled_channel(c, texture_output,
-                                               c->key->tex[unit].swizzle[i]);
+                if (c->tex_srgb_decode[unit] & (1 << i))
+                        dest[i] = qir_srgb_decode(c, dest[i]);
         }
 }
 
@@ -470,12 +439,13 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
 
         enum pipe_format format = c->key->tex[unit].format;
 
-        struct qreg unpacked[4];
+        struct qreg *dest = ntq_get_dest(c, &instr->dest);
         if (util_format_is_depth_or_stencil(format)) {
                 struct qreg normalized = ntq_scale_depth_texture(c, tex);
                 struct qreg depth_output;
 
-                struct qreg one = qir_uniform_f(c, 1.0f);
+                struct qreg u0 = qir_uniform_f(c, 0.0f);
+                struct qreg u1 = qir_uniform_f(c, 1.0f);
                 if (c->key->tex[unit].compare_mode) {
                         if (has_proj)
                                 compare = qir_FMUL(c, compare, proj);
@@ -485,31 +455,31 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
                                 depth_output = qir_uniform_f(c, 0.0f);
                                 break;
                         case PIPE_FUNC_ALWAYS:
-                                depth_output = one;
+                                depth_output = u1;
                                 break;
                         case PIPE_FUNC_EQUAL:
                                 qir_SF(c, qir_FSUB(c, compare, normalized));
-                                depth_output = qir_SEL_X_0_ZS(c, one);
+                                depth_output = qir_SEL(c, QPU_COND_ZS, u1, u0);
                                 break;
                         case PIPE_FUNC_NOTEQUAL:
                                 qir_SF(c, qir_FSUB(c, compare, normalized));
-                                depth_output = qir_SEL_X_0_ZC(c, one);
+                                depth_output = qir_SEL(c, QPU_COND_ZC, u1, u0);
                                 break;
                         case PIPE_FUNC_GREATER:
                                 qir_SF(c, qir_FSUB(c, compare, normalized));
-                                depth_output = qir_SEL_X_0_NC(c, one);
+                                depth_output = qir_SEL(c, QPU_COND_NC, u1, u0);
                                 break;
                         case PIPE_FUNC_GEQUAL:
                                 qir_SF(c, qir_FSUB(c, normalized, compare));
-                                depth_output = qir_SEL_X_0_NS(c, one);
+                                depth_output = qir_SEL(c, QPU_COND_NS, u1, u0);
                                 break;
                         case PIPE_FUNC_LESS:
                                 qir_SF(c, qir_FSUB(c, compare, normalized));
-                                depth_output = qir_SEL_X_0_NS(c, one);
+                                depth_output = qir_SEL(c, QPU_COND_NS, u1, u0);
                                 break;
                         case PIPE_FUNC_LEQUAL:
                                 qir_SF(c, qir_FSUB(c, normalized, compare));
-                                depth_output = qir_SEL_X_0_NC(c, one);
+                                depth_output = qir_SEL(c, QPU_COND_NC, u1, u0);
                                 break;
                         }
                 } else {
@@ -517,29 +487,15 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
                 }
 
                 for (int i = 0; i < 4; i++)
-                        unpacked[i] = depth_output;
+                        dest[i] = depth_output;
         } else {
                 for (int i = 0; i < 4; i++)
-                        unpacked[i] = qir_UNPACK_8_F(c, tex, i);
-        }
-
-        const uint8_t *format_swiz = vc4_get_format_swizzle(format);
-        struct qreg texture_output[4];
-        for (int i = 0; i < 4; i++) {
-                texture_output[i] = get_swizzled_channel(c, unpacked,
-                                                         format_swiz[i]);
-        }
-
-        if (util_format_is_srgb(format)) {
-                for (int i = 0; i < 3; i++)
-                        texture_output[i] = qir_srgb_decode(c,
-                                                            texture_output[i]);
+                        dest[i] = qir_UNPACK_8_F(c, tex, i);
         }
 
-        struct qreg *dest = ntq_get_dest(c, &instr->dest);
         for (int i = 0; i < 4; i++) {
-                dest[i] = get_swizzled_channel(c, texture_output,
-                                               c->key->tex[unit].swizzle[i]);
+                if (c->tex_srgb_decode[unit] & (1 << i))
+                        dest[i] = qir_srgb_decode(c, dest[i]);
         }
 }
 
@@ -553,9 +509,8 @@ ntq_ffract(struct vc4_compile *c, struct qreg src)
         struct qreg trunc = qir_ITOF(c, qir_FTOI(c, src));
         struct qreg diff = qir_FSUB(c, src, trunc);
         qir_SF(c, diff);
-        return qir_SEL_X_Y_NS(c,
-                              qir_FADD(c, diff, qir_uniform_f(c, 1.0)),
-                              diff);
+        return qir_SEL(c, QPU_COND_NS,
+                       qir_FADD(c, diff, qir_uniform_f(c, 1.0)), diff);
 }
 
 /**
@@ -572,9 +527,8 @@ ntq_ffloor(struct vc4_compile *c, struct qreg src)
          */
         qir_SF(c, qir_FSUB(c, src, trunc));
 
-        return qir_SEL_X_Y_NS(c,
-                              qir_FSUB(c, trunc, qir_uniform_f(c, 1.0)),
-                              trunc);
+        return qir_SEL(c, QPU_COND_NS,
+                       qir_FSUB(c, trunc, qir_uniform_f(c, 1.0)), trunc);
 }
 
 /**
@@ -591,9 +545,8 @@ ntq_fceil(struct vc4_compile *c, struct qreg src)
          */
         qir_SF(c, qir_FSUB(c, trunc, src));
 
-        return qir_SEL_X_Y_NS(c,
-                              qir_FADD(c, trunc, qir_uniform_f(c, 1.0)),
-                              trunc);
+        return qir_SEL(c, QPU_COND_NS,
+                       qir_FADD(c, trunc, qir_uniform_f(c, 1.0)), trunc);
 }
 
 static struct qreg
@@ -668,10 +621,13 @@ ntq_fcos(struct vc4_compile *c, struct qreg src)
 static struct qreg
 ntq_fsign(struct vc4_compile *c, struct qreg src)
 {
+        struct qreg t = qir_get_temp(c);
+
         qir_SF(c, src);
-        return qir_SEL_X_Y_NC(c,
-                              qir_SEL_X_0_ZC(c, qir_uniform_f(c, 1.0)),
-                              qir_uniform_f(c, -1.0));
+        qir_MOV_dest(c, t, qir_uniform_f(c, 0.0));
+        qir_MOV_dest(c, t, qir_uniform_f(c, 1.0))->cond = QPU_COND_ZC;
+        qir_MOV_dest(c, t, qir_uniform_f(c, -1.0))->cond = QPU_COND_NS;
+        return t;
 }
 
 static void
@@ -888,6 +844,100 @@ ntq_emit_ubfe(struct vc4_compile *c, struct qreg base, struct qreg offset,
         return qir_UNPACK_8_I(c, base, offset_bit / 8);
 }
 
+/**
+ * If compare_instr is a valid comparison instruction, emits the
+ * compare_instr's comparison and returns the sel_instr's return value based
+ * on the compare_instr's result.
+ */
+static bool
+ntq_emit_comparison(struct vc4_compile *c, struct qreg *dest,
+                    nir_alu_instr *compare_instr,
+                    nir_alu_instr *sel_instr)
+{
+        enum qpu_cond cond;
+
+        switch (compare_instr->op) {
+        case nir_op_feq:
+        case nir_op_ieq:
+        case nir_op_seq:
+                cond = QPU_COND_ZS;
+                break;
+        case nir_op_fne:
+        case nir_op_ine:
+        case nir_op_sne:
+                cond = QPU_COND_ZC;
+                break;
+        case nir_op_fge:
+        case nir_op_ige:
+        case nir_op_uge:
+        case nir_op_sge:
+                cond = QPU_COND_NC;
+                break;
+        case nir_op_flt:
+        case nir_op_ilt:
+        case nir_op_slt:
+                cond = QPU_COND_NS;
+                break;
+        default:
+                return false;
+        }
+
+        struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0);
+        struct qreg src1 = ntq_get_alu_src(c, compare_instr, 1);
+
+        if (nir_op_infos[compare_instr->op].input_types[0] == nir_type_float)
+                qir_SF(c, qir_FSUB(c, src0, src1));
+        else
+                qir_SF(c, qir_SUB(c, src0, src1));
+
+        switch (sel_instr->op) {
+        case nir_op_seq:
+        case nir_op_sne:
+        case nir_op_sge:
+        case nir_op_slt:
+                *dest = qir_SEL(c, cond,
+                                qir_uniform_f(c, 1.0), qir_uniform_f(c, 0.0));
+                break;
+
+        case nir_op_bcsel:
+                *dest = qir_SEL(c, cond,
+                                ntq_get_alu_src(c, sel_instr, 1),
+                                ntq_get_alu_src(c, sel_instr, 2));
+                break;
+
+        default:
+                *dest = qir_SEL(c, cond,
+                                qir_uniform_ui(c, ~0), qir_uniform_ui(c, 0));
+                break;
+        }
+
+        return true;
+}
+
+/**
+ * Attempts to fold a comparison generating a boolean result into the
+ * condition code for selecting between two values, instead of comparing the
+ * boolean result against 0 to generate the condition code.
+ */
+static struct qreg ntq_emit_bcsel(struct vc4_compile *c, nir_alu_instr *instr,
+                                  struct qreg *src)
+{
+        if (!instr->src[0].src.is_ssa)
+                goto out;
+        nir_alu_instr *compare =
+                nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
+        if (!compare)
+                goto out;
+
+        struct qreg dest;
+        if (ntq_emit_comparison(c, &dest, compare, instr))
+                return dest;
+
+out:
+        qir_SF(c, src[0]);
+        return qir_SEL(c, QPU_COND_NS, src[1], src[2]);
+}
+
 static void
 ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
 {
@@ -974,7 +1024,9 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
         case nir_op_i2b:
         case nir_op_f2b:
                 qir_SF(c, src[0]);
-                *dest = qir_SEL_X_0_ZC(c, qir_uniform_ui(c, ~0));
+                *dest = qir_SEL(c, QPU_COND_ZC,
+                                qir_uniform_ui(c, ~0),
+                                qir_uniform_ui(c, 0));
                 break;
 
         case nir_op_iadd:
@@ -1016,65 +1068,29 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
                 break;
 
         case nir_op_seq:
-                qir_SF(c, qir_FSUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_ZS(c, qir_uniform_f(c, 1.0));
-                break;
         case nir_op_sne:
-                qir_SF(c, qir_FSUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_ZC(c, qir_uniform_f(c, 1.0));
-                break;
         case nir_op_sge:
-                qir_SF(c, qir_FSUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_NC(c, qir_uniform_f(c, 1.0));
-                break;
         case nir_op_slt:
-                qir_SF(c, qir_FSUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_NS(c, qir_uniform_f(c, 1.0));
-                break;
         case nir_op_feq:
-                qir_SF(c, qir_FSUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_ZS(c, qir_uniform_ui(c, ~0));
-                break;
         case nir_op_fne:
-                qir_SF(c, qir_FSUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_ZC(c, qir_uniform_ui(c, ~0));
-                break;
         case nir_op_fge:
-                qir_SF(c, qir_FSUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_NC(c, qir_uniform_ui(c, ~0));
-                break;
         case nir_op_flt:
-                qir_SF(c, qir_FSUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_NS(c, qir_uniform_ui(c, ~0));
-                break;
         case nir_op_ieq:
-                qir_SF(c, qir_SUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_ZS(c, qir_uniform_ui(c, ~0));
-                break;
         case nir_op_ine:
-                qir_SF(c, qir_SUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_ZC(c, qir_uniform_ui(c, ~0));
-                break;
         case nir_op_ige:
-                qir_SF(c, qir_SUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_NC(c, qir_uniform_ui(c, ~0));
-                break;
         case nir_op_uge:
-                qir_SF(c, qir_SUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_CC(c, qir_uniform_ui(c, ~0));
-                break;
         case nir_op_ilt:
-                qir_SF(c, qir_SUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_NS(c, qir_uniform_ui(c, ~0));
+                if (!ntq_emit_comparison(c, dest, instr, instr)) {
+                        fprintf(stderr, "Bad comparison instruction\n");
+                }
                 break;
 
         case nir_op_bcsel:
-                qir_SF(c, src[0]);
-                *dest = qir_SEL_X_Y_NS(c, src[1], src[2]);
+                *dest = ntq_emit_bcsel(c, instr, src);
                 break;
         case nir_op_fcsel:
                 qir_SF(c, src[0]);
-                *dest = qir_SEL_X_Y_ZC(c, src[1], src[2]);
+                *dest = qir_SEL(c, QPU_COND_ZC, src[1], src[2]);
                 break;
 
         case nir_op_frcp:
@@ -1789,6 +1805,56 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,
         if (stage == QSTAGE_FRAG)
                 vc4_nir_lower_blend(c);
 
+        struct nir_lower_tex_options tex_options = {
+                /* We would need to implement txs, but we don't want the
+                 * int/float conversions
+                 */
+                .lower_rect = false,
+
+                /* We want to use this, but we don't want to newton-raphson
+                 * its rcp.
+                 */
+                .lower_txp = false,
+
+                /* Apply swizzles to all samplers. */
+                .swizzle_result = ~0,
+        };
+
+        /* Lower the format swizzle and ARB_texture_swizzle-style swizzle.
+         * The format swizzling applies before sRGB decode, and
+         * ARB_texture_swizzle is the last thing before returning the sample.
+         */
+        for (int i = 0; i < ARRAY_SIZE(key->tex); i++) {
+                enum pipe_format format = c->key->tex[i].format;
+
+                if (!format)
+                        continue;
+
+                const uint8_t *format_swizzle = vc4_get_format_swizzle(format);
+
+                for (int j = 0; j < 4; j++) {
+                        uint8_t arb_swiz = c->key->tex[i].swizzle[j];
+
+                        if (arb_swiz <= 3) {
+                                tex_options.swizzles[i][j] =
+                                        format_swizzle[arb_swiz];
+                        } else {
+                                tex_options.swizzles[i][j] = arb_swiz;
+                        }
+
+                        /* If ARB_texture_swizzle is reading from the R, G, or
+                         * B channels of an sRGB texture, then we need to
+                         * apply sRGB decode to this channel at sample time.
+                         */
+                        if (arb_swiz < 3 && util_format_is_srgb(format)) {
+                                c->tex_srgb_decode[i] |= (1 << j);
+                        }
+
+                }
+        }
+
+        nir_lower_tex(c->s, &tex_options);
+
         if (c->fs_key && c->fs_key->light_twoside)
                 nir_lower_two_sided_color(c->s);
 
diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c
index c6916c48e7e..efbb69b71a7 100644
--- a/src/gallium/drivers/vc4/vc4_qir.c
+++ b/src/gallium/drivers/vc4/vc4_qir.c
@@ -65,19 +65,6 @@ static const struct qir_op_info qir_op_info[] = {
         [QOP_XOR] = { "xor", 1, 2 },
         [QOP_NOT] = { "not", 1, 1 },
 
-        [QOP_SEL_X_0_NS] = { "fsel_x_0_ns", 1, 1, false, true },
-        [QOP_SEL_X_0_NC] = { "fsel_x_0_nc", 1, 1, false, true },
-        [QOP_SEL_X_0_ZS] = { "fsel_x_0_zs", 1, 1, false, true },
-        [QOP_SEL_X_0_ZC] = { "fsel_x_0_zc", 1, 1, false, true },
-        [QOP_SEL_X_0_CS] = { "fsel_x_0_cs", 1, 1, false, true },
-        [QOP_SEL_X_0_CC] = { "fsel_x_0_cc", 1, 1, false, true },
-        [QOP_SEL_X_Y_NS] = { "fsel_x_y_ns", 1, 2, false, true },
-        [QOP_SEL_X_Y_NC] = { "fsel_x_y_nc", 1, 2, false, true },
-        [QOP_SEL_X_Y_ZS] = { "fsel_x_y_zs", 1, 2, false, true },
-        [QOP_SEL_X_Y_ZC] = { "fsel_x_y_zc", 1, 2, false, true },
-        [QOP_SEL_X_Y_CS] = { "fsel_x_y_cs", 1, 2, false, true },
-        [QOP_SEL_X_Y_CC] = { "fsel_x_y_cc", 1, 2, false, true },
-
         [QOP_RCP] = { "rcp", 1, 1, false, true },
         [QOP_RSQ] = { "rsq", 1, 1, false, true },
         [QOP_EXP2] = { "exp2", 1, 2, false, true },
@@ -219,23 +206,8 @@ qir_is_tex(struct qinst *inst)
 bool
 qir_depends_on_flags(struct qinst *inst)
 {
-        switch (inst->op) {
-        case QOP_SEL_X_0_NS:
-        case QOP_SEL_X_0_NC:
-        case QOP_SEL_X_0_ZS:
-        case QOP_SEL_X_0_ZC:
-        case QOP_SEL_X_0_CS:
-        case QOP_SEL_X_0_CC:
-        case QOP_SEL_X_Y_NS:
-        case QOP_SEL_X_Y_NC:
-        case QOP_SEL_X_Y_ZS:
-        case QOP_SEL_X_Y_ZC:
-        case QOP_SEL_X_Y_CS:
-        case QOP_SEL_X_Y_CC:
-                return true;
-        default:
-                return false;
-        }
+        return (inst->cond != QPU_COND_ALWAYS &&
+                inst->cond != QPU_COND_NEVER);
 }
 
 bool
@@ -292,8 +264,19 @@ qir_print_reg(struct vc4_compile *c, struct qreg reg, bool write)
 void
 qir_dump_inst(struct vc4_compile *c, struct qinst *inst)
 {
-        fprintf(stderr, "%s%s ",
+        static const char *conditions[] = {
+                [QPU_COND_ALWAYS] = "",
+                [QPU_COND_NEVER] = ".never",
+                [QPU_COND_ZS] = ".zs",
+                [QPU_COND_ZC] = ".zc",
+                [QPU_COND_NS] = ".ns",
+                [QPU_COND_NC] = ".nc",
+                [QPU_COND_CS] = ".cs",
+                [QPU_COND_CC] = ".cc",
+        };
+        fprintf(stderr, "%s%s%s ",
                 qir_get_op_name(inst->op),
+                conditions[inst->cond],
                 inst->sf ? ".sf" : "");
 
         qir_print_reg(c, inst->dst, true);
@@ -352,6 +335,7 @@ qir_inst(enum qop op, struct qreg dst, struct qreg src0, struct qreg src1)
         inst->src = calloc(2, sizeof(inst->src[0]));
         inst->src[0] = src0;
         inst->src[1] = src1;
+        inst->cond = QPU_COND_ALWAYS;
 
         return inst;
 }
@@ -503,9 +487,9 @@ qir_SF(struct vc4_compile *c, struct qreg src)
         if (!list_empty(&c->instructions))
                 last_inst = (struct qinst *)c->instructions.prev;
 
-        if (!last_inst ||
-            last_inst->dst.file != src.file ||
-            last_inst->dst.index != src.index ||
+        if (src.file != QFILE_TEMP ||
+            !c->defs[src.index] ||
+            last_inst != c->defs[src.index] ||
             qir_is_multi_instruction(last_inst)) {
                 src = qir_MOV(c, src);
                 last_inst = (struct qinst *)c->instructions.prev;
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index b0fbb4c1db2..4ab4d35d0ca 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -93,23 +93,6 @@ enum qop {
         QOP_XOR,
         QOP_NOT,
 
-        /* Note: Orderings of these compares must be the same as in
-         * qpu_defines.h.  Selects the src[0] if the ns flag bit is set,
-         * otherwise 0. */
-        QOP_SEL_X_0_ZS,
-        QOP_SEL_X_0_ZC,
-        QOP_SEL_X_0_NS,
-        QOP_SEL_X_0_NC,
-        QOP_SEL_X_0_CS,
-        QOP_SEL_X_0_CC,
-        /* Selects the src[0] if the ns flag bit is set, otherwise src[1]. */
-        QOP_SEL_X_Y_ZS,
-        QOP_SEL_X_Y_ZC,
-        QOP_SEL_X_Y_NS,
-        QOP_SEL_X_Y_NC,
-        QOP_SEL_X_Y_CS,
-        QOP_SEL_X_Y_CC,
-
         QOP_FTOI,
         QOP_ITOF,
         QOP_RCP,
@@ -170,6 +153,7 @@ struct qinst {
         struct qreg dst;
         struct qreg *src;
         bool sf;
+        uint8_t cond;
 };
 
 enum qstage {
@@ -385,6 +369,11 @@ struct vc4_compile {
 
         uint8_t vattr_sizes[8];
 
+        /* Bitfield for whether a given channel of a sampler needs sRGB
+         * decode.
+         */
+        uint8_t tex_srgb_decode[VC4_MAX_TEXTURE_SAMPLERS];
+
         /**
          * Array of the VARYING_SLOT_* of all FS QFILE_VARY reads.
          *
@@ -463,9 +452,11 @@ void qir_schedule_instructions(struct vc4_compile *c);
 void qir_reorder_uniforms(struct vc4_compile *c);
 
 void qir_emit(struct vc4_compile *c, struct qinst *inst);
-static inline void qir_emit_nodef(struct vc4_compile *c, struct qinst *inst)
+static inline struct qinst *
+qir_emit_nodef(struct vc4_compile *c, struct qinst *inst)
 {
         list_addtail(&inst->link, &c->instructions);
+        return inst;
 }
 
 struct qreg qir_get_temp(struct vc4_compile *c);
@@ -536,11 +527,12 @@ qir_##name(struct vc4_compile *c, struct qreg a)                         \
         qir_emit(c, qir_inst(QOP_##name, t, a, c->undef));               \
         return t;                                                        \
 }                                                                        \
-static inline void                                                       \
+static inline struct qinst *                                             \
 qir_##name##_dest(struct vc4_compile *c, struct qreg dest,               \
                   struct qreg a)                                         \
 {                                                                        \
-        qir_emit_nodef(c, qir_inst(QOP_##name, dest, a, c->undef));      \
+        return qir_emit_nodef(c, qir_inst(QOP_##name, dest, a,           \
+                                          c->undef));                    \
 }
 
 #define QIR_ALU2(name)                                                   \
@@ -592,18 +584,6 @@ QIR_ALU2(V8MAX)
 QIR_ALU2(V8ADDS)
 QIR_ALU2(V8SUBS)
 QIR_ALU2(MUL24)
-QIR_ALU1(SEL_X_0_ZS)
-QIR_ALU1(SEL_X_0_ZC)
-QIR_ALU1(SEL_X_0_NS)
-QIR_ALU1(SEL_X_0_NC)
-QIR_ALU1(SEL_X_0_CS)
-QIR_ALU1(SEL_X_0_CC)
-QIR_ALU2(SEL_X_Y_ZS)
-QIR_ALU2(SEL_X_Y_ZC)
-QIR_ALU2(SEL_X_Y_NS)
-QIR_ALU2(SEL_X_Y_NC)
-QIR_ALU2(SEL_X_Y_CS)
-QIR_ALU2(SEL_X_Y_CC)
 QIR_ALU2(FMIN)
 QIR_ALU2(FMAX)
 QIR_ALU2(FMINABS)
@@ -648,6 +628,17 @@ QIR_NODST_1(TLB_STENCIL_SETUP)
 QIR_NODST_1(MS_MASK)
 
 static inline struct qreg
+qir_SEL(struct vc4_compile *c, uint8_t cond, struct qreg src0, struct qreg src1)
+{
+        struct qreg t = qir_get_temp(c);
+        struct qinst *a = qir_MOV_dest(c, t, src0);
+        struct qinst *b = qir_MOV_dest(c, t, src1);
+        a->cond = cond;
+        b->cond = cond ^ 1;
+        return t;
+}
+
+static inline struct qreg
 qir_UNPACK_8_F(struct vc4_compile *c, struct qreg src, int i)
 {
         struct qreg t = qir_FMOV(c, src);
diff --git a/src/gallium/drivers/vc4/vc4_qir_schedule.c b/src/gallium/drivers/vc4/vc4_qir_schedule.c
index d20815f055e..2f280c54523 100644
--- a/src/gallium/drivers/vc4/vc4_qir_schedule.c
+++ b/src/gallium/drivers/vc4/vc4_qir_schedule.c
@@ -250,12 +250,11 @@ calculate_deps(struct schedule_setup_state *state, struct schedule_node *n)
         else if (inst->dst.file == QFILE_TEMP)
                 add_write_dep(dir, &state->last_temp_write[inst->dst.index], n);
 
+        if (qir_depends_on_flags(inst))
+                add_dep(dir, state->last_sf, n);
+
         if (inst->sf)
                 add_write_dep(dir, &state->last_sf, n);
-
-        if (qir_depends_on_flags(inst)) {
-                add_dep(dir, state->last_sf, n);
-        }
 }
 
 static void
diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c
index cb4e0cfcc3f..b06702afea2 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_emit.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c
@@ -64,6 +64,12 @@ set_last_cond_add(struct vc4_compile *c, uint32_t cond)
         *last_inst(c) = qpu_set_cond_add(*last_inst(c), cond);
 }
 
+static void
+set_last_cond_mul(struct vc4_compile *c, uint32_t cond)
+{
+        *last_inst(c) = qpu_set_cond_mul(*last_inst(c), cond);
+}
+
 /**
  * Some special registers can be read from either file, which lets us resolve
  * raddr conflicts without extra MOVs.
@@ -306,42 +312,9 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                         break;
                 }
 
-                switch (qinst->op) {
-                case QOP_SEL_X_0_ZS:
-                case QOP_SEL_X_0_ZC:
-                case QOP_SEL_X_0_NS:
-                case QOP_SEL_X_0_NC:
-                case QOP_SEL_X_0_CS:
-                case QOP_SEL_X_0_CC:
-                        queue(c, qpu_a_MOV(dst, src[0]) | unpack);
-                        set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
-                                          QPU_COND_ZS);
-
-                        queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
-                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
-                                              1) + QPU_COND_ZS);
-                        break;
-
-                case QOP_SEL_X_Y_ZS:
-                case QOP_SEL_X_Y_ZC:
-                case QOP_SEL_X_Y_NS:
-                case QOP_SEL_X_Y_NC:
-                case QOP_SEL_X_Y_CS:
-                case QOP_SEL_X_Y_CC:
-                        queue(c, qpu_a_MOV(dst, src[0]));
-                        if (qinst->src[0].pack)
-                                *(last_inst(c)) |= unpack;
-                        set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
-                                          QPU_COND_ZS);
-
-                        queue(c, qpu_a_MOV(dst, src[1]));
-                        if (qinst->src[1].pack)
-                                *(last_inst(c)) |= unpack;
-                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
-                                              1) + QPU_COND_ZS);
-
-                        break;
+                bool handled_qinst_cond = true;
 
+                switch (qinst->op) {
                 case QOP_RCP:
                 case QOP_RSQ:
                 case QOP_EXP2:
@@ -497,16 +470,22 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                                 queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                     dst,
                                                     src[0], src[1]) | unpack);
+                                set_last_cond_mul(c, qinst->cond);
                         } else {
                                 queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                     dst,
                                                     src[0], src[1]) | unpack);
+                                set_last_cond_add(c, qinst->cond);
                         }
+                        handled_qinst_cond = true;
                         set_last_dst_pack(c, qinst);
 
                         break;
                 }
 
+                assert(qinst->cond == QPU_COND_ALWAYS ||
+                       handled_qinst_cond);
+
                 if (qinst->sf) {
                         assert(!qir_is_multi_instruction(qinst));
                         *last_inst(c) |= QPU_SF;
diff --git a/src/gallium/drivers/vc4/vc4_resource.c b/src/gallium/drivers/vc4/vc4_resource.c
index 9e6678a0625..036da329987 100644
--- a/src/gallium/drivers/vc4/vc4_resource.c
+++ b/src/gallium/drivers/vc4/vc4_resource.c
@@ -921,7 +921,7 @@ vc4_get_shadow_index_buffer(struct pipe_context *pctx,
 
         void *data;
         struct pipe_resource *shadow_rsc = NULL;
-        u_upload_alloc(vc4->uploader, 0, count * 2,
+        u_upload_alloc(vc4->uploader, 0, count * 2, 4,
                        shadow_offset, &shadow_rsc, &data);
         uint16_t *dst = data;
 
diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c
index 8ddf0865d21..0e289432bbe 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -100,6 +100,7 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
         case PIPE_CAP_TWO_SIDED_STENCIL:
         case PIPE_CAP_USER_INDEX_BUFFERS:
         case PIPE_CAP_TEXTURE_MULTISAMPLE:
+        case PIPE_CAP_TEXTURE_SWIZZLE:
                 return 1;
 
                 /* lying for GL 2.0 */
@@ -128,7 +129,6 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
         case PIPE_CAP_TEXTURE_BUFFER_OBJECTS:
         case PIPE_CAP_CUBE_MAP_ARRAY:
         case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
-        case PIPE_CAP_TEXTURE_SWIZZLE:
         case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR:
         case PIPE_CAP_MIXED_COLORBUFFER_FORMATS:
         case PIPE_CAP_SEAMLESS_CUBE_MAP:
@@ -171,6 +171,8 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
         case PIPE_CAP_MAX_TEXEL_OFFSET:
         case PIPE_CAP_MAX_VERTEX_STREAMS:
         case PIPE_CAP_DRAW_INDIRECT:
+        case PIPE_CAP_MULTI_DRAW_INDIRECT:
+        case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
         case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
         case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
         case PIPE_CAP_SAMPLER_VIEW_TARGET:
@@ -180,15 +182,20 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
         case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
         case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
         case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
-	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
-	case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
-	case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
-	case PIPE_CAP_DEPTH_BOUNDS_TEST:
-	case PIPE_CAP_TGSI_TXQS:
-	case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
-	case PIPE_CAP_SHAREABLE_SHADERS:
-	case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
-	case PIPE_CAP_CLEAR_TEXTURE:
+        case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+        case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+        case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
+        case PIPE_CAP_DEPTH_BOUNDS_TEST:
+        case PIPE_CAP_TGSI_TXQS:
+        case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+        case PIPE_CAP_SHAREABLE_SHADERS:
+        case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+        case PIPE_CAP_CLEAR_TEXTURE:
+        case PIPE_CAP_DRAW_PARAMETERS:
+        case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+        case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+        case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+        case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
                 return 0;
 
                 /* Stream output. */
@@ -345,6 +352,8 @@ vc4_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
                 return PIPE_SHADER_IR_TGSI;
 	case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
 		return 32;
+        case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
+                return 0;
         default:
                 fprintf(stderr, "unknown shader param %d\n", param);
                 return 0;
diff --git a/src/gallium/drivers/virgl/virgl_context.c b/src/gallium/drivers/virgl/virgl_context.c
index 527f7637cb6..c322503d816 100644
--- a/src/gallium/drivers/virgl/virgl_context.c
+++ b/src/gallium/drivers/virgl/virgl_context.c
@@ -605,7 +605,7 @@ static void virgl_draw_vbo(struct pipe_context *ctx,
            ib.offset = vctx->index_buffer.offset + info.start * ib.index_size;
 
            if (ib.user_buffer) {
-                   u_upload_data(vctx->uploader, 0, info.count * ib.index_size,
+                   u_upload_data(vctx->uploader, 0, info.count * ib.index_size, 256,
                                  ib.user_buffer, &ib.offset, &ib.buffer);
                    ib.user_buffer = NULL;
            }
@@ -948,8 +948,8 @@ struct pipe_context *virgl_context_create(struct pipe_screen *pscreen,
                     16, UTIL_SLAB_SINGLETHREADED);
 
    vctx->primconvert = util_primconvert_create(&vctx->base, rs->caps.caps.v1.prim_mask);
-   vctx->uploader = u_upload_create(&vctx->base, 1024 * 1024, 256,
-                                     PIPE_BIND_INDEX_BUFFER);
+   vctx->uploader = u_upload_create(&vctx->base, 1024 * 1024,
+                                     PIPE_BIND_INDEX_BUFFER, PIPE_USAGE_STREAM);
    if (!vctx->uploader)
            goto fail;
 
diff --git a/src/gallium/drivers/virgl/virgl_screen.c b/src/gallium/drivers/virgl/virgl_screen.c
index 26a4f7736e3..e8d82b37c0f 100644
--- a/src/gallium/drivers/virgl/virgl_screen.c
+++ b/src/gallium/drivers/virgl/virgl_screen.c
@@ -201,6 +201,8 @@ virgl_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
    case PIPE_CAP_MAX_VERTEX_STREAMS:
    case PIPE_CAP_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
    case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
    case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
    case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE:
@@ -219,6 +221,11 @@ virgl_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_CLEAR_TEXTURE:
+   case PIPE_CAP_DRAW_PARAMETERS:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+   case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
       return 0;
    case PIPE_CAP_VENDOR_ID:
       return 0x1af4;