diff options
Diffstat (limited to 'src/gallium/drivers/vc4')
43 files changed, 2324 insertions, 1528 deletions
diff --git a/src/gallium/drivers/vc4/Makefile.am b/src/gallium/drivers/vc4/Makefile.am index 3f62ce21a9f..f4a57ba3404 100644 --- a/src/gallium/drivers/vc4/Makefile.am +++ b/src/gallium/drivers/vc4/Makefile.am @@ -19,8 +19,6 @@ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS # IN THE SOFTWARE. -AUTOMAKE_OPTIONS = subdir-objects - include Makefile.sources include $(top_srcdir)/src/gallium/Automake.inc @@ -30,10 +28,10 @@ SIM_LDFLAGS = -lsimpenrose endif AM_CFLAGS = \ + -I$(top_builddir)/src/glsl/nir \ $(LIBDRM_CFLAGS) \ $(GALLIUM_DRIVER_CFLAGS) \ $(SIM_CFLAGS) \ - -I$(top_srcdir)/src/mesa/ \ $() noinst_LTLIBRARIES = libvc4.la diff --git a/src/gallium/drivers/vc4/Makefile.sources b/src/gallium/drivers/vc4/Makefile.sources index 1eb029e67e7..6fb40c20562 100644 --- a/src/gallium/drivers/vc4/Makefile.sources +++ b/src/gallium/drivers/vc4/Makefile.sources @@ -19,6 +19,8 @@ C_SOURCES := \ vc4_fence.c \ vc4_formats.c \ vc4_job.c \ + vc4_nir_lower_blend.c \ + vc4_nir_lower_io.c \ vc4_opt_algebraic.c \ vc4_opt_constant_folding.c \ vc4_opt_copy_propagation.c \ @@ -49,4 +51,5 @@ C_SOURCES := \ vc4_state.c \ vc4_tiling.c \ vc4_tiling.h \ + vc4_uniforms.c \ $() diff --git a/src/gallium/drivers/vc4/kernel/vc4_drv.h b/src/gallium/drivers/vc4/kernel/vc4_drv.h index 1fd8aa9fb28..ffc973735ae 100644 --- a/src/gallium/drivers/vc4/kernel/vc4_drv.h +++ b/src/gallium/drivers/vc4/kernel/vc4_drv.h @@ -26,17 +26,6 @@ #include "vc4_simulator_validate.h" -enum vc4_bo_mode { - VC4_MODE_UNDECIDED, - VC4_MODE_RENDER, - VC4_MODE_SHADER, -}; - -struct vc4_bo_exec_state { - struct drm_gem_cma_object *bo; - enum vc4_bo_mode mode; -}; - struct vc4_exec_info { /* Sequence number for this bin/render job. */ uint64_t seqno; @@ -47,7 +36,7 @@ struct vc4_exec_info { /* This is the array of BOs that were looked up at the start of exec. * Command validation will use indices into this array. 
*/ - struct vc4_bo_exec_state *bo; + struct drm_gem_cma_object **bo; uint32_t bo_count; /* List of other BOs used in the job that need to be released @@ -72,7 +61,6 @@ struct vc4_exec_info { * command lists. */ struct vc4_shader_state { - uint8_t packet; uint32_t addr; /* Maximum vertex index referenced by any primitive using this * shader state. @@ -88,6 +76,7 @@ struct vc4_exec_info { bool found_tile_binning_mode_config_packet; bool found_start_tile_binning_packet; bool found_increment_semaphore_packet; + bool found_flush; uint8_t bin_tiles_x, bin_tiles_y; struct drm_gem_cma_object *tile_bo; uint32_t tile_alloc_offset; @@ -99,6 +88,9 @@ struct vc4_exec_info { uint32_t ct0ca, ct0ea; uint32_t ct1ca, ct1ea; + /* Pointer to the unvalidated bin CL (if present). */ + void *bin_u; + /* Pointers to the shader recs. These paddr gets incremented as CL * packets are relocated in validate_gl_shader_state, and the vaddrs * (u and v) get incremented and size decremented as the shader recs @@ -168,10 +160,8 @@ vc4_validate_shader_recs(struct drm_device *dev, struct vc4_exec_info *exec); struct vc4_validated_shader_info * vc4_validate_shader(struct drm_gem_cma_object *shader_obj); -bool vc4_use_bo(struct vc4_exec_info *exec, - uint32_t hindex, - enum vc4_bo_mode mode, - struct drm_gem_cma_object **obj); +struct drm_gem_cma_object *vc4_use_bo(struct vc4_exec_info *exec, + uint32_t hindex); int vc4_get_rcl(struct drm_device *dev, struct vc4_exec_info *exec); diff --git a/src/gallium/drivers/vc4/kernel/vc4_gem.c b/src/gallium/drivers/vc4/kernel/vc4_gem.c index e4b7fea5968..93f9ec7ed9b 100644 --- a/src/gallium/drivers/vc4/kernel/vc4_gem.c +++ b/src/gallium/drivers/vc4/kernel/vc4_gem.c @@ -112,6 +112,8 @@ vc4_get_bcl(struct drm_device *dev, struct vc4_exec_info *exec) exec->ct0ca = exec->exec_bo->paddr + bin_offset; + exec->bin_u = bin; + exec->shader_rec_v = exec->exec_bo->vaddr + shader_rec_offset; exec->shader_rec_p = exec->exec_bo->paddr + shader_rec_offset; exec->shader_rec_size 
= args->shader_rec_size; diff --git a/src/gallium/drivers/vc4/kernel/vc4_packet.h b/src/gallium/drivers/vc4/kernel/vc4_packet.h index 88cfc0fa9f0..771e2b78761 100644 --- a/src/gallium/drivers/vc4/kernel/vc4_packet.h +++ b/src/gallium/drivers/vc4/kernel/vc4_packet.h @@ -88,16 +88,22 @@ enum vc4_packet { #define VC4_PACKET_START_TILE_BINNING_SIZE 1 #define VC4_PACKET_INCREMENT_SEMAPHORE_SIZE 1 #define VC4_PACKET_WAIT_ON_SEMAPHORE_SIZE 1 +#define VC4_PACKET_BRANCH_SIZE 5 #define VC4_PACKET_BRANCH_TO_SUB_LIST_SIZE 5 #define VC4_PACKET_STORE_MS_TILE_BUFFER_SIZE 1 #define VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF_SIZE 1 +#define VC4_PACKET_STORE_FULL_RES_TILE_BUFFER_SIZE 5 +#define VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER_SIZE 5 #define VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE 7 #define VC4_PACKET_LOAD_TILE_BUFFER_GENERAL_SIZE 7 #define VC4_PACKET_GL_INDEXED_PRIMITIVE_SIZE 14 #define VC4_PACKET_GL_ARRAY_PRIMITIVE_SIZE 10 +#define VC4_PACKET_COMPRESSED_PRIMITIVE_SIZE 1 +#define VC4_PACKET_CLIPPED_COMPRESSED_PRIMITIVE_SIZE 1 #define VC4_PACKET_PRIMITIVE_LIST_FORMAT_SIZE 2 #define VC4_PACKET_GL_SHADER_STATE_SIZE 5 #define VC4_PACKET_NV_SHADER_STATE_SIZE 5 +#define VC4_PACKET_VG_SHADER_STATE_SIZE 5 #define VC4_PACKET_CONFIGURATION_BITS_SIZE 4 #define VC4_PACKET_FLAT_SHADE_FLAGS_SIZE 5 #define VC4_PACKET_POINT_SIZE_SIZE 5 @@ -106,6 +112,7 @@ enum vc4_packet { #define VC4_PACKET_DEPTH_OFFSET_SIZE 5 #define VC4_PACKET_CLIP_WINDOW_SIZE 9 #define VC4_PACKET_VIEWPORT_OFFSET_SIZE 5 +#define VC4_PACKET_Z_CLIPPING_SIZE 9 #define VC4_PACKET_CLIPPER_XY_SCALING_SIZE 9 #define VC4_PACKET_CLIPPER_Z_SCALING_SIZE 9 #define VC4_PACKET_TILE_BINNING_MODE_CONFIG_SIZE 16 @@ -136,6 +143,16 @@ enum vc4_packet { /** @{ * + * low bits of VC4_PACKET_STORE_FULL_RES_TILE_BUFFER and + * VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER. 
+ */ +#define VC4_LOADSTORE_FULL_RES_EOF (1 << 3) +#define VC4_LOADSTORE_FULL_RES_DISABLE_CLEAR_ALL (1 << 2) +#define VC4_LOADSTORE_FULL_RES_DISABLE_ZS (1 << 1) +#define VC4_LOADSTORE_FULL_RES_DISABLE_COLOR (1 << 0) + +/** @{ + * * byte 2 of VC4_PACKET_STORE_TILE_BUFFER_GENERAL and * VC4_PACKET_LOAD_TILE_BUFFER_GENERAL (low bits of the address) */ diff --git a/src/gallium/drivers/vc4/kernel/vc4_render_cl.c b/src/gallium/drivers/vc4/kernel/vc4_render_cl.c index e2d907ad91f..b827eb7e9e1 100644 --- a/src/gallium/drivers/vc4/kernel/vc4_render_cl.c +++ b/src/gallium/drivers/vc4/kernel/vc4_render_cl.c @@ -100,7 +100,8 @@ static void emit_tile(struct vc4_exec_info *exec, struct vc4_rcl_setup *setup, uint8_t x, uint8_t y, bool first, bool last) { - bool has_bin = exec->args->bin_cl_size != 0; + struct drm_vc4_submit_cl *args = exec->args; + bool has_bin = args->bin_cl_size != 0; /* Note that the load doesn't actually occur until the * tile coords packet is processed, and only one load @@ -108,10 +109,9 @@ static void emit_tile(struct vc4_exec_info *exec, */ if (setup->color_read) { rcl_u8(setup, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL); - rcl_u16(setup, exec->args->color_read.bits); + rcl_u16(setup, args->color_read.bits); rcl_u32(setup, - setup->color_read->paddr + - exec->args->color_read.offset); + setup->color_read->paddr + args->color_read.offset); } if (setup->zs_read) { @@ -122,9 +122,8 @@ static void emit_tile(struct vc4_exec_info *exec, } rcl_u8(setup, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL); - rcl_u16(setup, exec->args->zs_read.bits); - rcl_u32(setup, - setup->zs_read->paddr + exec->args->zs_read.offset); + rcl_u16(setup, args->zs_read.bits); + rcl_u32(setup, setup->zs_read->paddr + args->zs_read.offset); } /* Clipping depends on tile coordinates having been @@ -147,11 +146,11 @@ static void emit_tile(struct vc4_exec_info *exec, if (setup->zs_write) { rcl_u8(setup, VC4_PACKET_STORE_TILE_BUFFER_GENERAL); - rcl_u16(setup, exec->args->zs_write.bits | + rcl_u16(setup, 
args->zs_write.bits | (setup->color_ms_write ? VC4_STORE_TILE_BUFFER_DISABLE_COLOR_CLEAR : 0)); rcl_u32(setup, - (setup->zs_write->paddr + exec->args->zs_write.offset) | + (setup->zs_write->paddr + args->zs_write.offset) | ((last && !setup->color_ms_write) ? VC4_LOADSTORE_TILE_BUFFER_EOF : 0)); } @@ -172,11 +171,12 @@ static void emit_tile(struct vc4_exec_info *exec, static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec, struct vc4_rcl_setup *setup) { - bool has_bin = exec->args->bin_cl_size != 0; - uint8_t min_x_tile = exec->args->min_x_tile; - uint8_t min_y_tile = exec->args->min_y_tile; - uint8_t max_x_tile = exec->args->max_x_tile; - uint8_t max_y_tile = exec->args->max_y_tile; + struct drm_vc4_submit_cl *args = exec->args; + bool has_bin = args->bin_cl_size != 0; + uint8_t min_x_tile = args->min_x_tile; + uint8_t min_y_tile = args->min_y_tile; + uint8_t max_x_tile = args->max_x_tile; + uint8_t max_y_tile = args->max_y_tile; uint8_t xtiles = max_x_tile - min_x_tile + 1; uint8_t ytiles = max_y_tile - min_y_tile + 1; uint8_t x, y; @@ -185,7 +185,7 @@ static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec, size = VC4_PACKET_TILE_RENDERING_MODE_CONFIG_SIZE; loop_body_size = VC4_PACKET_TILE_COORDINATES_SIZE; - if (exec->args->flags & VC4_SUBMIT_CL_USE_CLEAR_COLOR) { + if (args->flags & VC4_SUBMIT_CL_USE_CLEAR_COLOR) { size += VC4_PACKET_CLEAR_COLORS_SIZE + VC4_PACKET_TILE_COORDINATES_SIZE + VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE; @@ -208,7 +208,7 @@ static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec, } if (setup->zs_write) - loop_body_size += VC4_PACKET_LOAD_TILE_BUFFER_GENERAL_SIZE; + loop_body_size += VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE; if (setup->color_ms_write) { if (setup->zs_write) loop_body_size += VC4_PACKET_TILE_COORDINATES_SIZE; @@ -226,23 +226,23 @@ static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec, rcl_u32(setup, (setup->color_ms_write 
? (setup->color_ms_write->paddr + - exec->args->color_ms_write.offset) : + args->color_ms_write.offset) : 0)); - rcl_u16(setup, exec->args->width); - rcl_u16(setup, exec->args->height); - rcl_u16(setup, exec->args->color_ms_write.bits); + rcl_u16(setup, args->width); + rcl_u16(setup, args->height); + rcl_u16(setup, args->color_ms_write.bits); /* The tile buffer gets cleared when the previous tile is stored. If * the clear values changed between frames, then the tile buffer has * stale clear values in it, so we have to do a store in None mode (no * writes) so that we trigger the tile buffer clear. */ - if (exec->args->flags & VC4_SUBMIT_CL_USE_CLEAR_COLOR) { + if (args->flags & VC4_SUBMIT_CL_USE_CLEAR_COLOR) { rcl_u8(setup, VC4_PACKET_CLEAR_COLORS); - rcl_u32(setup, exec->args->clear_color[0]); - rcl_u32(setup, exec->args->clear_color[1]); - rcl_u32(setup, exec->args->clear_z); - rcl_u8(setup, exec->args->clear_s); + rcl_u32(setup, args->clear_color[0]); + rcl_u32(setup, args->clear_color[1]); + rcl_u32(setup, args->clear_z); + rcl_u8(setup, args->clear_s); vc4_tile_coordinates(setup, 0, 0); @@ -286,7 +286,8 @@ static int vc4_rcl_surface_setup(struct vc4_exec_info *exec, if (surf->hindex == ~0) return 0; - if (!vc4_use_bo(exec, surf->hindex, VC4_MODE_RENDER, obj)) + *obj = vc4_use_bo(exec, surf->hindex); + if (!*obj) return -EINVAL; if (surf->bits & ~(VC4_LOADSTORE_TILE_BUFFER_TILING_MASK | @@ -365,7 +366,8 @@ vc4_rcl_ms_surface_setup(struct vc4_exec_info *exec, if (surf->hindex == ~0) return 0; - if (!vc4_use_bo(exec, surf->hindex, VC4_MODE_RENDER, obj)) + *obj = vc4_use_bo(exec, surf->hindex); + if (!*obj) return -EINVAL; if (tiling > VC4_TILING_FORMAT_LT) { diff --git a/src/gallium/drivers/vc4/kernel/vc4_validate.c b/src/gallium/drivers/vc4/kernel/vc4_validate.c index a0b67a7e50b..b248831113c 100644 --- a/src/gallium/drivers/vc4/kernel/vc4_validate.c +++ b/src/gallium/drivers/vc4/kernel/vc4_validate.c @@ -94,42 +94,42 @@ size_is_lt(uint32_t width, uint32_t 
height, int cpp) height <= 4 * utile_height(cpp)); } -bool -vc4_use_bo(struct vc4_exec_info *exec, - uint32_t hindex, - enum vc4_bo_mode mode, - struct drm_gem_cma_object **obj) +struct drm_gem_cma_object * +vc4_use_bo(struct vc4_exec_info *exec, uint32_t hindex) { - *obj = NULL; + struct drm_gem_cma_object *obj; + struct drm_vc4_bo *bo; if (hindex >= exec->bo_count) { DRM_ERROR("BO index %d greater than BO count %d\n", hindex, exec->bo_count); - return false; + return NULL; } + obj = exec->bo[hindex]; + bo = to_vc4_bo(&obj->base); - if (exec->bo[hindex].mode != mode) { - if (exec->bo[hindex].mode == VC4_MODE_UNDECIDED) { - exec->bo[hindex].mode = mode; - } else { - DRM_ERROR("BO index %d reused with mode %d vs %d\n", - hindex, exec->bo[hindex].mode, mode); - return false; - } + if (bo->validated_shader) { + DRM_ERROR("Trying to use shader BO as something other than " + "a shader\n"); + return NULL; } - *obj = exec->bo[hindex].bo; - return true; + return obj; +} + +static struct drm_gem_cma_object * +vc4_use_handle(struct vc4_exec_info *exec, uint32_t gem_handles_packet_index) +{ + return vc4_use_bo(exec, exec->bo_index[gem_handles_packet_index]); } static bool -vc4_use_handle(struct vc4_exec_info *exec, - uint32_t gem_handles_packet_index, - enum vc4_bo_mode mode, - struct drm_gem_cma_object **obj) +validate_bin_pos(struct vc4_exec_info *exec, void *untrusted, uint32_t pos) { - return vc4_use_bo(exec, exec->bo_index[gem_handles_packet_index], - mode, obj); + /* Note that the untrusted pointer passed to these functions is + * incremented past the packet byte. 
+ */ + return (untrusted - 1 == exec->bin_u + pos); } static uint32_t @@ -201,14 +201,15 @@ vc4_check_tex_size(struct vc4_exec_info *exec, struct drm_gem_cma_object *fbo, return true; } + static int -validate_flush_all(VALIDATE_ARGS) +validate_flush(VALIDATE_ARGS) { - if (exec->found_increment_semaphore_packet) { - DRM_ERROR("VC4_PACKET_FLUSH_ALL after " - "VC4_PACKET_INCREMENT_SEMAPHORE\n"); + if (!validate_bin_pos(exec, untrusted, exec->args->bin_cl_size - 1)) { + DRM_ERROR("Bin CL must end with VC4_PACKET_FLUSH\n"); return -EINVAL; } + exec->found_flush = true; return 0; } @@ -233,17 +234,13 @@ validate_start_tile_binning(VALIDATE_ARGS) static int validate_increment_semaphore(VALIDATE_ARGS) { - if (exec->found_increment_semaphore_packet) { - DRM_ERROR("Duplicate VC4_PACKET_INCREMENT_SEMAPHORE\n"); + if (!validate_bin_pos(exec, untrusted, exec->args->bin_cl_size - 2)) { + DRM_ERROR("Bin CL must end with " + "VC4_PACKET_INCREMENT_SEMAPHORE\n"); return -EINVAL; } exec->found_increment_semaphore_packet = true; - /* Once we've found the semaphore increment, there should be one FLUSH - * then the end of the command list. The FLUSH actually triggers the - * increment, so we only need to make sure there - */ - return 0; } @@ -257,11 +254,6 @@ validate_indexed_prim_list(VALIDATE_ARGS) uint32_t index_size = (*(uint8_t *)(untrusted + 0) >> 4) ? 
2 : 1; struct vc4_shader_state *shader_state; - if (exec->found_increment_semaphore_packet) { - DRM_ERROR("Drawing after VC4_PACKET_INCREMENT_SEMAPHORE\n"); - return -EINVAL; - } - /* Check overflow condition */ if (exec->shader_state_count == 0) { DRM_ERROR("shader state must precede primitives\n"); @@ -272,7 +264,8 @@ validate_indexed_prim_list(VALIDATE_ARGS) if (max_index > shader_state->max_index) shader_state->max_index = max_index; - if (!vc4_use_handle(exec, 0, VC4_MODE_RENDER, &ib)) + ib = vc4_use_handle(exec, 0); + if (!ib) return -EINVAL; if (offset > ib->base.size || @@ -295,11 +288,6 @@ validate_gl_array_primitive(VALIDATE_ARGS) uint32_t max_index; struct vc4_shader_state *shader_state; - if (exec->found_increment_semaphore_packet) { - DRM_ERROR("Drawing after VC4_PACKET_INCREMENT_SEMAPHORE\n"); - return -EINVAL; - } - /* Check overflow condition */ if (exec->shader_state_count == 0) { DRM_ERROR("shader state must precede primitives\n"); @@ -329,7 +317,6 @@ validate_gl_shader_state(VALIDATE_ARGS) return -EINVAL; } - exec->shader_state[i].packet = VC4_PACKET_GL_SHADER_STATE; exec->shader_state[i].addr = *(uint32_t *)untrusted; exec->shader_state[i].max_index = 0; @@ -348,31 +335,6 @@ validate_gl_shader_state(VALIDATE_ARGS) } static int -validate_nv_shader_state(VALIDATE_ARGS) -{ - uint32_t i = exec->shader_state_count++; - - if (i >= exec->shader_state_size) { - DRM_ERROR("More requests for shader states than declared\n"); - return -EINVAL; - } - - exec->shader_state[i].packet = VC4_PACKET_NV_SHADER_STATE; - exec->shader_state[i].addr = *(uint32_t *)untrusted; - - if (exec->shader_state[i].addr & 15) { - DRM_ERROR("NV shader state address 0x%08x misaligned\n", - exec->shader_state[i].addr); - return -EINVAL; - } - - *(uint32_t *)validated = (exec->shader_state[i].addr + - exec->shader_rec_p); - - return 0; -} - -static int validate_tile_binning_config(VALIDATE_ARGS) { struct drm_device *dev = exec->exec_bo->base.dev; @@ -473,8 +435,8 @@ static const 
struct cmd_info { } cmd_info[] = { VC4_DEFINE_PACKET(VC4_PACKET_HALT, "halt", NULL), VC4_DEFINE_PACKET(VC4_PACKET_NOP, "nop", NULL), - VC4_DEFINE_PACKET(VC4_PACKET_FLUSH, "flush", NULL), - VC4_DEFINE_PACKET(VC4_PACKET_FLUSH_ALL, "flush all state", validate_flush_all), + VC4_DEFINE_PACKET(VC4_PACKET_FLUSH, "flush", validate_flush), + VC4_DEFINE_PACKET(VC4_PACKET_FLUSH_ALL, "flush all state", NULL), VC4_DEFINE_PACKET(VC4_PACKET_START_TILE_BINNING, "start tile binning", validate_start_tile_binning), VC4_DEFINE_PACKET(VC4_PACKET_INCREMENT_SEMAPHORE, "increment semaphore", validate_increment_semaphore), @@ -488,7 +450,7 @@ static const struct cmd_info { VC4_DEFINE_PACKET(VC4_PACKET_PRIMITIVE_LIST_FORMAT, "primitive list format", NULL), VC4_DEFINE_PACKET(VC4_PACKET_GL_SHADER_STATE, "GL Shader State", validate_gl_shader_state), - VC4_DEFINE_PACKET(VC4_PACKET_NV_SHADER_STATE, "NV Shader State", validate_nv_shader_state), + /* We don't support validating NV shader states. */ VC4_DEFINE_PACKET(VC4_PACKET_CONFIGURATION_BITS, "configuration bits", NULL), VC4_DEFINE_PACKET(VC4_PACKET_FLAT_SHADE_FLAGS, "flat shade flags", NULL), @@ -525,7 +487,7 @@ vc4_validate_bin_cl(struct drm_device *dev, u8 cmd = *(uint8_t *)src_pkt; const struct cmd_info *info; - if (cmd > ARRAY_SIZE(cmd_info)) { + if (cmd >= ARRAY_SIZE(cmd_info)) { DRM_ERROR("0x%08x: packet %d out of bounds\n", src_offset, cmd); return -EINVAL; @@ -580,8 +542,16 @@ vc4_validate_bin_cl(struct drm_device *dev, return -EINVAL; } - if (!exec->found_increment_semaphore_packet) { - DRM_ERROR("Bin CL missing VC4_PACKET_INCREMENT_SEMAPHORE\n"); + /* The bin CL must be ended with INCREMENT_SEMAPHORE and FLUSH. The + * semaphore is used to trigger the render CL to start up, and the + * FLUSH is what caps the bin lists with + * VC4_PACKET_RETURN_FROM_SUB_LIST (so they jump back to the main + * render CL when they get called to) and actually triggers the queued + * semaphore increment. 
+ */ + if (!exec->found_increment_semaphore_packet || !exec->found_flush) { + DRM_ERROR("Bin CL missing VC4_PACKET_INCREMENT_SEMAPHORE + " + "VC4_PACKET_FLUSH\n"); return -EINVAL; } @@ -612,18 +582,19 @@ reloc_tex(struct vc4_exec_info *exec, uint32_t cube_map_stride = 0; enum vc4_texture_data_type type; - if (!vc4_use_bo(exec, texture_handle_index, VC4_MODE_RENDER, &tex)) + tex = vc4_use_bo(exec, texture_handle_index); + if (!tex) return false; if (sample->is_direct) { uint32_t remaining_size = tex->base.size - p0; if (p0 > tex->base.size - 4) { DRM_ERROR("UBO offset greater than UBO size\n"); - return false; + goto fail; } if (p1 > remaining_size - 4) { DRM_ERROR("UBO clamp would allow reads outside of UBO\n"); - return false; + goto fail; } *validated_p0 = tex->paddr + p0; return true; @@ -642,14 +613,14 @@ reloc_tex(struct vc4_exec_info *exec, VC4_TEX_P2_PTYPE_CUBE_MAP_STRIDE) { if (cube_map_stride) { DRM_ERROR("Cube map stride set twice\n"); - return false; + goto fail; } cube_map_stride = p3 & VC4_TEX_P2_CMST_MASK; } if (!cube_map_stride) { DRM_ERROR("Cube map stride not set\n"); - return false; + goto fail; } } @@ -683,7 +654,7 @@ reloc_tex(struct vc4_exec_info *exec, case VC4_TEXTURE_TYPE_YUV422R: default: DRM_ERROR("Texture format %d unsupported\n", type); - return false; + goto fail; } utile_w = utile_width(cpp); utile_h = utile_height(cpp); @@ -699,7 +670,7 @@ reloc_tex(struct vc4_exec_info *exec, if (!vc4_check_tex_size(exec, tex, offset + cube_map_stride * 5, tiling_format, width, height, cpp)) { - return false; + goto fail; } /* The mipmap levels are stored before the base of the texture. 
Make @@ -740,7 +711,7 @@ reloc_tex(struct vc4_exec_info *exec, i, level_width, level_height, aligned_width, aligned_height, level_size, offset); - return false; + goto fail; } offset -= level_size; @@ -749,54 +720,37 @@ reloc_tex(struct vc4_exec_info *exec, *validated_p0 = tex->paddr + p0; return true; + fail: + DRM_INFO("Texture p0 at %d: 0x%08x\n", sample->p_offset[0], p0); + DRM_INFO("Texture p1 at %d: 0x%08x\n", sample->p_offset[1], p1); + DRM_INFO("Texture p2 at %d: 0x%08x\n", sample->p_offset[2], p2); + DRM_INFO("Texture p3 at %d: 0x%08x\n", sample->p_offset[3], p3); + return false; } static int -validate_shader_rec(struct drm_device *dev, - struct vc4_exec_info *exec, - struct vc4_shader_state *state) +validate_gl_shader_rec(struct drm_device *dev, + struct vc4_exec_info *exec, + struct vc4_shader_state *state) { uint32_t *src_handles; void *pkt_u, *pkt_v; - enum shader_rec_reloc_type { - RELOC_CODE, - RELOC_VBO, - }; - struct shader_rec_reloc { - enum shader_rec_reloc_type type; - uint32_t offset; - }; - static const struct shader_rec_reloc gl_relocs[] = { - { RELOC_CODE, 4 }, /* fs */ - { RELOC_CODE, 16 }, /* vs */ - { RELOC_CODE, 28 }, /* cs */ + static const uint32_t shader_reloc_offsets[] = { + 4, /* fs */ + 16, /* vs */ + 28, /* cs */ }; - static const struct shader_rec_reloc nv_relocs[] = { - { RELOC_CODE, 4 }, /* fs */ - { RELOC_VBO, 12 } - }; - const struct shader_rec_reloc *relocs; - struct drm_gem_cma_object *bo[ARRAY_SIZE(gl_relocs) + 8]; - uint32_t nr_attributes = 0, nr_fixed_relocs, nr_relocs, packet_size; + uint32_t shader_reloc_count = ARRAY_SIZE(shader_reloc_offsets); + struct drm_gem_cma_object *bo[shader_reloc_count + 8]; + uint32_t nr_attributes, nr_relocs, packet_size; int i; - struct vc4_validated_shader_info *validated_shader = NULL; - - if (state->packet == VC4_PACKET_NV_SHADER_STATE) { - relocs = nv_relocs; - nr_fixed_relocs = ARRAY_SIZE(nv_relocs); - packet_size = 16; - } else { - relocs = gl_relocs; - nr_fixed_relocs = 
ARRAY_SIZE(gl_relocs); - - nr_attributes = state->addr & 0x7; - if (nr_attributes == 0) - nr_attributes = 8; - packet_size = gl_shader_rec_size(state->addr); - } - nr_relocs = nr_fixed_relocs + nr_attributes; + nr_attributes = state->addr & 0x7; + if (nr_attributes == 0) + nr_attributes = 8; + packet_size = gl_shader_rec_size(state->addr); + nr_relocs = ARRAY_SIZE(shader_reloc_offsets) + nr_attributes; if (nr_relocs * 4 > exec->shader_rec_size) { DRM_ERROR("overflowed shader recs reading %d handles " "from %d bytes left\n", @@ -826,21 +780,30 @@ validate_shader_rec(struct drm_device *dev, exec->shader_rec_v += roundup(packet_size, 16); exec->shader_rec_size -= packet_size; - for (i = 0; i < nr_relocs; i++) { - enum vc4_bo_mode mode; - - if (i < nr_fixed_relocs && relocs[i].type == RELOC_CODE) - mode = VC4_MODE_SHADER; - else - mode = VC4_MODE_RENDER; + if (!(*(uint16_t *)pkt_u & VC4_SHADER_FLAG_FS_SINGLE_THREAD)) { + DRM_ERROR("Multi-threaded fragment shaders not supported.\n"); + return -EINVAL; + } - if (!vc4_use_bo(exec, src_handles[i], mode, &bo[i])) { - return false; + for (i = 0; i < shader_reloc_count; i++) { + if (src_handles[i] > exec->bo_count) { + DRM_ERROR("Shader handle %d too big\n", src_handles[i]); + return -EINVAL; } + + bo[i] = exec->bo[src_handles[i]]; + if (!bo[i]) + return -EINVAL; + } + for (i = shader_reloc_count; i < nr_relocs; i++) { + bo[i] = vc4_use_bo(exec, src_handles[i]); + if (!bo[i]) + return -EINVAL; } - for (i = 0; i < nr_fixed_relocs; i++) { - uint32_t o = relocs[i].offset; + for (i = 0; i < shader_reloc_count; i++) { + struct vc4_validated_shader_info *validated_shader; + uint32_t o = shader_reloc_offsets[i]; uint32_t src_offset = *(uint32_t *)(pkt_u + o); uint32_t *texture_handles_u; void *uniform_data_u; @@ -848,58 +811,50 @@ validate_shader_rec(struct drm_device *dev, *(uint32_t *)(pkt_v + o) = bo[i]->paddr + src_offset; - switch (relocs[i].type) { - case RELOC_CODE: - if (src_offset != 0) { - DRM_ERROR("Shaders must be at 
offset 0 of " - "the BO.\n"); - goto fail; - } + if (src_offset != 0) { + DRM_ERROR("Shaders must be at offset 0 of " + "the BO.\n"); + return -EINVAL; + } - kfree(validated_shader); - validated_shader = vc4_validate_shader(bo[i]); - if (!validated_shader) - goto fail; + validated_shader = to_vc4_bo(&bo[i]->base)->validated_shader; + if (!validated_shader) + return -EINVAL; - if (validated_shader->uniforms_src_size > - exec->uniforms_size) { - DRM_ERROR("Uniforms src buffer overflow\n"); - goto fail; - } + if (validated_shader->uniforms_src_size > + exec->uniforms_size) { + DRM_ERROR("Uniforms src buffer overflow\n"); + return -EINVAL; + } - texture_handles_u = exec->uniforms_u; - uniform_data_u = (texture_handles_u + - validated_shader->num_texture_samples); - - memcpy(exec->uniforms_v, uniform_data_u, - validated_shader->uniforms_size); - - for (tex = 0; - tex < validated_shader->num_texture_samples; - tex++) { - if (!reloc_tex(exec, - uniform_data_u, - &validated_shader->texture_samples[tex], - texture_handles_u[tex])) { - goto fail; - } - } + texture_handles_u = exec->uniforms_u; + uniform_data_u = (texture_handles_u + + validated_shader->num_texture_samples); - *(uint32_t *)(pkt_v + o + 4) = exec->uniforms_p; + memcpy(exec->uniforms_v, uniform_data_u, + validated_shader->uniforms_size); - exec->uniforms_u += validated_shader->uniforms_src_size; - exec->uniforms_v += validated_shader->uniforms_size; - exec->uniforms_p += validated_shader->uniforms_size; + for (tex = 0; + tex < validated_shader->num_texture_samples; + tex++) { + if (!reloc_tex(exec, + uniform_data_u, + &validated_shader->texture_samples[tex], + texture_handles_u[tex])) { + return -EINVAL; + } + } - break; + *(uint32_t *)(pkt_v + o + 4) = exec->uniforms_p; - case RELOC_VBO: - break; - } + exec->uniforms_u += validated_shader->uniforms_src_size; + exec->uniforms_v += validated_shader->uniforms_size; + exec->uniforms_p += validated_shader->uniforms_size; } for (i = 0; i < nr_attributes; i++) { - 
struct drm_gem_cma_object *vbo = bo[nr_fixed_relocs + i]; + struct drm_gem_cma_object *vbo = + bo[ARRAY_SIZE(shader_reloc_offsets) + i]; uint32_t o = 36 + i * 8; uint32_t offset = *(uint32_t *)(pkt_u + o + 0); uint32_t attr_size = *(uint8_t *)(pkt_u + o + 4) + 1; @@ -929,13 +884,7 @@ validate_shader_rec(struct drm_device *dev, *(uint32_t *)(pkt_v + o) = vbo->paddr + offset; } - kfree(validated_shader); - return 0; - -fail: - kfree(validated_shader); - return -EINVAL; } int @@ -946,7 +895,7 @@ vc4_validate_shader_recs(struct drm_device *dev, int ret = 0; for (i = 0; i < exec->shader_state_count; i++) { - ret = validate_shader_rec(dev, exec, &exec->shader_state[i]); + ret = validate_gl_shader_rec(dev, exec, &exec->shader_state[i]); if (ret) return ret; } diff --git a/src/gallium/drivers/vc4/vc4_blit.c b/src/gallium/drivers/vc4/vc4_blit.c index d29e2c9c318..e52a1941730 100644 --- a/src/gallium/drivers/vc4/vc4_blit.c +++ b/src/gallium/drivers/vc4/vc4_blit.c @@ -94,7 +94,7 @@ vc4_render_blit(struct pipe_context *ctx, struct pipe_blit_info *info) struct vc4_context *vc4 = vc4_context(ctx); if (!util_blitter_is_blit_supported(vc4->blitter, info)) { - fprintf(stderr, "blit unsupported %s -> %s", + fprintf(stderr, "blit unsupported %s -> %s\n", util_format_short_name(info->src.resource->format), util_format_short_name(info->dst.resource->format)); return false; @@ -135,7 +135,7 @@ vc4_blit(struct pipe_context *pctx, const struct pipe_blit_info *blit_info) info.dst.resource->nr_samples <= 1 && !util_format_is_depth_or_stencil(info.src.resource->format) && !util_format_is_pure_integer(info.src.resource->format)) { - fprintf(stderr, "color resolve unimplemented"); + fprintf(stderr, "color resolve unimplemented\n"); return; } @@ -147,7 +147,7 @@ vc4_blit(struct pipe_context *pctx, const struct pipe_blit_info *blit_info) } if (info.mask & PIPE_MASK_S) { - fprintf(stderr, "cannot blit stencil, skipping"); + fprintf(stderr, "cannot blit stencil, skipping\n"); info.mask &= 
~PIPE_MASK_S; } diff --git a/src/gallium/drivers/vc4/vc4_bufmgr.c b/src/gallium/drivers/vc4/vc4_bufmgr.c index cbdb9e89cf6..f7b41f5816d 100644 --- a/src/gallium/drivers/vc4/vc4_bufmgr.c +++ b/src/gallium/drivers/vc4/vc4_bufmgr.c @@ -1,5 +1,5 @@ /* - * Copyright © 2014 Broadcom + * Copyright © 2014-2015 Broadcom * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -94,7 +94,7 @@ vc4_bo_from_cache(struct vc4_screen *screen, uint32_t size, const char *name) * allocate something new instead, since we assume that the * user will proceed to CPU map it and fill it with stuff. */ - if (!vc4_bo_wait(bo, 0)) { + if (!vc4_bo_wait(bo, 0, NULL)) { pipe_mutex_unlock(cache->lock); return NULL; } @@ -381,15 +381,57 @@ vc4_bo_get_dmabuf(struct vc4_bo *bo) } struct vc4_bo * -vc4_bo_alloc_mem(struct vc4_screen *screen, const void *data, uint32_t size, - const char *name) +vc4_bo_alloc_shader(struct vc4_screen *screen, const void *data, uint32_t size) { - void *map; struct vc4_bo *bo; + int ret; + + bo = CALLOC_STRUCT(vc4_bo); + if (!bo) + return NULL; + + pipe_reference_init(&bo->reference, 1); + bo->screen = screen; + bo->size = align(size, 4096); + bo->name = "code"; + bo->private = false; /* Make sure it doesn't go back to the cache. 
*/ + + if (!using_vc4_simulator) { + struct drm_vc4_create_shader_bo create = { + .size = size, + .data = (uintptr_t)data, + }; + + ret = drmIoctl(screen->fd, DRM_IOCTL_VC4_CREATE_SHADER_BO, + &create); + bo->handle = create.handle; + } else { + struct drm_mode_create_dumb create; + memset(&create, 0, sizeof(create)); + + create.width = 128; + create.bpp = 8; + create.height = (size + 127) / 128; + + ret = drmIoctl(screen->fd, DRM_IOCTL_MODE_CREATE_DUMB, &create); + bo->handle = create.handle; + assert(create.size >= size); + + vc4_bo_map(bo); + memcpy(bo->map, data, size); + } + if (ret != 0) { + fprintf(stderr, "create shader ioctl failure\n"); + abort(); + } + + screen->bo_count++; + screen->bo_size += bo->size; + if (dump_stats) { + fprintf(stderr, "Allocated shader %dkb:\n", size / 1024); + vc4_bo_dump_stats(screen); + } - bo = vc4_bo_alloc(screen, size, name); - map = vc4_bo_map(bo); - memcpy(map, data, size); return bo; } @@ -413,63 +455,91 @@ vc4_bo_flink(struct vc4_bo *bo, uint32_t *name) return true; } +static int vc4_wait_seqno_ioctl(int fd, uint64_t seqno, uint64_t timeout_ns) +{ + if (using_vc4_simulator) + return 0; + + struct drm_vc4_wait_seqno wait = { + .seqno = seqno, + .timeout_ns = timeout_ns, + }; + int ret = drmIoctl(fd, DRM_IOCTL_VC4_WAIT_SEQNO, &wait); + if (ret == -1) + return -errno; + else + return 0; + +} + bool -vc4_wait_seqno(struct vc4_screen *screen, uint64_t seqno, uint64_t timeout_ns) +vc4_wait_seqno(struct vc4_screen *screen, uint64_t seqno, uint64_t timeout_ns, + const char *reason) { if (screen->finished_seqno >= seqno) return true; - struct drm_vc4_wait_seqno wait; - memset(&wait, 0, sizeof(wait)); - wait.seqno = seqno; - wait.timeout_ns = timeout_ns; - - int ret; - if (!using_vc4_simulator) - ret = drmIoctl(screen->fd, DRM_IOCTL_VC4_WAIT_SEQNO, &wait); - else { - wait.seqno = screen->finished_seqno; - ret = 0; + if (unlikely(vc4_debug & VC4_DEBUG_PERF) && timeout_ns && reason) { + if (vc4_wait_seqno_ioctl(screen->fd, seqno, 0) 
== -ETIME) { + fprintf(stderr, "Blocking on seqno %lld for %s\n", + (long long)seqno, reason); + } } - if (ret == 0) { - screen->finished_seqno = wait.seqno; - return true; - } + int ret = vc4_wait_seqno_ioctl(screen->fd, seqno, timeout_ns); + if (ret) { + if (ret != -ETIME) { + fprintf(stderr, "wait failed: %d\n", ret); + abort(); + } - if (errno != ETIME) { - fprintf(stderr, "wait failed: %d\n", ret); - abort(); + return false; } - return false; + screen->finished_seqno = seqno; + return true; +} + +static int vc4_wait_bo_ioctl(int fd, uint32_t handle, uint64_t timeout_ns) +{ + if (using_vc4_simulator) + return 0; + + struct drm_vc4_wait_bo wait = { + .handle = handle, + .timeout_ns = timeout_ns, + }; + int ret = drmIoctl(fd, DRM_IOCTL_VC4_WAIT_BO, &wait); + if (ret == -1) + return -errno; + else + return 0; + } bool -vc4_bo_wait(struct vc4_bo *bo, uint64_t timeout_ns) +vc4_bo_wait(struct vc4_bo *bo, uint64_t timeout_ns, const char *reason) { struct vc4_screen *screen = bo->screen; - struct drm_vc4_wait_bo wait; - memset(&wait, 0, sizeof(wait)); - wait.handle = bo->handle; - wait.timeout_ns = timeout_ns; - - int ret; - if (!using_vc4_simulator) - ret = drmIoctl(screen->fd, DRM_IOCTL_VC4_WAIT_BO, &wait); - else - ret = 0; + if (unlikely(vc4_debug & VC4_DEBUG_PERF) && timeout_ns && reason) { + if (vc4_wait_bo_ioctl(screen->fd, bo->handle, 0) == -ETIME) { + fprintf(stderr, "Blocking on %s BO for %s\n", + bo->name, reason); + } + } - if (ret == 0) - return true; + int ret = vc4_wait_bo_ioctl(screen->fd, bo->handle, timeout_ns); + if (ret) { + if (ret != -ETIME) { + fprintf(stderr, "wait failed: %d\n", ret); + abort(); + } - if (errno != ETIME) { - fprintf(stderr, "wait failed: %d\n", ret); - abort(); + return false; } - return false; + return true; } void * @@ -515,7 +585,7 @@ vc4_bo_map(struct vc4_bo *bo) { void *map = vc4_bo_map_unsynchronized(bo); - bool ok = vc4_bo_wait(bo, PIPE_TIMEOUT_INFINITE); + bool ok = vc4_bo_wait(bo, PIPE_TIMEOUT_INFINITE, "bo map"); if 
(!ok) { fprintf(stderr, "BO wait for map failed\n"); abort(); diff --git a/src/gallium/drivers/vc4/vc4_bufmgr.h b/src/gallium/drivers/vc4/vc4_bufmgr.h index 7320695ca8e..b77506e242a 100644 --- a/src/gallium/drivers/vc4/vc4_bufmgr.h +++ b/src/gallium/drivers/vc4/vc4_bufmgr.h @@ -58,8 +58,8 @@ struct vc4_bo { struct vc4_bo *vc4_bo_alloc(struct vc4_screen *screen, uint32_t size, const char *name); -struct vc4_bo *vc4_bo_alloc_mem(struct vc4_screen *screen, const void *data, - uint32_t size, const char *name); +struct vc4_bo *vc4_bo_alloc_shader(struct vc4_screen *screen, const void *data, + uint32_t size); void vc4_bo_last_unreference(struct vc4_bo *bo); void vc4_bo_last_unreference_locked_timed(struct vc4_bo *bo, time_t time); struct vc4_bo *vc4_bo_open_name(struct vc4_screen *screen, uint32_t name, @@ -113,10 +113,11 @@ void * vc4_bo_map_unsynchronized(struct vc4_bo *bo); bool -vc4_bo_wait(struct vc4_bo *bo, uint64_t timeout_ns); +vc4_bo_wait(struct vc4_bo *bo, uint64_t timeout_ns, const char *reason); bool -vc4_wait_seqno(struct vc4_screen *screen, uint64_t seqno, uint64_t timeout_ns); +vc4_wait_seqno(struct vc4_screen *screen, uint64_t seqno, uint64_t timeout_ns, + const char *reason); void vc4_bufmgr_destroy(struct pipe_screen *pscreen); diff --git a/src/gallium/drivers/vc4/vc4_cl.c b/src/gallium/drivers/vc4/vc4_cl.c index 0700e885cbf..ced4f2dfa86 100644 --- a/src/gallium/drivers/vc4/vc4_cl.c +++ b/src/gallium/drivers/vc4/vc4_cl.c @@ -36,11 +36,12 @@ vc4_init_cl(struct vc4_context *vc4, struct vc4_cl *cl) void cl_ensure_space(struct vc4_cl *cl, uint32_t space) { - if ((cl->next - cl->base) + space <= cl->size) + uint32_t offset = cl_offset(cl); + + if (offset + space <= cl->size) return; uint32_t size = MAX2(cl->size + space, cl->size * 2); - uint32_t offset = cl->next -cl->base; cl->base = reralloc(ralloc_parent(cl->base), cl->base, uint8_t, size); cl->size = size; @@ -60,15 +61,20 @@ vc4_gem_hindex(struct vc4_context *vc4, struct vc4_bo *bo) uint32_t hindex; 
uint32_t *current_handles = vc4->bo_handles.base; - for (hindex = 0; - hindex < (vc4->bo_handles.next - vc4->bo_handles.base) / 4; - hindex++) { + for (hindex = 0; hindex < cl_offset(&vc4->bo_handles) / 4; hindex++) { if (current_handles[hindex] == bo->handle) return hindex; } - cl_u32(&vc4->bo_handles, bo->handle); - cl_ptr(&vc4->bo_pointers, vc4_bo_reference(bo)); + struct vc4_cl_out *out; + + out = cl_start(&vc4->bo_handles); + cl_u32(&out, bo->handle); + cl_end(&vc4->bo_handles, out); + + out = cl_start(&vc4->bo_pointers); + cl_ptr(&out, vc4_bo_reference(bo)); + cl_end(&vc4->bo_pointers, out); return hindex; } diff --git a/src/gallium/drivers/vc4/vc4_cl.h b/src/gallium/drivers/vc4/vc4_cl.h index 4a50e790942..bf4be0efc29 100644 --- a/src/gallium/drivers/vc4/vc4_cl.h +++ b/src/gallium/drivers/vc4/vc4_cl.h @@ -33,12 +33,20 @@ struct vc4_bo; +/** + * Undefined structure, used for typechecking that you're passing the pointers + * to these functions correctly. + */ +struct vc4_cl_out; + struct vc4_cl { void *base; - void *next; + struct vc4_cl_out *next; + struct vc4_cl_out *reloc_next; uint32_t size; - uint32_t reloc_next; +#ifdef DEBUG uint32_t reloc_count; +#endif }; void vc4_init_cl(struct vc4_context *vc4, struct vc4_cl *cl); @@ -49,135 +57,149 @@ uint32_t vc4_gem_hindex(struct vc4_context *vc4, struct vc4_bo *bo); struct PACKED unaligned_16 { uint16_t x; }; struct PACKED unaligned_32 { uint32_t x; }; -static inline void -put_unaligned_32(void *ptr, uint32_t val) +static inline uint32_t cl_offset(struct vc4_cl *cl) { - struct unaligned_32 *p = ptr; - p->x = val; + return (char *)cl->next - (char *)cl->base; } static inline void -put_unaligned_16(void *ptr, uint16_t val) +cl_advance(struct vc4_cl_out **cl, uint32_t n) { - struct unaligned_16 *p = ptr; - p->x = val; + (*cl) = (struct vc4_cl_out *)((char *)(*cl) + n); } -static inline void -cl_u8(struct vc4_cl *cl, uint8_t n) +static inline struct vc4_cl_out * +cl_start(struct vc4_cl *cl) { - assert((cl->next - 
cl->base) + 1 <= cl->size); - - *(uint8_t *)cl->next = n; - cl->next++; + return cl->next; } static inline void -cl_u16(struct vc4_cl *cl, uint16_t n) +cl_end(struct vc4_cl *cl, struct vc4_cl_out *next) { - assert((cl->next - cl->base) + 2 <= cl->size); + cl->next = next; + assert(cl_offset(cl) <= cl->size); +} - put_unaligned_16(cl->next, n); - cl->next += 2; + +static inline void +put_unaligned_32(struct vc4_cl_out *ptr, uint32_t val) +{ + struct unaligned_32 *p = (void *)ptr; + p->x = val; } static inline void -cl_u32(struct vc4_cl *cl, uint32_t n) +put_unaligned_16(struct vc4_cl_out *ptr, uint16_t val) { - assert((cl->next - cl->base) + 4 <= cl->size); + struct unaligned_16 *p = (void *)ptr; + p->x = val; +} - put_unaligned_32(cl->next, n); - cl->next += 4; +static inline void +cl_u8(struct vc4_cl_out **cl, uint8_t n) +{ + *(uint8_t *)(*cl) = n; + cl_advance(cl, 1); } static inline void -cl_aligned_u32(struct vc4_cl *cl, uint32_t n) +cl_u16(struct vc4_cl_out **cl, uint16_t n) { - assert((cl->next - cl->base) + 4 <= cl->size); + put_unaligned_16(*cl, n); + cl_advance(cl, 2); +} - *(uint32_t *)cl->next = n; - cl->next += 4; +static inline void +cl_u32(struct vc4_cl_out **cl, uint32_t n) +{ + put_unaligned_32(*cl, n); + cl_advance(cl, 4); } static inline void -cl_ptr(struct vc4_cl *cl, void *ptr) +cl_aligned_u32(struct vc4_cl_out **cl, uint32_t n) { - assert((cl->next - cl->base) + sizeof(void *) <= cl->size); + *(uint32_t *)(*cl) = n; + cl_advance(cl, 4); +} - *(void **)cl->next = ptr; - cl->next += sizeof(void *); +static inline void +cl_ptr(struct vc4_cl_out **cl, void *ptr) +{ + *(struct vc4_cl_out **)(*cl) = ptr; + cl_advance(cl, sizeof(void *)); } static inline void -cl_f(struct vc4_cl *cl, float f) +cl_f(struct vc4_cl_out **cl, float f) { cl_u32(cl, fui(f)); } static inline void -cl_aligned_f(struct vc4_cl *cl, float f) +cl_aligned_f(struct vc4_cl_out **cl, float f) { cl_aligned_u32(cl, fui(f)); } static inline void -cl_start_reloc(struct vc4_cl *cl, 
uint32_t n) +cl_start_reloc(struct vc4_cl *cl, struct vc4_cl_out **out, uint32_t n) { assert(n == 1 || n == 2); +#ifdef DEBUG assert(cl->reloc_count == 0); cl->reloc_count = n; +#endif - cl_u8(cl, VC4_PACKET_GEM_HANDLES); - cl->reloc_next = cl->next - cl->base; - cl_u32(cl, 0); /* Space where hindex will be written. */ - cl_u32(cl, 0); /* Space where hindex will be written. */ + cl_u8(out, VC4_PACKET_GEM_HANDLES); + cl->reloc_next = *out; + cl_u32(out, 0); /* Space where hindex will be written. */ + cl_u32(out, 0); /* Space where hindex will be written. */ } -static inline void +static inline struct vc4_cl_out * cl_start_shader_reloc(struct vc4_cl *cl, uint32_t n) { +#ifdef DEBUG assert(cl->reloc_count == 0); cl->reloc_count = n; - cl->reloc_next = cl->next - cl->base; +#endif + cl->reloc_next = cl->next; + + /* Reserve the space where hindex will be written. */ + cl_advance(&cl->next, n * 4); - /* Space where hindex will be written. */ - cl->next += n * 4; + return cl->next; } static inline void -cl_reloc_hindex(struct vc4_cl *cl, uint32_t hindex, uint32_t offset) +cl_reloc(struct vc4_context *vc4, struct vc4_cl *cl, struct vc4_cl_out **cl_out, + struct vc4_bo *bo, uint32_t offset) { - *(uint32_t *)(cl->base + cl->reloc_next) = hindex; - cl->reloc_next += 4; + *(uint32_t *)cl->reloc_next = vc4_gem_hindex(vc4, bo); + cl_advance(&cl->reloc_next, 4); +#ifdef DEBUG cl->reloc_count--; +#endif - cl_u32(cl, offset); + cl_u32(cl_out, offset); } static inline void -cl_aligned_reloc_hindex(struct vc4_cl *cl, uint32_t hindex, uint32_t offset) +cl_aligned_reloc(struct vc4_context *vc4, struct vc4_cl *cl, + struct vc4_cl_out **cl_out, + struct vc4_bo *bo, uint32_t offset) { - *(uint32_t *)(cl->base + cl->reloc_next) = hindex; - cl->reloc_next += 4; + *(uint32_t *)cl->reloc_next = vc4_gem_hindex(vc4, bo); + cl_advance(&cl->reloc_next, 4); +#ifdef DEBUG cl->reloc_count--; +#endif - cl_aligned_u32(cl, offset); -} - -static inline void -cl_reloc(struct vc4_context *vc4, struct 
vc4_cl *cl, - struct vc4_bo *bo, uint32_t offset) -{ - cl_reloc_hindex(cl, vc4_gem_hindex(vc4, bo), offset); -} - -static inline void -cl_aligned_reloc(struct vc4_context *vc4, struct vc4_cl *cl, - struct vc4_bo *bo, uint32_t offset) -{ - cl_aligned_reloc_hindex(cl, vc4_gem_hindex(vc4, bo), offset); + cl_aligned_u32(cl_out, offset); } void cl_ensure_space(struct vc4_cl *cl, uint32_t size); diff --git a/src/gallium/drivers/vc4/vc4_cl_dump.c b/src/gallium/drivers/vc4/vc4_cl_dump.c index 69055081daa..6d748010baf 100644 --- a/src/gallium/drivers/vc4/vc4_cl_dump.c +++ b/src/gallium/drivers/vc4/vc4_cl_dump.c @@ -34,7 +34,7 @@ dump_float(void *cl, uint32_t offset, uint32_t hw_offset) void *f = cl + offset; fprintf(stderr, "0x%08x 0x%08x: %f (0x%08x)\n", - offset, hw_offset, *(float *)f, *(uint32_t *)f); + offset, hw_offset, uif(*(uint32_t *)f), *(uint32_t *)f); } static void @@ -47,7 +47,33 @@ dump_VC4_PACKET_BRANCH_TO_SUB_LIST(void *cl, uint32_t offset, uint32_t hw_offset } static void -dump_VC4_PACKET_STORE_TILE_BUFFER_GENERAL(void *cl, uint32_t offset, uint32_t hw_offset) +dump_loadstore_full(void *cl, uint32_t offset, uint32_t hw_offset) +{ + uint32_t bits = *(uint32_t *)(cl + offset); + + fprintf(stderr, "0x%08x 0x%08x: addr 0x%08x%s%s%s%s\n", + offset, hw_offset, + bits & ~0xf, + (bits & VC4_LOADSTORE_FULL_RES_DISABLE_CLEAR_ALL) ? "" : " clear", + (bits & VC4_LOADSTORE_FULL_RES_DISABLE_ZS) ? "" : " zs", + (bits & VC4_LOADSTORE_FULL_RES_DISABLE_COLOR) ? "" : " color", + (bits & VC4_LOADSTORE_FULL_RES_EOF) ? 
" eof" : ""); +} + +static void +dump_VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER(void *cl, uint32_t offset, uint32_t hw_offset) +{ + dump_loadstore_full(cl, offset, hw_offset); +} + +static void +dump_VC4_PACKET_STORE_FULL_RES_TILE_BUFFER(void *cl, uint32_t offset, uint32_t hw_offset) +{ + dump_loadstore_full(cl, offset, hw_offset); +} + +static void +dump_loadstore_general(void *cl, uint32_t offset, uint32_t hw_offset) { uint8_t *bytes = cl + offset; uint32_t *addr = cl + offset + 2; @@ -125,6 +151,18 @@ dump_VC4_PACKET_STORE_TILE_BUFFER_GENERAL(void *cl, uint32_t offset, uint32_t hw } static void +dump_VC4_PACKET_STORE_TILE_BUFFER_GENERAL(void *cl, uint32_t offset, uint32_t hw_offset) +{ + dump_loadstore_general(cl, offset, hw_offset); +} + +static void +dump_VC4_PACKET_LOAD_TILE_BUFFER_GENERAL(void *cl, uint32_t offset, uint32_t hw_offset) +{ + dump_loadstore_general(cl, offset, hw_offset); +} + +static void dump_VC4_PACKET_FLAT_SHADE_FLAGS(void *cl, uint32_t offset, uint32_t hw_offset) { uint32_t *bits = cl + offset; @@ -291,63 +329,63 @@ dump_VC4_PACKET_GEM_HANDLES(void *cl, uint32_t offset, uint32_t hw_offset) offset, hw_offset, handles[0], handles[1]); } -#define PACKET_DUMP(name, size) [name] = { #name, size, dump_##name } -#define PACKET(name, size) [name] = { #name, size, NULL } +#define PACKET_DUMP(name) [name] = { #name, name ## _SIZE, dump_##name } +#define PACKET(name) [name] = { #name, name ## _SIZE, NULL } static const struct packet_info { const char *name; uint8_t size; void (*dump_func)(void *cl, uint32_t offset, uint32_t hw_offset); } packet_info[] = { - PACKET(VC4_PACKET_HALT, 1), - PACKET(VC4_PACKET_NOP, 1), - - PACKET(VC4_PACKET_FLUSH, 1), - PACKET(VC4_PACKET_FLUSH_ALL, 1), - PACKET(VC4_PACKET_START_TILE_BINNING, 1), - PACKET(VC4_PACKET_INCREMENT_SEMAPHORE, 1), - PACKET(VC4_PACKET_WAIT_ON_SEMAPHORE, 1), - - PACKET(VC4_PACKET_BRANCH, 5), - PACKET_DUMP(VC4_PACKET_BRANCH_TO_SUB_LIST, 5), - - PACKET(VC4_PACKET_STORE_MS_TILE_BUFFER, 1), - 
PACKET(VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF, 1), - PACKET(VC4_PACKET_STORE_FULL_RES_TILE_BUFFER, 5), - PACKET(VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER, 5), - PACKET_DUMP(VC4_PACKET_STORE_TILE_BUFFER_GENERAL, 7), - PACKET(VC4_PACKET_LOAD_TILE_BUFFER_GENERAL, 7), - - PACKET(VC4_PACKET_GL_INDEXED_PRIMITIVE, 14), - PACKET(VC4_PACKET_GL_ARRAY_PRIMITIVE, 10), - - PACKET(VC4_PACKET_COMPRESSED_PRIMITIVE, 48), - PACKET(VC4_PACKET_CLIPPED_COMPRESSED_PRIMITIVE, 49), - - PACKET(VC4_PACKET_PRIMITIVE_LIST_FORMAT, 2), - - PACKET(VC4_PACKET_GL_SHADER_STATE, 5), - PACKET(VC4_PACKET_NV_SHADER_STATE, 5), - PACKET(VC4_PACKET_VG_SHADER_STATE, 5), - - PACKET(VC4_PACKET_CONFIGURATION_BITS, 4), - PACKET_DUMP(VC4_PACKET_FLAT_SHADE_FLAGS, 5), - PACKET_DUMP(VC4_PACKET_POINT_SIZE, 5), - PACKET_DUMP(VC4_PACKET_LINE_WIDTH, 5), - PACKET(VC4_PACKET_RHT_X_BOUNDARY, 3), - PACKET(VC4_PACKET_DEPTH_OFFSET, 5), - PACKET(VC4_PACKET_CLIP_WINDOW, 9), - PACKET_DUMP(VC4_PACKET_VIEWPORT_OFFSET, 5), - PACKET(VC4_PACKET_Z_CLIPPING, 9), - PACKET_DUMP(VC4_PACKET_CLIPPER_XY_SCALING, 9), - PACKET_DUMP(VC4_PACKET_CLIPPER_Z_SCALING, 9), - - PACKET_DUMP(VC4_PACKET_TILE_BINNING_MODE_CONFIG, 16), - PACKET_DUMP(VC4_PACKET_TILE_RENDERING_MODE_CONFIG, 11), - PACKET(VC4_PACKET_CLEAR_COLORS, 14), - PACKET_DUMP(VC4_PACKET_TILE_COORDINATES, 3), - - PACKET_DUMP(VC4_PACKET_GEM_HANDLES, 9), + PACKET(VC4_PACKET_HALT), + PACKET(VC4_PACKET_NOP), + + PACKET(VC4_PACKET_FLUSH), + PACKET(VC4_PACKET_FLUSH_ALL), + PACKET(VC4_PACKET_START_TILE_BINNING), + PACKET(VC4_PACKET_INCREMENT_SEMAPHORE), + PACKET(VC4_PACKET_WAIT_ON_SEMAPHORE), + + PACKET(VC4_PACKET_BRANCH), + PACKET_DUMP(VC4_PACKET_BRANCH_TO_SUB_LIST), + + PACKET(VC4_PACKET_STORE_MS_TILE_BUFFER), + PACKET(VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF), + PACKET_DUMP(VC4_PACKET_STORE_FULL_RES_TILE_BUFFER), + PACKET_DUMP(VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER), + PACKET_DUMP(VC4_PACKET_STORE_TILE_BUFFER_GENERAL), + PACKET_DUMP(VC4_PACKET_LOAD_TILE_BUFFER_GENERAL), + + 
PACKET(VC4_PACKET_GL_INDEXED_PRIMITIVE), + PACKET(VC4_PACKET_GL_ARRAY_PRIMITIVE), + + PACKET(VC4_PACKET_COMPRESSED_PRIMITIVE), + PACKET(VC4_PACKET_CLIPPED_COMPRESSED_PRIMITIVE), + + PACKET(VC4_PACKET_PRIMITIVE_LIST_FORMAT), + + PACKET(VC4_PACKET_GL_SHADER_STATE), + PACKET(VC4_PACKET_NV_SHADER_STATE), + PACKET(VC4_PACKET_VG_SHADER_STATE), + + PACKET(VC4_PACKET_CONFIGURATION_BITS), + PACKET_DUMP(VC4_PACKET_FLAT_SHADE_FLAGS), + PACKET_DUMP(VC4_PACKET_POINT_SIZE), + PACKET_DUMP(VC4_PACKET_LINE_WIDTH), + PACKET(VC4_PACKET_RHT_X_BOUNDARY), + PACKET(VC4_PACKET_DEPTH_OFFSET), + PACKET(VC4_PACKET_CLIP_WINDOW), + PACKET_DUMP(VC4_PACKET_VIEWPORT_OFFSET), + PACKET(VC4_PACKET_Z_CLIPPING), + PACKET_DUMP(VC4_PACKET_CLIPPER_XY_SCALING), + PACKET_DUMP(VC4_PACKET_CLIPPER_Z_SCALING), + + PACKET_DUMP(VC4_PACKET_TILE_BINNING_MODE_CONFIG), + PACKET_DUMP(VC4_PACKET_TILE_RENDERING_MODE_CONFIG), + PACKET(VC4_PACKET_CLEAR_COLORS), + PACKET_DUMP(VC4_PACKET_TILE_COORDINATES), + + PACKET_DUMP(VC4_PACKET_GEM_HANDLES), }; void @@ -359,7 +397,7 @@ vc4_dump_cl(void *cl, uint32_t size, bool is_render) while (offset < size) { uint8_t header = cmds[offset]; - if (header > ARRAY_SIZE(packet_info) || + if (header >= ARRAY_SIZE(packet_info) || !packet_info[header].name) { fprintf(stderr, "0x%08x 0x%08x: Unknown packet 0x%02x (%d)!\n", offset, hw_offset, header, header); diff --git a/src/gallium/drivers/vc4/vc4_context.c b/src/gallium/drivers/vc4/vc4_context.c index 630f8e68896..fff63158c9d 100644 --- a/src/gallium/drivers/vc4/vc4_context.c +++ b/src/gallium/drivers/vc4/vc4_context.c @@ -61,9 +61,11 @@ vc4_flush(struct pipe_context *pctx) * FLUSH completes. */ cl_ensure_space(&vc4->bcl, 8); - cl_u8(&vc4->bcl, VC4_PACKET_INCREMENT_SEMAPHORE); + struct vc4_cl_out *bcl = cl_start(&vc4->bcl); + cl_u8(&bcl, VC4_PACKET_INCREMENT_SEMAPHORE); /* The FLUSH caps all of our bin lists with a VC4_PACKET_RETURN. 
*/ - cl_u8(&vc4->bcl, VC4_PACKET_FLUSH); + cl_u8(&bcl, VC4_PACKET_FLUSH); + cl_end(&vc4->bcl, bcl); if (cbuf && (vc4->resolve & PIPE_CLEAR_COLOR0)) { pipe_surface_reference(&vc4->color_write, cbuf); @@ -103,8 +105,10 @@ vc4_pipe_flush(struct pipe_context *pctx, struct pipe_fence_handle **fence, vc4_flush(pctx); if (fence) { + struct pipe_screen *screen = pctx->screen; struct vc4_fence *f = vc4_fence_create(vc4->screen, vc4->last_emit_seqno); + screen->fence_reference(screen, fence, NULL); *fence = (struct pipe_fence_handle *)f; } } @@ -126,8 +130,7 @@ vc4_cl_references_bo(struct pipe_context *pctx, struct vc4_bo *bo) * they match. */ struct vc4_bo **referenced_bos = vc4->bo_pointers.base; - for (int i = 0; i < (vc4->bo_handles.next - - vc4->bo_handles.base) / 4; i++) { + for (int i = 0; i < cl_offset(&vc4->bo_handles) / 4; i++) { if (referenced_bos[i] == bo) { return true; } diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h index d5d6be16f6e..654c46f3c0d 100644 --- a/src/gallium/drivers/vc4/vc4_context.h +++ b/src/gallium/drivers/vc4/vc4_context.h @@ -67,7 +67,20 @@ #define VC4_DIRTY_CLIP (1 << 20) #define VC4_DIRTY_UNCOMPILED_VS (1 << 21) #define VC4_DIRTY_UNCOMPILED_FS (1 << 22) -#define VC4_DIRTY_COMPILED_FS (1 << 24) +#define VC4_DIRTY_COMPILED_CS (1 << 23) +#define VC4_DIRTY_COMPILED_VS (1 << 24) +#define VC4_DIRTY_COMPILED_FS (1 << 25) + +struct vc4_sampler_view { + struct pipe_sampler_view base; + uint32_t texture_p0; + uint32_t texture_p1; +}; + +struct vc4_sampler_state { + struct pipe_sampler_state base; + uint32_t texture_p1; +}; struct vc4_texture_stateobj { struct pipe_sampler_view *textures[PIPE_MAX_SAMPLERS]; @@ -121,6 +134,12 @@ struct vc4_compiled_shader { struct vc4_ubo_range *ubo_ranges; uint32_t num_ubo_ranges; uint32_t ubo_size; + /** + * VC4_DIRTY_* flags that, when set in vc4->dirty, mean that the + * uniforms have to be rewritten (and therefore the shader state + * reemitted). 
+ */ + uint32_t uniform_dirty_bits; /** bitmask of which inputs are color inputs, for flat shade handling. */ uint32_t color_inputs; @@ -238,6 +257,11 @@ struct vc4_context { */ bool draw_call_queued; + /** Maximum index buffer valid for the current shader_rec. */ + uint32_t max_index; + /** Last index bias baked into the current shader_rec. */ + uint32_t last_index_bias; + struct primconvert_context *primconvert; struct hash_table *fs_cache, *vs_cache; @@ -246,6 +270,7 @@ struct vc4_context { struct ra_regs *regs; unsigned int reg_class_any; + unsigned int reg_class_r4_or_a; unsigned int reg_class_a; uint8_t prim_mode; @@ -326,6 +351,18 @@ vc4_context(struct pipe_context *pcontext) return (struct vc4_context *)pcontext; } +static inline struct vc4_sampler_view * +vc4_sampler_view(struct pipe_sampler_view *psview) +{ + return (struct vc4_sampler_view *)psview; +} + +static inline struct vc4_sampler_state * +vc4_sampler_state(struct pipe_sampler_state *psampler) +{ + return (struct vc4_sampler_state *)psampler; +} + struct pipe_context *vc4_context_create(struct pipe_screen *pscreen, void *priv); void vc4_draw_init(struct pipe_context *pctx); @@ -337,6 +374,7 @@ void vc4_simulator_init(struct vc4_screen *screen); int vc4_simulator_flush(struct vc4_context *vc4, struct drm_vc4_submit_cl *args); +void vc4_set_shader_uniform_dirty_flags(struct vc4_compiled_shader *shader); void vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader, struct vc4_constbuf_stateobj *cb, diff --git a/src/gallium/drivers/vc4/vc4_draw.c b/src/gallium/drivers/vc4/vc4_draw.c index 5e6d70d6f33..a4e5e092b1a 100644 --- a/src/gallium/drivers/vc4/vc4_draw.c +++ b/src/gallium/drivers/vc4/vc4_draw.c @@ -71,37 +71,40 @@ vc4_start_draw(struct vc4_context *vc4) uint32_t height = vc4->framebuffer.height; uint32_t tilew = align(width, 64) / 64; uint32_t tileh = align(height, 64) / 64; + struct vc4_cl_out *bcl = cl_start(&vc4->bcl); // Tile state data is 48 bytes per tile, I think it 
can be thrown away // as soon as binning is finished. - cl_u8(&vc4->bcl, VC4_PACKET_TILE_BINNING_MODE_CONFIG); - cl_u32(&vc4->bcl, 0); /* tile alloc addr, filled by kernel */ - cl_u32(&vc4->bcl, 0); /* tile alloc size, filled by kernel */ - cl_u32(&vc4->bcl, 0); /* tile state addr, filled by kernel */ - cl_u8(&vc4->bcl, tilew); - cl_u8(&vc4->bcl, tileh); - cl_u8(&vc4->bcl, 0); /* flags, filled by kernel. */ + cl_u8(&bcl, VC4_PACKET_TILE_BINNING_MODE_CONFIG); + cl_u32(&bcl, 0); /* tile alloc addr, filled by kernel */ + cl_u32(&bcl, 0); /* tile alloc size, filled by kernel */ + cl_u32(&bcl, 0); /* tile state addr, filled by kernel */ + cl_u8(&bcl, tilew); + cl_u8(&bcl, tileh); + cl_u8(&bcl, 0); /* flags, filled by kernel. */ /* START_TILE_BINNING resets the statechange counters in the hardware, * which are what is used when a primitive is binned to a tile to * figure out what new state packets need to be written to that tile's * command list. */ - cl_u8(&vc4->bcl, VC4_PACKET_START_TILE_BINNING); + cl_u8(&bcl, VC4_PACKET_START_TILE_BINNING); /* Reset the current compressed primitives format. This gets modified * by VC4_PACKET_GL_INDEXED_PRIMITIVE and * VC4_PACKET_GL_ARRAY_PRIMITIVE, so it needs to be reset at the start * of every tile. 
*/ - cl_u8(&vc4->bcl, VC4_PACKET_PRIMITIVE_LIST_FORMAT); - cl_u8(&vc4->bcl, (VC4_PRIMITIVE_LIST_FORMAT_16_INDEX | - VC4_PRIMITIVE_LIST_FORMAT_TYPE_TRIANGLES)); + cl_u8(&bcl, VC4_PACKET_PRIMITIVE_LIST_FORMAT); + cl_u8(&bcl, (VC4_PRIMITIVE_LIST_FORMAT_16_INDEX | + VC4_PRIMITIVE_LIST_FORMAT_TYPE_TRIANGLES)); vc4->needs_flush = true; vc4->draw_call_queued = true; vc4->draw_width = width; vc4->draw_height = height; + + cl_end(&vc4->bcl, bcl); } static void @@ -119,96 +122,67 @@ vc4_update_shadow_textures(struct pipe_context *pctx, } static void -vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) +vc4_emit_gl_shader_state(struct vc4_context *vc4, const struct pipe_draw_info *info) { - struct vc4_context *vc4 = vc4_context(pctx); - - if (info->mode >= PIPE_PRIM_QUADS) { - util_primconvert_save_index_buffer(vc4->primconvert, &vc4->indexbuf); - util_primconvert_save_rasterizer_state(vc4->primconvert, &vc4->rasterizer->base); - util_primconvert_draw_vbo(vc4->primconvert, info); - perf_debug("Fallback conversion for %d %s vertices\n", - info->count, u_prim_name(info->mode)); - return; - } - - /* Before setting up the draw, do any fixup blits necessary. 
*/ - vc4_update_shadow_textures(pctx, &vc4->verttex); - vc4_update_shadow_textures(pctx, &vc4->fragtex); - - vc4_get_draw_cl_space(vc4); - + /* VC4_DIRTY_VTXSTATE */ struct vc4_vertex_stateobj *vtx = vc4->vtx; + /* VC4_DIRTY_VTXBUF */ struct vc4_vertexbuf_stateobj *vertexbuf = &vc4->vertexbuf; - if (vc4->prim_mode != info->mode) { - vc4->prim_mode = info->mode; - vc4->dirty |= VC4_DIRTY_PRIM_MODE; - } - - vc4_start_draw(vc4); - vc4_update_compiled_shaders(vc4, info->mode); - - vc4_emit_state(pctx); - vc4->dirty = 0; - - vc4_write_uniforms(vc4, vc4->prog.fs, - &vc4->constbuf[PIPE_SHADER_FRAGMENT], - &vc4->fragtex); - vc4_write_uniforms(vc4, vc4->prog.vs, - &vc4->constbuf[PIPE_SHADER_VERTEX], - &vc4->verttex); - vc4_write_uniforms(vc4, vc4->prog.cs, - &vc4->constbuf[PIPE_SHADER_VERTEX], - &vc4->verttex); - /* The simulator throws a fit if VS or CS don't read an attribute, so * we emit a dummy read. */ uint32_t num_elements_emit = MAX2(vtx->num_elements, 1); /* Emit the shader record. */ - cl_start_shader_reloc(&vc4->shader_rec, 3 + num_elements_emit); - cl_u16(&vc4->shader_rec, + struct vc4_cl_out *shader_rec = + cl_start_shader_reloc(&vc4->shader_rec, 3 + num_elements_emit); + /* VC4_DIRTY_PRIM_MODE | VC4_DIRTY_RASTERIZER */ + cl_u16(&shader_rec, VC4_SHADER_FLAG_ENABLE_CLIPPING | + VC4_SHADER_FLAG_FS_SINGLE_THREAD | ((info->mode == PIPE_PRIM_POINTS && vc4->rasterizer->base.point_size_per_vertex) ? 
VC4_SHADER_FLAG_VS_POINT_SIZE : 0)); - cl_u8(&vc4->shader_rec, 0); /* fs num uniforms (unused) */ - cl_u8(&vc4->shader_rec, vc4->prog.fs->num_inputs); - cl_reloc(vc4, &vc4->shader_rec, vc4->prog.fs->bo, 0); - cl_u32(&vc4->shader_rec, 0); /* UBO offset written by kernel */ - - cl_u16(&vc4->shader_rec, 0); /* vs num uniforms */ - cl_u8(&vc4->shader_rec, vc4->prog.vs->vattrs_live); - cl_u8(&vc4->shader_rec, vc4->prog.vs->vattr_offsets[8]); - cl_reloc(vc4, &vc4->shader_rec, vc4->prog.vs->bo, 0); - cl_u32(&vc4->shader_rec, 0); /* UBO offset written by kernel */ - - cl_u16(&vc4->shader_rec, 0); /* cs num uniforms */ - cl_u8(&vc4->shader_rec, vc4->prog.cs->vattrs_live); - cl_u8(&vc4->shader_rec, vc4->prog.cs->vattr_offsets[8]); - cl_reloc(vc4, &vc4->shader_rec, vc4->prog.cs->bo, 0); - cl_u32(&vc4->shader_rec, 0); /* UBO offset written by kernel */ + + /* VC4_DIRTY_COMPILED_FS */ + cl_u8(&shader_rec, 0); /* fs num uniforms (unused) */ + cl_u8(&shader_rec, vc4->prog.fs->num_inputs); + cl_reloc(vc4, &vc4->shader_rec, &shader_rec, vc4->prog.fs->bo, 0); + cl_u32(&shader_rec, 0); /* UBO offset written by kernel */ + + /* VC4_DIRTY_COMPILED_VS */ + cl_u16(&shader_rec, 0); /* vs num uniforms */ + cl_u8(&shader_rec, vc4->prog.vs->vattrs_live); + cl_u8(&shader_rec, vc4->prog.vs->vattr_offsets[8]); + cl_reloc(vc4, &vc4->shader_rec, &shader_rec, vc4->prog.vs->bo, 0); + cl_u32(&shader_rec, 0); /* UBO offset written by kernel */ + + /* VC4_DIRTY_COMPILED_CS */ + cl_u16(&shader_rec, 0); /* cs num uniforms */ + cl_u8(&shader_rec, vc4->prog.cs->vattrs_live); + cl_u8(&shader_rec, vc4->prog.cs->vattr_offsets[8]); + cl_reloc(vc4, &vc4->shader_rec, &shader_rec, vc4->prog.cs->bo, 0); + cl_u32(&shader_rec, 0); /* UBO offset written by kernel */ uint32_t max_index = 0xffff; - uint32_t vpm_offset = 0; for (int i = 0; i < vtx->num_elements; i++) { struct pipe_vertex_element *elem = &vtx->pipe[i]; struct pipe_vertex_buffer *vb = &vertexbuf->vb[elem->vertex_buffer_index]; struct vc4_resource *rsc = 
vc4_resource(vb->buffer); - uint32_t offset = vb->buffer_offset + elem->src_offset; + /* not vc4->dirty tracked: vc4->last_index_bias */ + uint32_t offset = (vb->buffer_offset + + elem->src_offset + + vb->stride * info->index_bias); uint32_t vb_size = rsc->bo->size - offset; uint32_t elem_size = util_format_get_blocksize(elem->src_format); - cl_reloc(vc4, &vc4->shader_rec, rsc->bo, offset); - cl_u8(&vc4->shader_rec, elem_size - 1); - cl_u8(&vc4->shader_rec, vb->stride); - cl_u8(&vc4->shader_rec, vc4->prog.vs->vattr_offsets[i]); - cl_u8(&vc4->shader_rec, vc4->prog.cs->vattr_offsets[i]); - - vpm_offset += align(elem_size, 4); + cl_reloc(vc4, &vc4->shader_rec, &shader_rec, rsc->bo, offset); + cl_u8(&shader_rec, elem_size - 1); + cl_u8(&shader_rec, vb->stride); + cl_u8(&shader_rec, vc4->prog.vs->vattr_offsets[i]); + cl_u8(&shader_rec, vc4->prog.cs->vattr_offsets[i]); if (vb->stride > 0) { max_index = MIN2(max_index, @@ -219,25 +193,89 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) if (vtx->num_elements == 0) { assert(num_elements_emit == 1); struct vc4_bo *bo = vc4_bo_alloc(vc4->screen, 4096, "scratch VBO"); - cl_reloc(vc4, &vc4->shader_rec, bo, 0); - cl_u8(&vc4->shader_rec, 16 - 1); /* element size */ - cl_u8(&vc4->shader_rec, 0); /* stride */ - cl_u8(&vc4->shader_rec, 0); /* VS VPM offset */ - cl_u8(&vc4->shader_rec, 0); /* CS VPM offset */ + cl_reloc(vc4, &vc4->shader_rec, &shader_rec, bo, 0); + cl_u8(&shader_rec, 16 - 1); /* element size */ + cl_u8(&shader_rec, 0); /* stride */ + cl_u8(&shader_rec, 0); /* VS VPM offset */ + cl_u8(&shader_rec, 0); /* CS VPM offset */ vc4_bo_unreference(&bo); } + cl_end(&vc4->shader_rec, shader_rec); + struct vc4_cl_out *bcl = cl_start(&vc4->bcl); /* the actual draw call. */ - cl_u8(&vc4->bcl, VC4_PACKET_GL_SHADER_STATE); + cl_u8(&bcl, VC4_PACKET_GL_SHADER_STATE); assert(vtx->num_elements <= 8); /* Note that number of attributes == 0 in the packet means 8 * attributes. 
This field also contains the offset into shader_rec. */ - cl_u32(&vc4->bcl, num_elements_emit & 0x7); + cl_u32(&bcl, num_elements_emit & 0x7); + cl_end(&vc4->bcl, bcl); + + vc4_write_uniforms(vc4, vc4->prog.fs, + &vc4->constbuf[PIPE_SHADER_FRAGMENT], + &vc4->fragtex); + vc4_write_uniforms(vc4, vc4->prog.vs, + &vc4->constbuf[PIPE_SHADER_VERTEX], + &vc4->verttex); + vc4_write_uniforms(vc4, vc4->prog.cs, + &vc4->constbuf[PIPE_SHADER_VERTEX], + &vc4->verttex); + + vc4->last_index_bias = info->index_bias; + vc4->max_index = max_index; +} + +static void +vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) +{ + struct vc4_context *vc4 = vc4_context(pctx); + + if (info->mode >= PIPE_PRIM_QUADS) { + util_primconvert_save_index_buffer(vc4->primconvert, &vc4->indexbuf); + util_primconvert_save_rasterizer_state(vc4->primconvert, &vc4->rasterizer->base); + util_primconvert_draw_vbo(vc4->primconvert, info); + perf_debug("Fallback conversion for %d %s vertices\n", + info->count, u_prim_name(info->mode)); + return; + } + + /* Before setting up the draw, do any fixup blits necessary. */ + vc4_update_shadow_textures(pctx, &vc4->verttex); + vc4_update_shadow_textures(pctx, &vc4->fragtex); + + vc4_get_draw_cl_space(vc4); + + if (vc4->prim_mode != info->mode) { + vc4->prim_mode = info->mode; + vc4->dirty |= VC4_DIRTY_PRIM_MODE; + } + + vc4_start_draw(vc4); + vc4_update_compiled_shaders(vc4, info->mode); + + vc4_emit_state(pctx); + + if ((vc4->dirty & (VC4_DIRTY_VTXBUF | + VC4_DIRTY_VTXSTATE | + VC4_DIRTY_PRIM_MODE | + VC4_DIRTY_RASTERIZER | + VC4_DIRTY_COMPILED_CS | + VC4_DIRTY_COMPILED_VS | + VC4_DIRTY_COMPILED_FS | + vc4->prog.cs->uniform_dirty_bits | + vc4->prog.vs->uniform_dirty_bits | + vc4->prog.fs->uniform_dirty_bits)) || + vc4->last_index_bias != info->index_bias) { + vc4_emit_gl_shader_state(vc4, info); + } + + vc4->dirty = 0; /* Note that the primitive type fields match with OpenGL/gallium * definitions, up to but not including QUADS. 
*/ + struct vc4_cl_out *bcl = cl_start(&vc4->bcl); if (info->indexed) { uint32_t offset = vc4->indexbuf.offset; uint32_t index_size = vc4->indexbuf.index_size; @@ -251,25 +289,26 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) } struct vc4_resource *rsc = vc4_resource(prsc); - cl_start_reloc(&vc4->bcl, 1); - cl_u8(&vc4->bcl, VC4_PACKET_GL_INDEXED_PRIMITIVE); - cl_u8(&vc4->bcl, + cl_start_reloc(&vc4->bcl, &bcl, 1); + cl_u8(&bcl, VC4_PACKET_GL_INDEXED_PRIMITIVE); + cl_u8(&bcl, info->mode | (index_size == 2 ? VC4_INDEX_BUFFER_U16: VC4_INDEX_BUFFER_U8)); - cl_u32(&vc4->bcl, info->count); - cl_reloc(vc4, &vc4->bcl, rsc->bo, offset); - cl_u32(&vc4->bcl, max_index); + cl_u32(&bcl, info->count); + cl_reloc(vc4, &vc4->bcl, &bcl, rsc->bo, offset); + cl_u32(&bcl, vc4->max_index); if (vc4->indexbuf.index_size == 4) pipe_resource_reference(&prsc, NULL); } else { - cl_u8(&vc4->bcl, VC4_PACKET_GL_ARRAY_PRIMITIVE); - cl_u8(&vc4->bcl, info->mode); - cl_u32(&vc4->bcl, info->count); - cl_u32(&vc4->bcl, info->start); + cl_u8(&bcl, VC4_PACKET_GL_ARRAY_PRIMITIVE); + cl_u8(&bcl, info->mode); + cl_u32(&bcl, info->count); + cl_u32(&bcl, info->start); } + cl_end(&vc4->bcl, bcl); if (vc4->zsa && vc4->zsa->base.depth.enabled) { vc4->resolve |= PIPE_CLEAR_DEPTH; diff --git a/src/gallium/drivers/vc4/vc4_drm.h b/src/gallium/drivers/vc4/vc4_drm.h index 5f1ee4fa125..863ef8da8fb 100644 --- a/src/gallium/drivers/vc4/vc4_drm.h +++ b/src/gallium/drivers/vc4/vc4_drm.h @@ -31,12 +31,14 @@ #define DRM_VC4_WAIT_BO 0x02 #define DRM_VC4_CREATE_BO 0x03 #define DRM_VC4_MMAP_BO 0x04 +#define DRM_VC4_CREATE_SHADER_BO 0x05 #define DRM_IOCTL_VC4_SUBMIT_CL DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_SUBMIT_CL, struct drm_vc4_submit_cl) #define DRM_IOCTL_VC4_WAIT_SEQNO DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_WAIT_SEQNO, struct drm_vc4_wait_seqno) #define DRM_IOCTL_VC4_WAIT_BO DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_WAIT_BO, struct drm_vc4_wait_bo) #define DRM_IOCTL_VC4_CREATE_BO DRM_IOWR( 
DRM_COMMAND_BASE + DRM_VC4_CREATE_BO, struct drm_vc4_create_bo) #define DRM_IOCTL_VC4_MMAP_BO DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_MMAP_BO, struct drm_vc4_mmap_bo) +#define DRM_IOCTL_VC4_CREATE_SHADER_BO DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_CREATE_SHADER_BO, struct drm_vc4_create_shader_bo) struct drm_vc4_submit_rcl_surface { uint32_t hindex; /* Handle index, or ~0 if not present. */ @@ -183,6 +185,29 @@ struct drm_vc4_create_bo { }; /** + * struct drm_vc4_create_shader_bo - ioctl argument for creating VC4 + * shader BOs. + * + * Since allowing a shader to be overwritten while it's also being + * executed from would allow privlege escalation, shaders must be + * created using this ioctl, and they can't be mmapped later. + */ +struct drm_vc4_create_shader_bo { + /* Size of the data argument. */ + uint32_t size; + /* Flags, currently must be 0. */ + uint32_t flags; + + /* Pointer to the data. */ + uint64_t data; + + /** Returned GEM handle for the BO. */ + uint32_t handle; + /* Pad, must be 0. */ + uint32_t pad; +}; + +/** * struct drm_vc4_mmap_bo - ioctl argument for mapping VC4 BOs. * * This doesn't actually perform an mmap. 
Instead, it returns the diff --git a/src/gallium/drivers/vc4/vc4_emit.c b/src/gallium/drivers/vc4/vc4_emit.c index d2b54fccf91..ba064ff889b 100644 --- a/src/gallium/drivers/vc4/vc4_emit.c +++ b/src/gallium/drivers/vc4/vc4_emit.c @@ -28,23 +28,24 @@ vc4_emit_state(struct pipe_context *pctx) { struct vc4_context *vc4 = vc4_context(pctx); + struct vc4_cl_out *bcl = cl_start(&vc4->bcl); if (vc4->dirty & (VC4_DIRTY_SCISSOR | VC4_DIRTY_VIEWPORT)) { float *vpscale = vc4->viewport.scale; float *vptranslate = vc4->viewport.translate; - float vp_minx = -fabs(vpscale[0]) + vptranslate[0]; - float vp_maxx = fabs(vpscale[0]) + vptranslate[0]; - float vp_miny = -fabs(vpscale[1]) + vptranslate[1]; - float vp_maxy = fabs(vpscale[1]) + vptranslate[1]; + float vp_minx = -fabsf(vpscale[0]) + vptranslate[0]; + float vp_maxx = fabsf(vpscale[0]) + vptranslate[0]; + float vp_miny = -fabsf(vpscale[1]) + vptranslate[1]; + float vp_maxy = fabsf(vpscale[1]) + vptranslate[1]; uint32_t minx = MAX2(vc4->scissor.minx, vp_minx); uint32_t miny = MAX2(vc4->scissor.miny, vp_miny); uint32_t maxx = MIN2(vc4->scissor.maxx, vp_maxx); uint32_t maxy = MIN2(vc4->scissor.maxy, vp_maxy); - cl_u8(&vc4->bcl, VC4_PACKET_CLIP_WINDOW); - cl_u16(&vc4->bcl, minx); - cl_u16(&vc4->bcl, miny); - cl_u16(&vc4->bcl, maxx - minx); - cl_u16(&vc4->bcl, maxy - miny); + cl_u8(&bcl, VC4_PACKET_CLIP_WINDOW); + cl_u16(&bcl, minx); + cl_u16(&bcl, miny); + cl_u16(&bcl, maxx - minx); + cl_u16(&bcl, maxy - miny); vc4->draw_min_x = MIN2(vc4->draw_min_x, minx); vc4->draw_min_y = MIN2(vc4->draw_min_y, miny); @@ -53,47 +54,49 @@ vc4_emit_state(struct pipe_context *pctx) } if (vc4->dirty & (VC4_DIRTY_RASTERIZER | VC4_DIRTY_ZSA)) { - cl_u8(&vc4->bcl, VC4_PACKET_CONFIGURATION_BITS); - cl_u8(&vc4->bcl, + cl_u8(&bcl, VC4_PACKET_CONFIGURATION_BITS); + cl_u8(&bcl, vc4->rasterizer->config_bits[0] | vc4->zsa->config_bits[0]); - cl_u8(&vc4->bcl, + cl_u8(&bcl, vc4->rasterizer->config_bits[1] | vc4->zsa->config_bits[1]); - cl_u8(&vc4->bcl, + 
cl_u8(&bcl, vc4->rasterizer->config_bits[2] | vc4->zsa->config_bits[2]); } if (vc4->dirty & VC4_DIRTY_RASTERIZER) { - cl_u8(&vc4->bcl, VC4_PACKET_DEPTH_OFFSET); - cl_u16(&vc4->bcl, vc4->rasterizer->offset_factor); - cl_u16(&vc4->bcl, vc4->rasterizer->offset_units); + cl_u8(&bcl, VC4_PACKET_DEPTH_OFFSET); + cl_u16(&bcl, vc4->rasterizer->offset_factor); + cl_u16(&bcl, vc4->rasterizer->offset_units); - cl_u8(&vc4->bcl, VC4_PACKET_POINT_SIZE); - cl_f(&vc4->bcl, vc4->rasterizer->point_size); + cl_u8(&bcl, VC4_PACKET_POINT_SIZE); + cl_f(&bcl, vc4->rasterizer->point_size); - cl_u8(&vc4->bcl, VC4_PACKET_LINE_WIDTH); - cl_f(&vc4->bcl, vc4->rasterizer->base.line_width); + cl_u8(&bcl, VC4_PACKET_LINE_WIDTH); + cl_f(&bcl, vc4->rasterizer->base.line_width); } if (vc4->dirty & VC4_DIRTY_VIEWPORT) { - cl_u8(&vc4->bcl, VC4_PACKET_CLIPPER_XY_SCALING); - cl_f(&vc4->bcl, vc4->viewport.scale[0] * 16.0f); - cl_f(&vc4->bcl, vc4->viewport.scale[1] * 16.0f); + cl_u8(&bcl, VC4_PACKET_CLIPPER_XY_SCALING); + cl_f(&bcl, vc4->viewport.scale[0] * 16.0f); + cl_f(&bcl, vc4->viewport.scale[1] * 16.0f); - cl_u8(&vc4->bcl, VC4_PACKET_CLIPPER_Z_SCALING); - cl_f(&vc4->bcl, vc4->viewport.translate[2]); - cl_f(&vc4->bcl, vc4->viewport.scale[2]); + cl_u8(&bcl, VC4_PACKET_CLIPPER_Z_SCALING); + cl_f(&bcl, vc4->viewport.translate[2]); + cl_f(&bcl, vc4->viewport.scale[2]); - cl_u8(&vc4->bcl, VC4_PACKET_VIEWPORT_OFFSET); - cl_u16(&vc4->bcl, 16 * vc4->viewport.translate[0]); - cl_u16(&vc4->bcl, 16 * vc4->viewport.translate[1]); + cl_u8(&bcl, VC4_PACKET_VIEWPORT_OFFSET); + cl_u16(&bcl, 16 * vc4->viewport.translate[0]); + cl_u16(&bcl, 16 * vc4->viewport.translate[1]); } if (vc4->dirty & VC4_DIRTY_FLAT_SHADE_FLAGS) { - cl_u8(&vc4->bcl, VC4_PACKET_FLAT_SHADE_FLAGS); - cl_u32(&vc4->bcl, vc4->rasterizer->base.flatshade ? + cl_u8(&bcl, VC4_PACKET_FLAT_SHADE_FLAGS); + cl_u32(&bcl, vc4->rasterizer->base.flatshade ? 
vc4->prog.fs->color_inputs : 0); } + + cl_end(&vc4->bcl, bcl); } diff --git a/src/gallium/drivers/vc4/vc4_fence.c b/src/gallium/drivers/vc4/vc4_fence.c index f2ee91de61a..b6fb2a8a460 100644 --- a/src/gallium/drivers/vc4/vc4_fence.c +++ b/src/gallium/drivers/vc4/vc4_fence.c @@ -60,16 +60,6 @@ vc4_fence_reference(struct pipe_screen *pscreen, } static boolean -vc4_fence_signalled(struct pipe_screen *pscreen, - struct pipe_fence_handle *pf) -{ - struct vc4_screen *screen = vc4_screen(pscreen); - struct vc4_fence *f = (struct vc4_fence *)pf; - - return vc4_wait_seqno(screen, f->seqno, 0); -} - -static boolean vc4_fence_finish(struct pipe_screen *pscreen, struct pipe_fence_handle *pf, uint64_t timeout_ns) @@ -77,7 +67,7 @@ vc4_fence_finish(struct pipe_screen *pscreen, struct vc4_screen *screen = vc4_screen(pscreen); struct vc4_fence *f = (struct vc4_fence *)pf; - return vc4_wait_seqno(screen, f->seqno, timeout_ns); + return vc4_wait_seqno(screen, f->seqno, timeout_ns, "fence wait"); } struct vc4_fence * @@ -98,6 +88,5 @@ void vc4_fence_init(struct vc4_screen *screen) { screen->base.fence_reference = vc4_fence_reference; - screen->base.fence_signalled = vc4_fence_signalled; screen->base.fence_finish = vc4_fence_finish; } diff --git a/src/gallium/drivers/vc4/vc4_formats.c b/src/gallium/drivers/vc4/vc4_formats.c index 004bac70c67..ffce61237de 100644 --- a/src/gallium/drivers/vc4/vc4_formats.c +++ b/src/gallium/drivers/vc4/vc4_formats.c @@ -108,7 +108,7 @@ static const struct vc4_format vc4_format_table[] = { static const struct vc4_format * get_format(enum pipe_format f) { - if (f > ARRAY_SIZE(vc4_format_table) || + if (f >= ARRAY_SIZE(vc4_format_table) || !vc4_format_table[f].present) return NULL; else diff --git a/src/gallium/drivers/vc4/vc4_job.c b/src/gallium/drivers/vc4/vc4_job.c index dcade15443a..7ebd9f160eb 100644 --- a/src/gallium/drivers/vc4/vc4_job.c +++ b/src/gallium/drivers/vc4/vc4_job.c @@ -44,8 +44,7 @@ void vc4_job_reset(struct vc4_context *vc4) { struct 
vc4_bo **referenced_bos = vc4->bo_pointers.base; - for (int i = 0; i < (vc4->bo_handles.next - - vc4->bo_handles.base) / 4; i++) { + for (int i = 0; i < cl_offset(&vc4->bo_handles) / 4; i++) { vc4_bo_unreference(&referenced_bos[i]); } vc4_reset_cl(&vc4->bcl); @@ -145,7 +144,7 @@ vc4_job_submit(struct vc4_context *vc4) { if (vc4_debug & VC4_DEBUG_CL) { fprintf(stderr, "BCL:\n"); - vc4_dump_cl(vc4->bcl.base, vc4->bcl.next - vc4->bcl.base, false); + vc4_dump_cl(vc4->bcl.base, cl_offset(&vc4->bcl), false); } struct drm_vc4_submit_cl submit; @@ -164,15 +163,14 @@ vc4_job_submit(struct vc4_context *vc4) vc4->zs_write, true, true); submit.bo_handles = (uintptr_t)vc4->bo_handles.base; - submit.bo_handle_count = (vc4->bo_handles.next - - vc4->bo_handles.base) / 4; + submit.bo_handle_count = cl_offset(&vc4->bo_handles) / 4; submit.bin_cl = (uintptr_t)vc4->bcl.base; - submit.bin_cl_size = vc4->bcl.next - vc4->bcl.base; + submit.bin_cl_size = cl_offset(&vc4->bcl); submit.shader_rec = (uintptr_t)vc4->shader_rec.base; - submit.shader_rec_size = vc4->shader_rec.next - vc4->shader_rec.base; + submit.shader_rec_size = cl_offset(&vc4->shader_rec); submit.shader_rec_count = vc4->shader_rec_count; submit.uniforms = (uintptr_t)vc4->uniforms.base; - submit.uniforms_size = vc4->uniforms.next - vc4->uniforms.base; + submit.uniforms_size = cl_offset(&vc4->uniforms); assert(vc4->draw_min_x != ~0 && vc4->draw_min_y != ~0); submit.min_x_tile = vc4->draw_min_x / 64; @@ -207,7 +205,7 @@ vc4_job_submit(struct vc4_context *vc4) if (vc4_debug & VC4_DEBUG_ALWAYS_SYNC) { if (!vc4_wait_seqno(vc4->screen, vc4->last_emit_seqno, - PIPE_TIMEOUT_INFINITE)) { + PIPE_TIMEOUT_INFINITE, "sync")) { fprintf(stderr, "Wait failed.\n"); abort(); } diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c new file mode 100644 index 00000000000..a372a6c0cdc --- /dev/null +++ b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c @@ -0,0 +1,431 @@ +/* + * Copyright © 2015 
Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** + * Implements most of the fixed function fragment pipeline in shader code. + * + * VC4 doesn't have any hardware support for blending, alpha test, logic ops, + * or color mask. Instead, you read the current contents of the destination + * from the tile buffer after having waited for the scoreboard (which is + * handled by vc4_qpu_emit.c), then do math using your output color and that + * destination value, and update the output color appropriately. + */ + +/** + * Lowers fixed-function blending to a load of the destination color and a + * series of ALU operations before the store of the output. + */ +#include "util/u_format.h" +#include "vc4_qir.h" +#include "glsl/nir/nir_builder.h" +#include "vc4_context.h" + +/** Emits a load of the previous fragment color from the tile buffer. 
*/ +static nir_ssa_def * +vc4_nir_get_dst_color(nir_builder *b) +{ + nir_intrinsic_instr *load = + nir_intrinsic_instr_create(b->shader, + nir_intrinsic_load_input); + load->num_components = 1; + load->const_index[0] = VC4_NIR_TLB_COLOR_READ_INPUT; + nir_ssa_dest_init(&load->instr, &load->dest, 1, NULL); + nir_builder_instr_insert(b, &load->instr); + return &load->dest.ssa; +} + +static nir_ssa_def * +vc4_nir_srgb_decode(nir_builder *b, nir_ssa_def *srgb) +{ + nir_ssa_def *is_low = nir_flt(b, srgb, nir_imm_float(b, 0.04045)); + nir_ssa_def *low = nir_fmul(b, srgb, nir_imm_float(b, 1.0 / 12.92)); + nir_ssa_def *high = nir_fpow(b, + nir_fmul(b, + nir_fadd(b, srgb, + nir_imm_float(b, 0.055)), + nir_imm_float(b, 1.0 / 1.055)), + nir_imm_float(b, 2.4)); + + return nir_bcsel(b, is_low, low, high); +} + +static nir_ssa_def * +vc4_nir_srgb_encode(nir_builder *b, nir_ssa_def *linear) +{ + nir_ssa_def *is_low = nir_flt(b, linear, nir_imm_float(b, 0.0031308)); + nir_ssa_def *low = nir_fmul(b, linear, nir_imm_float(b, 12.92)); + nir_ssa_def *high = nir_fsub(b, + nir_fmul(b, + nir_imm_float(b, 1.055), + nir_fpow(b, + linear, + nir_imm_float(b, 0.41666))), + nir_imm_float(b, 0.055)); + + return nir_bcsel(b, is_low, low, high); +} + +static nir_ssa_def * +vc4_blend_channel(nir_builder *b, + nir_ssa_def **src, + nir_ssa_def **dst, + unsigned factor, + int channel) +{ + switch(factor) { + case PIPE_BLENDFACTOR_ONE: + return nir_imm_float(b, 1.0); + case PIPE_BLENDFACTOR_SRC_COLOR: + return src[channel]; + case PIPE_BLENDFACTOR_SRC_ALPHA: + return src[3]; + case PIPE_BLENDFACTOR_DST_ALPHA: + return dst[3]; + case PIPE_BLENDFACTOR_DST_COLOR: + return dst[channel]; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + if (channel != 3) { + return nir_fmin(b, + src[3], + nir_fsub(b, + nir_imm_float(b, 1.0), + dst[3])); + } else { + return nir_imm_float(b, 1.0); + } + case PIPE_BLENDFACTOR_CONST_COLOR: + return vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_X + channel); + case 
PIPE_BLENDFACTOR_CONST_ALPHA: + return vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_W); + case PIPE_BLENDFACTOR_ZERO: + return nir_imm_float(b, 0.0); + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + return nir_fsub(b, nir_imm_float(b, 1.0), src[channel]); + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + return nir_fsub(b, nir_imm_float(b, 1.0), src[3]); + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + return nir_fsub(b, nir_imm_float(b, 1.0), dst[3]); + case PIPE_BLENDFACTOR_INV_DST_COLOR: + return nir_fsub(b, nir_imm_float(b, 1.0), dst[channel]); + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + return nir_fsub(b, nir_imm_float(b, 1.0), + vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_X + channel)); + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + return nir_fsub(b, nir_imm_float(b, 1.0), + vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_W)); + + default: + case PIPE_BLENDFACTOR_SRC1_COLOR: + case PIPE_BLENDFACTOR_SRC1_ALPHA: + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + /* Unsupported. */ + fprintf(stderr, "Unknown blend factor %d\n", factor); + return nir_imm_float(b, 1.0); + } +} + +static nir_ssa_def * +vc4_blend_func(nir_builder *b, nir_ssa_def *src, nir_ssa_def *dst, + unsigned func) +{ + switch (func) { + case PIPE_BLEND_ADD: + return nir_fadd(b, src, dst); + case PIPE_BLEND_SUBTRACT: + return nir_fsub(b, src, dst); + case PIPE_BLEND_REVERSE_SUBTRACT: + return nir_fsub(b, dst, src); + case PIPE_BLEND_MIN: + return nir_fmin(b, src, dst); + case PIPE_BLEND_MAX: + return nir_fmax(b, src, dst); + + default: + /* Unsupported. 
*/ + fprintf(stderr, "Unknown blend func %d\n", func); + return src; + + } +} + +static void +vc4_do_blending(struct vc4_compile *c, nir_builder *b, nir_ssa_def **result, + nir_ssa_def **src_color, nir_ssa_def **dst_color) +{ + struct pipe_rt_blend_state *blend = &c->fs_key->blend; + + if (!blend->blend_enable) { + for (int i = 0; i < 4; i++) + result[i] = src_color[i]; + return; + } + + /* Clamp the src color to [0, 1]. Dest is already clamped. */ + for (int i = 0; i < 4; i++) + src_color[i] = nir_fsat(b, src_color[i]); + + nir_ssa_def *src_blend[4], *dst_blend[4]; + for (int i = 0; i < 4; i++) { + int src_factor = ((i != 3) ? blend->rgb_src_factor : + blend->alpha_src_factor); + int dst_factor = ((i != 3) ? blend->rgb_dst_factor : + blend->alpha_dst_factor); + src_blend[i] = nir_fmul(b, src_color[i], + vc4_blend_channel(b, + src_color, dst_color, + src_factor, i)); + dst_blend[i] = nir_fmul(b, dst_color[i], + vc4_blend_channel(b, + src_color, dst_color, + dst_factor, i)); + } + + for (int i = 0; i < 4; i++) { + result[i] = vc4_blend_func(b, src_blend[i], dst_blend[i], + ((i != 3) ? 
blend->rgb_func : + blend->alpha_func)); + } +} + +static nir_ssa_def * +vc4_logicop(nir_builder *b, int logicop_func, + nir_ssa_def *src, nir_ssa_def *dst) +{ + switch (logicop_func) { + case PIPE_LOGICOP_CLEAR: + return nir_imm_int(b, 0); + case PIPE_LOGICOP_NOR: + return nir_inot(b, nir_ior(b, src, dst)); + case PIPE_LOGICOP_AND_INVERTED: + return nir_iand(b, nir_inot(b, src), dst); + case PIPE_LOGICOP_COPY_INVERTED: + return nir_inot(b, src); + case PIPE_LOGICOP_AND_REVERSE: + return nir_iand(b, src, nir_inot(b, dst)); + case PIPE_LOGICOP_INVERT: + return nir_inot(b, dst); + case PIPE_LOGICOP_XOR: + return nir_ixor(b, src, dst); + case PIPE_LOGICOP_NAND: + return nir_inot(b, nir_iand(b, src, dst)); + case PIPE_LOGICOP_AND: + return nir_iand(b, src, dst); + case PIPE_LOGICOP_EQUIV: + return nir_inot(b, nir_ixor(b, src, dst)); + case PIPE_LOGICOP_NOOP: + return dst; + case PIPE_LOGICOP_OR_INVERTED: + return nir_ior(b, nir_inot(b, src), dst); + case PIPE_LOGICOP_OR_REVERSE: + return nir_ior(b, src, nir_inot(b, dst)); + case PIPE_LOGICOP_OR: + return nir_ior(b, src, dst); + case PIPE_LOGICOP_SET: + return nir_imm_int(b, ~0); + default: + fprintf(stderr, "Unknown logic op %d\n", logicop_func); + /* FALLTHROUGH */ + case PIPE_LOGICOP_COPY: + return src; + } +} + +static nir_ssa_def * +vc4_nir_pipe_compare_func(nir_builder *b, int func, + nir_ssa_def *src0, nir_ssa_def *src1) +{ + switch (func) { + default: + fprintf(stderr, "Unknown compare func %d\n", func); + /* FALLTHROUGH */ + case PIPE_FUNC_NEVER: + return nir_imm_int(b, 0); + case PIPE_FUNC_ALWAYS: + return nir_imm_int(b, ~0); + case PIPE_FUNC_EQUAL: + return nir_feq(b, src0, src1); + case PIPE_FUNC_NOTEQUAL: + return nir_fne(b, src0, src1); + case PIPE_FUNC_GREATER: + return nir_flt(b, src1, src0); + case PIPE_FUNC_GEQUAL: + return nir_fge(b, src0, src1); + case PIPE_FUNC_LESS: + return nir_flt(b, src0, src1); + case PIPE_FUNC_LEQUAL: + return nir_fge(b, src1, src0); + } +} + +static void 
+vc4_nir_emit_alpha_test_discard(struct vc4_compile *c, nir_builder *b, + nir_ssa_def *alpha) +{ + if (!c->fs_key->alpha_test) + return; + + nir_ssa_def *alpha_ref = + vc4_nir_get_state_uniform(b, QUNIFORM_ALPHA_REF); + nir_ssa_def *condition = + vc4_nir_pipe_compare_func(b, c->fs_key->alpha_test_func, + alpha, alpha_ref); + + nir_intrinsic_instr *discard = + nir_intrinsic_instr_create(b->shader, + nir_intrinsic_discard_if); + discard->num_components = 1; + discard->src[0] = nir_src_for_ssa(nir_inot(b, condition)); + nir_builder_instr_insert(b, &discard->instr); +} + +static void +vc4_nir_lower_blend_instr(struct vc4_compile *c, nir_builder *b, + nir_intrinsic_instr *intr) +{ + enum pipe_format color_format = c->fs_key->color_format; + const uint8_t *format_swiz = vc4_get_format_swizzle(color_format); + + /* Pull out the float src/dst color components. */ + nir_ssa_def *packed_dst_color = vc4_nir_get_dst_color(b); + nir_ssa_def *dst_vec4 = nir_unpack_unorm_4x8(b, packed_dst_color); + nir_ssa_def *src_color[4], *unpacked_dst_color[4]; + for (unsigned i = 0; i < 4; i++) { + src_color[i] = nir_swizzle(b, intr->src[0].ssa, &i, 1, false); + unpacked_dst_color[i] = nir_swizzle(b, dst_vec4, &i, 1, false); + } + + /* Unswizzle the destination color. */ + nir_ssa_def *dst_color[4]; + for (unsigned i = 0; i < 4; i++) { + dst_color[i] = vc4_nir_get_swizzled_channel(b, + unpacked_dst_color, + format_swiz[i]); + } + + vc4_nir_emit_alpha_test_discard(c, b, src_color[3]); + + /* Turn dst color to linear. 
*/ + if (util_format_is_srgb(color_format)) { + for (int i = 0; i < 3; i++) + dst_color[i] = vc4_nir_srgb_decode(b, dst_color[i]); + } + + nir_ssa_def *blend_color[4]; + vc4_do_blending(c, b, blend_color, src_color, dst_color); + + /* sRGB encode the output color */ + if (util_format_is_srgb(color_format)) { + for (int i = 0; i < 3; i++) + blend_color[i] = vc4_nir_srgb_encode(b, blend_color[i]); + } + + nir_ssa_def *swizzled_outputs[4]; + for (int i = 0; i < 4; i++) { + swizzled_outputs[i] = + vc4_nir_get_swizzled_channel(b, blend_color, + format_swiz[i]); + } + + nir_ssa_def *packed_color = + nir_pack_unorm_4x8(b, + nir_vec4(b, + swizzled_outputs[0], + swizzled_outputs[1], + swizzled_outputs[2], + swizzled_outputs[3])); + + packed_color = vc4_logicop(b, c->fs_key->logicop_func, + packed_color, packed_dst_color); + + /* If the bit isn't set in the color mask, then just return the + * original dst color, instead. + */ + uint32_t colormask = 0xffffffff; + for (int i = 0; i < 4; i++) { + if (format_swiz[i] < 4 && + !(c->fs_key->blend.colormask & (1 << format_swiz[i]))) { + colormask &= ~(0xff << (i * 8)); + } + } + packed_color = nir_ior(b, + nir_iand(b, packed_color, + nir_imm_int(b, colormask)), + nir_iand(b, packed_dst_color, + nir_imm_int(b, ~colormask))); + + /* Turn the old vec4 output into a store of the packed color. 
*/ + nir_instr_rewrite_src(&intr->instr, &intr->src[0], + nir_src_for_ssa(packed_color)); + intr->num_components = 1; +} + +static bool +vc4_nir_lower_blend_block(nir_block *block, void *state) +{ + struct vc4_compile *c = state; + + nir_foreach_instr(block, instr) { + if (instr->type != nir_instr_type_intrinsic) + continue; + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_store_output) + continue; + + nir_variable *output_var = NULL; + foreach_list_typed(nir_variable, var, node, &c->s->outputs) { + if (var->data.driver_location == intr->const_index[0]) { + output_var = var; + break; + } + } + assert(output_var); + unsigned semantic_name = output_var->data.location; + + if (semantic_name != TGSI_SEMANTIC_COLOR) + continue; + + nir_function_impl *impl = + nir_cf_node_get_function(&block->cf_node); + nir_builder b; + nir_builder_init(&b, impl); + nir_builder_insert_before_instr(&b, &intr->instr); + vc4_nir_lower_blend_instr(c, &b, intr); + } + return true; +} + +void +vc4_nir_lower_blend(struct vc4_compile *c) +{ + nir_foreach_overload(c->s, overload) { + if (overload->impl) { + nir_foreach_block(overload->impl, + vc4_nir_lower_blend_block, c); + + nir_metadata_preserve(overload->impl, + nir_metadata_block_index | + nir_metadata_dominance); + } + } +} diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_io.c b/src/gallium/drivers/vc4/vc4_nir_lower_io.c new file mode 100644 index 00000000000..229d41147d8 --- /dev/null +++ b/src/gallium/drivers/vc4/vc4_nir_lower_io.c @@ -0,0 +1,291 @@ +/* + * Copyright © 2015 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is 
furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "vc4_qir.h" +#include "tgsi/tgsi_info.h" +#include "glsl/nir/nir_builder.h" + +/** + * Walks the NIR generated by TGSI-to-NIR to lower its io intrinsics into + * something amenable to the VC4 architecture. + * + * Currently, it split inputs, outputs, and uniforms into scalars, drops any + * non-position outputs in coordinate shaders, and fixes up the addressing on + * indirect uniform loads. + */ + +static void +replace_intrinsic_with_vec4(nir_builder *b, nir_intrinsic_instr *intr, + nir_ssa_def **comps) +{ + + /* Batch things back together into a vec4. This will get split by the + * later ALU scalarization pass. + */ + nir_ssa_def *vec = nir_vec4(b, comps[0], comps[1], comps[2], comps[3]); + + /* Replace the old intrinsic with a reference to our reconstructed + * vec4. + */ + nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(vec), + ralloc_parent(b->impl)); + nir_instr_remove(&intr->instr); +} + +static void +vc4_nir_lower_input(struct vc4_compile *c, nir_builder *b, + nir_intrinsic_instr *intr) +{ + nir_builder_insert_before_instr(b, &intr->instr); + + if (c->stage == QSTAGE_FRAG && intr->const_index[0] == + VC4_NIR_TLB_COLOR_READ_INPUT) { + /* This doesn't need any lowering. 
*/ + return; + } + + nir_variable *input_var = NULL; + foreach_list_typed(nir_variable, var, node, &c->s->inputs) { + if (var->data.driver_location == intr->const_index[0]) { + input_var = var; + break; + } + } + assert(input_var); + int semantic_name = input_var->data.location; + int semantic_index = input_var->data.index; + + /* All TGSI-to-NIR inputs are vec4. */ + assert(intr->num_components == 4); + + /* Generate scalar loads equivalent to the original VEC4. */ + nir_ssa_def *dests[4]; + for (unsigned i = 0; i < intr->num_components; i++) { + nir_intrinsic_instr *intr_comp = + nir_intrinsic_instr_create(c->s, nir_intrinsic_load_input); + intr_comp->num_components = 1; + intr_comp->const_index[0] = intr->const_index[0] * 4 + i; + nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, NULL); + nir_builder_instr_insert(b, &intr_comp->instr); + + dests[i] = &intr_comp->dest.ssa; + } + + switch (c->stage) { + case QSTAGE_FRAG: + switch (semantic_name) { + case TGSI_SEMANTIC_FACE: + dests[0] = nir_fsub(b, + nir_imm_float(b, 1.0), + nir_fmul(b, + nir_i2f(b, dests[0]), + nir_imm_float(b, 2.0))); + dests[1] = nir_imm_float(b, 0.0); + dests[2] = nir_imm_float(b, 0.0); + dests[3] = nir_imm_float(b, 1.0); + break; + case TGSI_SEMANTIC_GENERIC: + if (c->fs_key->point_sprite_mask & + (1 << semantic_index)) { + if (!c->fs_key->is_points) { + dests[0] = nir_imm_float(b, 0.0); + dests[1] = nir_imm_float(b, 0.0); + } + if (c->fs_key->point_coord_upper_left) { + dests[1] = nir_fsub(b, + nir_imm_float(b, 1.0), + dests[1]); + } + dests[2] = nir_imm_float(b, 0.0); + dests[3] = nir_imm_float(b, 1.0); + } + break; + } + break; + case QSTAGE_COORD: + case QSTAGE_VERT: + break; + } + + replace_intrinsic_with_vec4(b, intr, dests); +} + +static void +vc4_nir_lower_output(struct vc4_compile *c, nir_builder *b, + nir_intrinsic_instr *intr) +{ + nir_variable *output_var = NULL; + foreach_list_typed(nir_variable, var, node, &c->s->outputs) { + if (var->data.driver_location == 
intr->const_index[0]) { + output_var = var; + break; + } + } + assert(output_var); + unsigned semantic_name = output_var->data.location; + + if (c->stage == QSTAGE_COORD && + (semantic_name != TGSI_SEMANTIC_POSITION && + semantic_name != TGSI_SEMANTIC_PSIZE)) { + nir_instr_remove(&intr->instr); + return; + } + + /* Color output is lowered by vc4_nir_lower_blend(). */ + if (c->stage == QSTAGE_FRAG && semantic_name == TGSI_SEMANTIC_COLOR) { + intr->const_index[0] *= 4; + return; + } + + /* All TGSI-to-NIR outputs are VEC4. */ + assert(intr->num_components == 4); + + nir_builder_insert_before_instr(b, &intr->instr); + + for (unsigned i = 0; i < intr->num_components; i++) { + nir_intrinsic_instr *intr_comp = + nir_intrinsic_instr_create(c->s, nir_intrinsic_store_output); + intr_comp->num_components = 1; + intr_comp->const_index[0] = intr->const_index[0] * 4 + i; + + assert(intr->src[0].is_ssa); + intr_comp->src[0] = nir_src_for_ssa(nir_swizzle(b, + intr->src[0].ssa, + &i, 1, false)); + nir_builder_instr_insert(b, &intr_comp->instr); + } + + nir_instr_remove(&intr->instr); +} + +static void +vc4_nir_lower_uniform(struct vc4_compile *c, nir_builder *b, + nir_intrinsic_instr *intr) +{ + /* All TGSI-to-NIR uniform loads are vec4, but we may create dword + * loads in our lowering passes. + */ + if (intr->num_components == 1) + return; + assert(intr->num_components == 4); + + nir_builder_insert_before_instr(b, &intr->instr); + + /* Generate scalar loads equivalent to the original VEC4. */ + nir_ssa_def *dests[4]; + for (unsigned i = 0; i < intr->num_components; i++) { + nir_intrinsic_instr *intr_comp = + nir_intrinsic_instr_create(c->s, intr->intrinsic); + intr_comp->num_components = 1; + nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, NULL); + + if (intr->intrinsic == nir_intrinsic_load_uniform_indirect) { + /* Convert the variable TGSI register index to a byte + * offset. 
+ */ + intr_comp->src[0] = + nir_src_for_ssa(nir_ishl(b, + intr->src[0].ssa, + nir_imm_int(b, 4))); + + /* Convert the offset to be a byte index, too. */ + intr_comp->const_index[0] = (intr->const_index[0] * 16 + + i * 4); + } else { + /* We want a dword index for non-indirect uniform + * loads. + */ + intr_comp->const_index[0] = (intr->const_index[0] * 4 + + i); + } + + dests[i] = &intr_comp->dest.ssa; + + nir_builder_instr_insert(b, &intr_comp->instr); + } + + replace_intrinsic_with_vec4(b, intr, dests); +} + +static void +vc4_nir_lower_io_instr(struct vc4_compile *c, nir_builder *b, + struct nir_instr *instr) +{ + if (instr->type != nir_instr_type_intrinsic) + return; + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + + switch (intr->intrinsic) { + case nir_intrinsic_load_input: + vc4_nir_lower_input(c, b, intr); + break; + + case nir_intrinsic_store_output: + vc4_nir_lower_output(c, b, intr); + break; + + case nir_intrinsic_load_uniform: + case nir_intrinsic_load_uniform_indirect: + vc4_nir_lower_uniform(c, b, intr); + break; + + default: + break; + } +} + +static bool +vc4_nir_lower_io_block(nir_block *block, void *arg) +{ + struct vc4_compile *c = arg; + nir_function_impl *impl = + nir_cf_node_get_function(&block->cf_node); + + nir_builder b; + nir_builder_init(&b, impl); + + nir_foreach_instr_safe(block, instr) + vc4_nir_lower_io_instr(c, &b, instr); + + return true; +} + +static bool +vc4_nir_lower_io_impl(struct vc4_compile *c, nir_function_impl *impl) +{ + nir_foreach_block(impl, vc4_nir_lower_io_block, c); + + nir_metadata_preserve(impl, nir_metadata_block_index | + nir_metadata_dominance); + + return true; +} + +void +vc4_nir_lower_io(struct vc4_compile *c) +{ + nir_foreach_overload(c->s, overload) { + if (overload->impl) + vc4_nir_lower_io_impl(c, overload->impl); + } +} diff --git a/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c b/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c index d6d2fbf257f..a755de9aa41 100644 --- 
a/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c +++ b/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c @@ -67,10 +67,7 @@ qir_opt_copy_propagation(struct vc4_compile *c) if (inst->op == QOP_MOV && inst->dst.file == QFILE_TEMP && - inst->src[0].file != QFILE_VPM && - !(inst->src[0].file == QFILE_TEMP && - (c->defs[inst->src[0].index]->op == QOP_TEX_RESULT || - c->defs[inst->src[0].index]->op == QOP_TLB_COLOR_READ))) { + inst->src[0].file != QFILE_VPM) { movs[inst->dst.index] = inst->src[0]; } } diff --git a/src/gallium/drivers/vc4/vc4_opt_cse.c b/src/gallium/drivers/vc4/vc4_opt_cse.c index 92c8260eb59..0e5480ea781 100644 --- a/src/gallium/drivers/vc4/vc4_opt_cse.c +++ b/src/gallium/drivers/vc4/vc4_opt_cse.c @@ -46,8 +46,7 @@ struct inst_key { struct qreg src[4]; /** * If the instruction depends on the flags, how many SFs have been - * seen before this instruction, or if it depends on r4, how many r4 - * writes have been seen. + * seen before this instruction. */ uint32_t implicit_arg_update_count; }; @@ -63,8 +62,7 @@ inst_key_equals(const void *a, const void *b) static struct qinst * vc4_find_cse(struct vc4_compile *c, struct hash_table *ht, - struct qinst *inst, uint32_t sf_count, - uint32_t r4_count) + struct qinst *inst, uint32_t sf_count) { if (inst->dst.file != QFILE_TEMP || inst->op == QOP_MOV || @@ -79,8 +77,6 @@ vc4_find_cse(struct vc4_compile *c, struct hash_table *ht, qir_get_op_nsrc(inst->op) * sizeof(key.src[0])); if (qir_depends_on_flags(inst)) key.implicit_arg_update_count = sf_count; - if (qir_reads_r4(inst)) - key.implicit_arg_update_count = r4_count; uint32_t hash = _mesa_hash_data(&key, sizeof(key)); struct hash_entry *entry = @@ -121,7 +117,7 @@ bool qir_opt_cse(struct vc4_compile *c) { bool progress = false; - uint32_t sf_count = 0, r4_count = 0; + uint32_t sf_count = 0; struct hash_table *ht = _mesa_hash_table_create(NULL, NULL, inst_key_equals); @@ -130,15 +126,15 @@ qir_opt_cse(struct vc4_compile *c) list_for_each_entry(struct qinst, 
inst, &c->instructions, link) { if (qir_has_side_effects(c, inst) || - qir_has_side_effect_reads(c, inst)) { + qir_has_side_effect_reads(c, inst) || + inst->op == QOP_TLB_COLOR_READ) { continue; } if (inst->sf) { sf_count++; } else { - struct qinst *cse = vc4_find_cse(c, ht, inst, - sf_count, r4_count); + struct qinst *cse = vc4_find_cse(c, ht, inst, sf_count); if (cse) { inst->src[0] = cse->dst; for (int i = 1; i < qir_get_op_nsrc(inst->op); @@ -154,9 +150,6 @@ qir_opt_cse(struct vc4_compile *c) } } } - - if (qir_writes_r4(inst)) - r4_count++; } ralloc_free(ht); diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index ba47c51d9bd..13c472152d8 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -23,21 +23,19 @@ */ #include <inttypes.h> -#include "pipe/p_state.h" #include "util/u_format.h" #include "util/u_hash.h" #include "util/u_math.h" #include "util/u_memory.h" -#include "util/u_pack_color.h" -#include "util/format_srgb.h" #include "util/ralloc.h" #include "util/hash_table.h" #include "tgsi/tgsi_dump.h" #include "tgsi/tgsi_info.h" #include "tgsi/tgsi_lowering.h" #include "tgsi/tgsi_parse.h" +#include "glsl/nir/nir.h" +#include "glsl/nir/nir_builder.h" #include "nir/tgsi_to_nir.h" - #include "vc4_context.h" #include "vc4_qpu.h" #include "vc4_qir.h" @@ -45,51 +43,8 @@ #include "simpenrose/simpenrose.h" #endif -struct vc4_key { - struct vc4_uncompiled_shader *shader_state; - struct { - enum pipe_format format; - unsigned compare_mode:1; - unsigned compare_func:3; - unsigned wrap_s:3; - unsigned wrap_t:3; - uint8_t swizzle[4]; - } tex[VC4_MAX_TEXTURE_SAMPLERS]; - uint8_t ucp_enables; -}; - -struct vc4_fs_key { - struct vc4_key base; - enum pipe_format color_format; - bool depth_enabled; - bool stencil_enabled; - bool stencil_twoside; - bool stencil_full_writemasks; - bool is_points; - bool is_lines; - bool alpha_test; - bool point_coord_upper_left; - bool light_twoside; - uint8_t 
alpha_test_func; - uint8_t logicop_func; - uint32_t point_sprite_mask; - - struct pipe_rt_blend_state blend; -}; - -struct vc4_vs_key { - struct vc4_key base; - - /** - * This is a proxy for the array of FS input semantics, which is - * larger than we would want to put in the key. - */ - uint64_t compiled_fs_id; - - enum pipe_format attr_formats[8]; - bool is_coord; - bool per_vertex_point_size; -}; +static struct qreg +ntq_get_src(struct vc4_compile *c, nir_src src, int i); static void resize_qreg_array(struct vc4_compile *c, @@ -113,10 +68,10 @@ resize_qreg_array(struct vc4_compile *c, } static struct qreg -indirect_uniform_load(struct vc4_compile *c, - struct qreg indirect_offset, - unsigned offset) +indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr) { + struct qreg indirect_offset = ntq_get_src(c, intr->src[0], 0); + uint32_t offset = intr->const_index[0]; struct vc4_compiler_ubo_range *range = NULL; unsigned i; for (i = 0; i < c->num_uniform_ranges; i++) { @@ -138,10 +93,6 @@ indirect_uniform_load(struct vc4_compile *c, }; offset -= range->src_offset; - /* Translate the user's TGSI register index from the TGSI register - * base to a byte offset. - */ - indirect_offset = qir_SHL(c, indirect_offset, qir_uniform_ui(c, 4)); /* Adjust for where we stored the TGSI register base. 
*/ indirect_offset = qir_ADD(c, indirect_offset, @@ -155,24 +106,70 @@ indirect_uniform_load(struct vc4_compile *c, range->size - 4))); qir_TEX_DIRECT(c, indirect_offset, qir_uniform(c, QUNIFORM_UBO_ADDR, 0)); - struct qreg r4 = qir_TEX_RESULT(c); c->num_texture_samples++; - return qir_MOV(c, r4); + return qir_TEX_RESULT(c); } -static struct qreg * -ntq_get_dest(struct vc4_compile *c, nir_dest dest) +nir_ssa_def *vc4_nir_get_state_uniform(struct nir_builder *b, + enum quniform_contents contents) { - assert(!dest.is_ssa); - nir_register *reg = dest.reg.reg; - struct hash_entry *entry = _mesa_hash_table_search(c->def_ht, reg); - assert(reg->num_array_elems == 0); - assert(dest.reg.base_offset == 0); + nir_intrinsic_instr *intr = + nir_intrinsic_instr_create(b->shader, + nir_intrinsic_load_uniform); + intr->const_index[0] = VC4_NIR_STATE_UNIFORM_OFFSET + contents; + intr->num_components = 1; + nir_ssa_dest_init(&intr->instr, &intr->dest, 1, NULL); + nir_builder_instr_insert(b, &intr->instr); + return &intr->dest.ssa; +} - struct qreg *qregs = entry->data; +nir_ssa_def * +vc4_nir_get_swizzled_channel(nir_builder *b, nir_ssa_def **srcs, int swiz) +{ + switch (swiz) { + default: + case UTIL_FORMAT_SWIZZLE_NONE: + fprintf(stderr, "warning: unknown swizzle\n"); + /* FALLTHROUGH */ + case UTIL_FORMAT_SWIZZLE_0: + return nir_imm_float(b, 0.0); + case UTIL_FORMAT_SWIZZLE_1: + return nir_imm_float(b, 1.0); + case UTIL_FORMAT_SWIZZLE_X: + case UTIL_FORMAT_SWIZZLE_Y: + case UTIL_FORMAT_SWIZZLE_Z: + case UTIL_FORMAT_SWIZZLE_W: + return srcs[swiz]; + } +} + +static struct qreg * +ntq_init_ssa_def(struct vc4_compile *c, nir_ssa_def *def) +{ + struct qreg *qregs = ralloc_array(c->def_ht, struct qreg, + def->num_components); + _mesa_hash_table_insert(c->def_ht, def, qregs); return qregs; } +static struct qreg * +ntq_get_dest(struct vc4_compile *c, nir_dest *dest) +{ + if (dest->is_ssa) { + struct qreg *qregs = ntq_init_ssa_def(c, &dest->ssa); + for (int i = 0; i < 
dest->ssa.num_components; i++) + qregs[i] = c->undef; + return qregs; + } else { + nir_register *reg = dest->reg.reg; + assert(dest->reg.base_offset == 0); + assert(reg->num_array_elems == 0); + struct hash_entry *entry = + _mesa_hash_table_search(c->def_ht, reg); + return entry->data; + } +} + static struct qreg ntq_get_src(struct vc4_compile *c, nir_src src, int i) { @@ -282,22 +279,6 @@ qir_srgb_decode(struct vc4_compile *c, struct qreg srgb) } static struct qreg -qir_srgb_encode(struct vc4_compile *c, struct qreg linear) -{ - struct qreg low = qir_FMUL(c, linear, qir_uniform_f(c, 12.92)); - struct qreg high = qir_FSUB(c, - qir_FMUL(c, - qir_uniform_f(c, 1.055), - qir_POW(c, - linear, - qir_uniform_f(c, 0.41666))), - qir_uniform_f(c, 0.055)); - - qir_SF(c, qir_FSUB(c, linear, qir_uniform_f(c, 0.0031308))); - return qir_SEL_X_Y_NS(c, low, high); -} - -static struct qreg ntq_umul(struct vc4_compile *c, struct qreg src0, struct qreg src1) { struct qreg src0_hi = qir_SHR(c, src0, @@ -410,13 +391,13 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr) qir_TEX_S(c, s, texture_u[next_texture_u++]); c->num_texture_samples++; - struct qreg r4 = qir_TEX_RESULT(c); + struct qreg tex = qir_TEX_RESULT(c); enum pipe_format format = c->key->tex[unit].format; struct qreg unpacked[4]; if (util_format_is_depth_or_stencil(format)) { - struct qreg depthf = qir_ITOF(c, qir_SHR(c, r4, + struct qreg depthf = qir_ITOF(c, qir_SHR(c, tex, qir_uniform_ui(c, 8))); struct qreg normalized = qir_FMUL(c, depthf, qir_uniform_f(c, 1.0f/0xffffff)); @@ -468,7 +449,7 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr) unpacked[i] = depth_output; } else { for (int i = 0; i < 4; i++) - unpacked[i] = qir_R4_UNPACK(c, r4, i); + unpacked[i] = qir_UNPACK_8_F(c, tex, i); } const uint8_t *format_swiz = vc4_get_format_swizzle(format); @@ -484,7 +465,7 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr) texture_output[i]); } - struct qreg *dest = ntq_get_dest(c, instr->dest); + 
struct qreg *dest = ntq_get_dest(c, &instr->dest); for (int i = 0; i < 4; i++) { dest[i] = get_swizzled_channel(c, texture_output, c->key->tex[unit].swizzle[i]); @@ -558,7 +539,7 @@ ntq_fsin(struct vc4_compile *c, struct qreg src) struct qreg scaled_x = qir_FMUL(c, src, - qir_uniform_f(c, 1.0f / (M_PI * 2.0f))); + qir_uniform_f(c, 1.0 / (M_PI * 2.0))); struct qreg x = qir_FADD(c, ntq_ffract(c, scaled_x), @@ -756,26 +737,6 @@ emit_fragcoord_input(struct vc4_compile *c, int attr) c->inputs[attr * 4 + 3] = qir_RCP(c, qir_FRAG_W(c)); } -static void -emit_point_coord_input(struct vc4_compile *c, int attr) -{ - if (c->point_x.file == QFILE_NULL) { - c->point_x = qir_uniform_f(c, 0.0); - c->point_y = qir_uniform_f(c, 0.0); - } - - c->inputs[attr * 4 + 0] = c->point_x; - if (c->fs_key->point_coord_upper_left) { - c->inputs[attr * 4 + 1] = qir_FSUB(c, - qir_uniform_f(c, 1.0), - c->point_y); - } else { - c->inputs[attr * 4 + 1] = c->point_y; - } - c->inputs[attr * 4 + 2] = qir_uniform_f(c, 0.0); - c->inputs[attr * 4 + 3] = qir_uniform_f(c, 1.0); -} - static struct qreg emit_fragment_varying(struct vc4_compile *c, uint8_t semantic, uint8_t index, uint8_t swizzle) @@ -817,19 +778,6 @@ emit_fragment_input(struct vc4_compile *c, int attr, } static void -emit_face_input(struct vc4_compile *c, int attr) -{ - c->inputs[attr * 4 + 0] = qir_FSUB(c, - qir_uniform_f(c, 1.0), - qir_FMUL(c, - qir_ITOF(c, qir_FRAG_REV_FLAG(c)), - qir_uniform_f(c, 2.0))); - c->inputs[attr * 4 + 1] = qir_uniform_f(c, 0.0); - c->inputs[attr * 4 + 2] = qir_uniform_f(c, 0.0); - c->inputs[attr * 4 + 3] = qir_uniform_f(c, 1.0); -} - -static void add_output(struct vc4_compile *c, uint32_t decl_offset, uint8_t semantic_name, @@ -884,12 +832,38 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr) for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) srcs[i] = ntq_get_src(c, instr->src[i].src, instr->src[i].swizzle[0]); - struct qreg *dest = ntq_get_dest(c, instr->dest.dest); + struct qreg *dest = 
ntq_get_dest(c, &instr->dest.dest); for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) dest[i] = srcs[i]; return; } + if (instr->op == nir_op_pack_unorm_4x8) { + struct qreg result; + for (int i = 0; i < 4; i++) { + struct qreg src = ntq_get_src(c, instr->src[0].src, + instr->src[0].swizzle[i]); + if (i == 0) + result = qir_PACK_8888_F(c, src); + else + result = qir_PACK_8_F(c, result, src, i); + } + struct qreg *dest = ntq_get_dest(c, &instr->dest.dest); + *dest = result; + return; + } + + if (instr->op == nir_op_unpack_unorm_4x8) { + struct qreg src = ntq_get_src(c, instr->src[0].src, + instr->src[0].swizzle[0]); + struct qreg *dest = ntq_get_dest(c, &instr->dest.dest); + for (int i = 0; i < 4; i++) { + if (instr->dest.write_mask & (1 << i)) + dest[i] = qir_UNPACK_8_F(c, src, i); + } + return; + } + /* General case: We can just grab the one used channel per src. */ struct qreg src[nir_op_infos[instr->op].num_inputs]; for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { @@ -898,7 +872,7 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr) /* Pick the channel to store the output in. 
*/ assert(!instr->dest.saturate); - struct qreg *dest = ntq_get_dest(c, instr->dest.dest); + struct qreg *dest = ntq_get_dest(c, &instr->dest.dest); assert(util_is_power_of_two(instr->dest.write_mask)); dest += ffs(instr->dest.write_mask) - 1; @@ -1092,167 +1066,6 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr) } } -static struct qreg -vc4_blend_channel(struct vc4_compile *c, - struct qreg *dst, - struct qreg *src, - struct qreg val, - unsigned factor, - int channel) -{ - switch(factor) { - case PIPE_BLENDFACTOR_ONE: - return val; - case PIPE_BLENDFACTOR_SRC_COLOR: - return qir_FMUL(c, val, src[channel]); - case PIPE_BLENDFACTOR_SRC_ALPHA: - return qir_FMUL(c, val, src[3]); - case PIPE_BLENDFACTOR_DST_ALPHA: - return qir_FMUL(c, val, dst[3]); - case PIPE_BLENDFACTOR_DST_COLOR: - return qir_FMUL(c, val, dst[channel]); - case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: - if (channel != 3) { - return qir_FMUL(c, - val, - qir_FMIN(c, - src[3], - qir_FSUB(c, - qir_uniform_f(c, 1.0), - dst[3]))); - } else { - return val; - } - case PIPE_BLENDFACTOR_CONST_COLOR: - return qir_FMUL(c, val, - qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR, - channel)); - case PIPE_BLENDFACTOR_CONST_ALPHA: - return qir_FMUL(c, val, - qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR, 3)); - case PIPE_BLENDFACTOR_ZERO: - return qir_uniform_f(c, 0.0); - case PIPE_BLENDFACTOR_INV_SRC_COLOR: - return qir_FMUL(c, val, qir_FSUB(c, qir_uniform_f(c, 1.0), - src[channel])); - case PIPE_BLENDFACTOR_INV_SRC_ALPHA: - return qir_FMUL(c, val, qir_FSUB(c, qir_uniform_f(c, 1.0), - src[3])); - case PIPE_BLENDFACTOR_INV_DST_ALPHA: - return qir_FMUL(c, val, qir_FSUB(c, qir_uniform_f(c, 1.0), - dst[3])); - case PIPE_BLENDFACTOR_INV_DST_COLOR: - return qir_FMUL(c, val, qir_FSUB(c, qir_uniform_f(c, 1.0), - dst[channel])); - case PIPE_BLENDFACTOR_INV_CONST_COLOR: - return qir_FMUL(c, val, - qir_FSUB(c, qir_uniform_f(c, 1.0), - qir_uniform(c, - QUNIFORM_BLEND_CONST_COLOR, - channel))); - case 
PIPE_BLENDFACTOR_INV_CONST_ALPHA: - return qir_FMUL(c, val, - qir_FSUB(c, qir_uniform_f(c, 1.0), - qir_uniform(c, - QUNIFORM_BLEND_CONST_COLOR, - 3))); - - default: - case PIPE_BLENDFACTOR_SRC1_COLOR: - case PIPE_BLENDFACTOR_SRC1_ALPHA: - case PIPE_BLENDFACTOR_INV_SRC1_COLOR: - case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: - /* Unsupported. */ - fprintf(stderr, "Unknown blend factor %d\n", factor); - return val; - } -} - -static struct qreg -vc4_blend_func(struct vc4_compile *c, - struct qreg src, struct qreg dst, - unsigned func) -{ - switch (func) { - case PIPE_BLEND_ADD: - return qir_FADD(c, src, dst); - case PIPE_BLEND_SUBTRACT: - return qir_FSUB(c, src, dst); - case PIPE_BLEND_REVERSE_SUBTRACT: - return qir_FSUB(c, dst, src); - case PIPE_BLEND_MIN: - return qir_FMIN(c, src, dst); - case PIPE_BLEND_MAX: - return qir_FMAX(c, src, dst); - - default: - /* Unsupported. */ - fprintf(stderr, "Unknown blend func %d\n", func); - return src; - - } -} - -/** - * Implements fixed function blending in shader code. - * - * VC4 doesn't have any hardware support for blending. Instead, you read the - * current contents of the destination from the tile buffer after having - * waited for the scoreboard (which is handled by vc4_qpu_emit.c), then do - * math using your output color and that destination value, and update the - * output color appropriately. 
- */ -static void -vc4_blend(struct vc4_compile *c, struct qreg *result, - struct qreg *dst_color, struct qreg *src_color) -{ - struct pipe_rt_blend_state *blend = &c->fs_key->blend; - - if (!blend->blend_enable) { - for (int i = 0; i < 4; i++) - result[i] = src_color[i]; - return; - } - - struct qreg clamped_src[4]; - struct qreg clamped_dst[4]; - for (int i = 0; i < 4; i++) { - clamped_src[i] = qir_SAT(c, src_color[i]); - clamped_dst[i] = qir_SAT(c, dst_color[i]); - } - src_color = clamped_src; - dst_color = clamped_dst; - - struct qreg src_blend[4], dst_blend[4]; - for (int i = 0; i < 3; i++) { - src_blend[i] = vc4_blend_channel(c, - dst_color, src_color, - src_color[i], - blend->rgb_src_factor, i); - dst_blend[i] = vc4_blend_channel(c, - dst_color, src_color, - dst_color[i], - blend->rgb_dst_factor, i); - } - src_blend[3] = vc4_blend_channel(c, - dst_color, src_color, - src_color[3], - blend->alpha_src_factor, 3); - dst_blend[3] = vc4_blend_channel(c, - dst_color, src_color, - dst_color[3], - blend->alpha_dst_factor, 3); - - for (int i = 0; i < 3; i++) { - result[i] = vc4_blend_func(c, - src_blend[i], dst_blend[i], - blend->rgb_func); - } - result[3] = vc4_blend_func(c, - src_blend[3], dst_blend[3], - blend->alpha_func); -} - static void clip_distance_discard(struct vc4_compile *c) { @@ -1276,167 +1089,15 @@ clip_distance_discard(struct vc4_compile *c) } static void -alpha_test_discard(struct vc4_compile *c) -{ - struct qreg src_alpha; - struct qreg alpha_ref = qir_uniform(c, QUNIFORM_ALPHA_REF, 0); - - if (!c->fs_key->alpha_test) - return; - - if (c->output_color_index != -1) - src_alpha = c->outputs[c->output_color_index + 3]; - else - src_alpha = qir_uniform_f(c, 1.0); - - if (c->discard.file == QFILE_NULL) - c->discard = qir_uniform_ui(c, 0); - - switch (c->fs_key->alpha_test_func) { - case PIPE_FUNC_NEVER: - c->discard = qir_uniform_ui(c, ~0); - break; - case PIPE_FUNC_ALWAYS: - break; - case PIPE_FUNC_EQUAL: - qir_SF(c, qir_FSUB(c, src_alpha, alpha_ref)); 
- c->discard = qir_SEL_X_Y_ZS(c, c->discard, - qir_uniform_ui(c, ~0)); - break; - case PIPE_FUNC_NOTEQUAL: - qir_SF(c, qir_FSUB(c, src_alpha, alpha_ref)); - c->discard = qir_SEL_X_Y_ZC(c, c->discard, - qir_uniform_ui(c, ~0)); - break; - case PIPE_FUNC_GREATER: - qir_SF(c, qir_FSUB(c, src_alpha, alpha_ref)); - c->discard = qir_SEL_X_Y_NC(c, c->discard, - qir_uniform_ui(c, ~0)); - break; - case PIPE_FUNC_GEQUAL: - qir_SF(c, qir_FSUB(c, alpha_ref, src_alpha)); - c->discard = qir_SEL_X_Y_NS(c, c->discard, - qir_uniform_ui(c, ~0)); - break; - case PIPE_FUNC_LESS: - qir_SF(c, qir_FSUB(c, src_alpha, alpha_ref)); - c->discard = qir_SEL_X_Y_NS(c, c->discard, - qir_uniform_ui(c, ~0)); - break; - case PIPE_FUNC_LEQUAL: - qir_SF(c, qir_FSUB(c, alpha_ref, src_alpha)); - c->discard = qir_SEL_X_Y_NC(c, c->discard, - qir_uniform_ui(c, ~0)); - break; - } -} - -static struct qreg -vc4_logicop(struct vc4_compile *c, struct qreg src, struct qreg dst) -{ - switch (c->fs_key->logicop_func) { - case PIPE_LOGICOP_CLEAR: - return qir_uniform_f(c, 0.0); - case PIPE_LOGICOP_NOR: - return qir_NOT(c, qir_OR(c, src, dst)); - case PIPE_LOGICOP_AND_INVERTED: - return qir_AND(c, qir_NOT(c, src), dst); - case PIPE_LOGICOP_COPY_INVERTED: - return qir_NOT(c, src); - case PIPE_LOGICOP_AND_REVERSE: - return qir_AND(c, src, qir_NOT(c, dst)); - case PIPE_LOGICOP_INVERT: - return qir_NOT(c, dst); - case PIPE_LOGICOP_XOR: - return qir_XOR(c, src, dst); - case PIPE_LOGICOP_NAND: - return qir_NOT(c, qir_AND(c, src, dst)); - case PIPE_LOGICOP_AND: - return qir_AND(c, src, dst); - case PIPE_LOGICOP_EQUIV: - return qir_NOT(c, qir_XOR(c, src, dst)); - case PIPE_LOGICOP_NOOP: - return dst; - case PIPE_LOGICOP_OR_INVERTED: - return qir_OR(c, qir_NOT(c, src), dst); - case PIPE_LOGICOP_OR_REVERSE: - return qir_OR(c, src, qir_NOT(c, dst)); - case PIPE_LOGICOP_OR: - return qir_OR(c, src, dst); - case PIPE_LOGICOP_SET: - return qir_uniform_ui(c, ~0); - case PIPE_LOGICOP_COPY: - default: - return src; - } -} - -static 
void emit_frag_end(struct vc4_compile *c) { clip_distance_discard(c); - alpha_test_discard(c); - - enum pipe_format color_format = c->fs_key->color_format; - const uint8_t *format_swiz = vc4_get_format_swizzle(color_format); - struct qreg tlb_read_color[4] = { c->undef, c->undef, c->undef, c->undef }; - struct qreg dst_color[4] = { c->undef, c->undef, c->undef, c->undef }; - struct qreg linear_dst_color[4] = { c->undef, c->undef, c->undef, c->undef }; - struct qreg packed_dst_color = c->undef; - - if (c->fs_key->blend.blend_enable || - c->fs_key->blend.colormask != 0xf || - c->fs_key->logicop_func != PIPE_LOGICOP_COPY) { - struct qreg r4 = qir_TLB_COLOR_READ(c); - for (int i = 0; i < 4; i++) - tlb_read_color[i] = qir_R4_UNPACK(c, r4, i); - for (int i = 0; i < 4; i++) { - dst_color[i] = get_swizzled_channel(c, - tlb_read_color, - format_swiz[i]); - if (util_format_is_srgb(color_format) && i != 3) { - linear_dst_color[i] = - qir_srgb_decode(c, dst_color[i]); - } else { - linear_dst_color[i] = dst_color[i]; - } - } - /* Save the packed value for logic ops. Can't reuse r4 - * because other things might smash it (like sRGB) - */ - packed_dst_color = qir_MOV(c, r4); - } - - struct qreg blend_color[4]; - struct qreg undef_array[4] = { - c->undef, c->undef, c->undef, c->undef - }; - vc4_blend(c, blend_color, linear_dst_color, - (c->output_color_index != -1 ? - c->outputs + c->output_color_index : - undef_array)); - - if (util_format_is_srgb(color_format)) { - for (int i = 0; i < 3; i++) - blend_color[i] = qir_srgb_encode(c, blend_color[i]); - } - - /* Debug: Sometimes you're getting a black output and just want to see - * if the FS is getting executed at all. Spam magenta into the color - * output. 
- */ - if (0) { - blend_color[0] = qir_uniform_f(c, 1.0); - blend_color[1] = qir_uniform_f(c, 0.0); - blend_color[2] = qir_uniform_f(c, 1.0); - blend_color[3] = qir_uniform_f(c, 0.5); - } - - struct qreg swizzled_outputs[4]; - for (int i = 0; i < 4; i++) { - swizzled_outputs[i] = get_swizzled_channel(c, blend_color, - format_swiz[i]); + struct qreg color; + if (c->output_color_index != -1) { + color = c->outputs[c->output_color_index]; + } else { + color = qir_uniform_ui(c, 0); } if (c->discard.file != QFILE_NULL) @@ -1463,47 +1124,7 @@ emit_frag_end(struct vc4_compile *c) qir_TLB_Z_WRITE(c, z); } - struct qreg packed_color = c->undef; - for (int i = 0; i < 4; i++) { - if (swizzled_outputs[i].file == QFILE_NULL) - continue; - if (packed_color.file == QFILE_NULL) { - packed_color = qir_PACK_8888_F(c, swizzled_outputs[i]); - } else { - packed_color = qir_PACK_8_F(c, - packed_color, - swizzled_outputs[i], - i); - } - } - - if (packed_color.file == QFILE_NULL) - packed_color = qir_uniform_ui(c, 0); - - if (c->fs_key->logicop_func != PIPE_LOGICOP_COPY) { - packed_color = vc4_logicop(c, packed_color, packed_dst_color); - } - - /* If the bit isn't set in the color mask, then just return the - * original dst color, instead. 
- */ - uint32_t colormask = 0xffffffff; - for (int i = 0; i < 4; i++) { - if (format_swiz[i] < 4 && - !(c->fs_key->blend.colormask & (1 << format_swiz[i]))) { - colormask &= ~(0xff << (i * 8)); - } - } - if (colormask != 0xffffffff) { - packed_color = qir_OR(c, - qir_AND(c, packed_color, - qir_uniform_ui(c, colormask)), - qir_AND(c, packed_dst_color, - qir_uniform_ui(c, ~colormask))); - } - - qir_emit(c, qir_inst(QOP_TLB_COLOR_WRITE, c->undef, - packed_color, c->undef)); + qir_TLB_COLOR_WRITE(c, color); } static void @@ -1695,6 +1316,7 @@ vc4_optimize_nir(struct nir_shader *s) progress = nir_opt_peephole_select(s) || progress; progress = nir_opt_algebraic(s) || progress; progress = nir_opt_constant_folding(s) || progress; + progress = nir_opt_undef(s) || progress; } while (progress); } @@ -1736,6 +1358,7 @@ ntq_setup_inputs(struct vc4_compile *c) unsigned loc = var->data.driver_location; assert(array_len == 1); + (void)array_len; resize_qreg_array(c, &c->inputs, &c->inputs_array_size, (loc + 1) * 4); @@ -1743,11 +1366,12 @@ ntq_setup_inputs(struct vc4_compile *c) if (semantic_name == TGSI_SEMANTIC_POSITION) { emit_fragcoord_input(c, loc); } else if (semantic_name == TGSI_SEMANTIC_FACE) { - emit_face_input(c, loc); + c->inputs[loc * 4 + 0] = qir_FRAG_REV_FLAG(c); } else if (semantic_name == TGSI_SEMANTIC_GENERIC && (c->fs_key->point_sprite_mask & (1 << semantic_index))) { - emit_point_coord_input(c, loc); + c->inputs[loc * 4 + 0] = c->point_x; + c->inputs[loc * 4 + 1] = c->point_y; } else { emit_fragment_input(c, loc, semantic_name, @@ -1770,6 +1394,13 @@ ntq_setup_outputs(struct vc4_compile *c) unsigned loc = var->data.driver_location * 4; assert(array_len == 1); + (void)array_len; + + /* NIR hack to pass through + * TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS */ + if (semantic_name == TGSI_SEMANTIC_COLOR && + semantic_index == -1) + semantic_index = 0; for (int i = 0; i < 4; i++) { add_output(c, @@ -1834,8 +1465,7 @@ ntq_setup_registers(struct vc4_compile *c, struct 
exec_list *list) static void ntq_emit_load_const(struct vc4_compile *c, nir_load_const_instr *instr) { - struct qreg *qregs = ralloc_array(c->def_ht, struct qreg, - instr->def.num_components); + struct qreg *qregs = ntq_init_ssa_def(c, &instr->def); for (int i = 0; i < instr->def.num_components; i++) qregs[i] = qir_uniform_ui(c, instr->value.u[i]); @@ -1843,47 +1473,59 @@ ntq_emit_load_const(struct vc4_compile *c, nir_load_const_instr *instr) } static void +ntq_emit_ssa_undef(struct vc4_compile *c, nir_ssa_undef_instr *instr) +{ + struct qreg *qregs = ntq_init_ssa_def(c, &instr->def); + + /* QIR needs there to be *some* value, so pick 0 (same as for + * ntq_setup_registers(). + */ + for (int i = 0; i < instr->def.num_components; i++) + qregs[i] = qir_uniform_ui(c, 0); +} + +static void ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr) { const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic]; struct qreg *dest = NULL; if (info->has_dest) { - dest = ntq_get_dest(c, instr->dest); + dest = ntq_get_dest(c, &instr->dest); } switch (instr->intrinsic) { case nir_intrinsic_load_uniform: - for (int i = 0; i < instr->num_components; i++) { - dest[i] = qir_uniform(c, QUNIFORM_UNIFORM, - instr->const_index[0] * 4 + i); + assert(instr->num_components == 1); + if (instr->const_index[0] < VC4_NIR_STATE_UNIFORM_OFFSET) { + *dest = qir_uniform(c, QUNIFORM_UNIFORM, + instr->const_index[0]); + } else { + *dest = qir_uniform(c, instr->const_index[0] - + VC4_NIR_STATE_UNIFORM_OFFSET, + 0); } break; case nir_intrinsic_load_uniform_indirect: - for (int i = 0; i < instr->num_components; i++) { - dest[i] = indirect_uniform_load(c, - ntq_get_src(c, instr->src[0], 0), - (instr->const_index[0] * - 4 + i) * sizeof(float)); - } + *dest = indirect_uniform_load(c, instr); break; case nir_intrinsic_load_input: - for (int i = 0; i < instr->num_components; i++) - dest[i] = c->inputs[instr->const_index[0] * 4 + i]; - + assert(instr->num_components == 1); + if 
(instr->const_index[0] == VC4_NIR_TLB_COLOR_READ_INPUT) { + *dest = qir_TLB_COLOR_READ(c); + } else { + *dest = c->inputs[instr->const_index[0]]; + } break; case nir_intrinsic_store_output: - for (int i = 0; i < instr->num_components; i++) { - c->outputs[instr->const_index[0] * 4 + i] = - qir_MOV(c, ntq_get_src(c, instr->src[0], i)); - } - c->num_outputs = MAX2(c->num_outputs, - instr->const_index[0] * 4 + - instr->num_components + 1); + assert(instr->num_components == 1); + c->outputs[instr->const_index[0]] = + qir_MOV(c, ntq_get_src(c, instr->src[0], 0)); + c->num_outputs = MAX2(c->num_outputs, instr->const_index[0] + 1); break; case nir_intrinsic_discard: @@ -1927,6 +1569,10 @@ ntq_emit_instr(struct vc4_compile *c, nir_instr *instr) ntq_emit_load_const(c, nir_instr_as_load_const(instr)); break; + case nir_instr_type_ssa_undef: + ntq_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr)); + break; + case nir_instr_type_tex: ntq_emit_tex(c, nir_instr_as_tex(instr)); break; @@ -2084,13 +1730,17 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage, c->s = tgsi_to_nir(tokens, &nir_options); nir_opt_global_to_local(c->s); nir_convert_to_ssa(c->s); + if (stage == QSTAGE_FRAG) + vc4_nir_lower_blend(c); + vc4_nir_lower_io(c); nir_lower_idiv(c->s); + nir_lower_load_const_to_scalar(c->s); vc4_optimize_nir(c->s); nir_remove_dead_variables(c->s); - nir_convert_from_ssa(c->s); + nir_convert_from_ssa(c->s, true); if (vc4_debug & VC4_DEBUG_SHADERDB) { fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d NIR instructions\n", @@ -2187,6 +1837,8 @@ copy_uniform_state_to_shader(struct vc4_compiled_shader *shader, memcpy(uinfo->contents, c->uniform_contents, count * sizeof(*uinfo->contents)); uinfo->num_texture_samples = c->num_texture_samples; + + vc4_set_shader_uniform_dirty_flags(shader); } static struct vc4_compiled_shader * @@ -2259,9 +1911,8 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage, } copy_uniform_state_to_shader(shader, c); - shader->bo = 
vc4_bo_alloc_mem(vc4->screen, c->qpu_insts, - c->qpu_inst_count * sizeof(uint64_t), - "code"); + shader->bo = vc4_bo_alloc_shader(vc4->screen, c->qpu_insts, + c->qpu_inst_count * sizeof(uint64_t)); /* Copy the compiler UBO range state to the compiled shader, dropping * out arrays that were never referenced by an indirect load. @@ -2288,10 +1939,12 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage, } } if (shader->ubo_size) { - fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d UBO uniforms\n", - qir_get_stage_name(c->stage), - c->program_id, c->variant_id, - shader->ubo_size / 4); + if (vc4_debug & VC4_DEBUG_SHADERDB) { + fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d UBO uniforms\n", + qir_get_stage_name(c->stage), + c->program_id, c->variant_id, + shader->ubo_size / 4); + } } qir_compile_destroy(c); @@ -2421,9 +2074,20 @@ vc4_update_compiled_vs(struct vc4_context *vc4, uint8_t prim_mode) (prim_mode == PIPE_PRIM_POINTS && vc4->rasterizer->base.point_size_per_vertex); - vc4->prog.vs = vc4_get_compiled_shader(vc4, QSTAGE_VERT, &key->base); + struct vc4_compiled_shader *vs = + vc4_get_compiled_shader(vc4, QSTAGE_VERT, &key->base); + if (vs != vc4->prog.vs) { + vc4->prog.vs = vs; + vc4->dirty |= VC4_DIRTY_COMPILED_VS; + } + key->is_coord = true; - vc4->prog.cs = vc4_get_compiled_shader(vc4, QSTAGE_COORD, &key->base); + struct vc4_compiled_shader *cs = + vc4_get_compiled_shader(vc4, QSTAGE_COORD, &key->base); + if (cs != vc4->prog.cs) { + vc4->prog.cs = cs; + vc4->dirty |= VC4_DIRTY_COMPILED_CS; + } } void @@ -2490,305 +2154,6 @@ vc4_shader_state_delete(struct pipe_context *pctx, void *hwcso) free(so); } -static uint32_t translate_wrap(uint32_t p_wrap, bool using_nearest) -{ - switch (p_wrap) { - case PIPE_TEX_WRAP_REPEAT: - return 0; - case PIPE_TEX_WRAP_CLAMP_TO_EDGE: - return 1; - case PIPE_TEX_WRAP_MIRROR_REPEAT: - return 2; - case PIPE_TEX_WRAP_CLAMP_TO_BORDER: - return 3; - case PIPE_TEX_WRAP_CLAMP: - return (using_nearest ? 
1 : 3); - default: - fprintf(stderr, "Unknown wrap mode %d\n", p_wrap); - assert(!"not reached"); - return 0; - } -} - -static void -write_texture_p0(struct vc4_context *vc4, - struct vc4_texture_stateobj *texstate, - uint32_t unit) -{ - struct pipe_sampler_view *texture = texstate->textures[unit]; - struct vc4_resource *rsc = vc4_resource(texture->texture); - - cl_reloc(vc4, &vc4->uniforms, rsc->bo, - VC4_SET_FIELD(rsc->slices[0].offset >> 12, VC4_TEX_P0_OFFSET) | - VC4_SET_FIELD(texture->u.tex.last_level - - texture->u.tex.first_level, VC4_TEX_P0_MIPLVLS) | - VC4_SET_FIELD(texture->target == PIPE_TEXTURE_CUBE, - VC4_TEX_P0_CMMODE) | - VC4_SET_FIELD(rsc->vc4_format & 15, VC4_TEX_P0_TYPE)); -} - -static void -write_texture_p1(struct vc4_context *vc4, - struct vc4_texture_stateobj *texstate, - uint32_t unit) -{ - struct pipe_sampler_view *texture = texstate->textures[unit]; - struct vc4_resource *rsc = vc4_resource(texture->texture); - struct pipe_sampler_state *sampler = texstate->samplers[unit]; - static const uint8_t minfilter_map[6] = { - VC4_TEX_P1_MINFILT_NEAR_MIP_NEAR, - VC4_TEX_P1_MINFILT_LIN_MIP_NEAR, - VC4_TEX_P1_MINFILT_NEAR_MIP_LIN, - VC4_TEX_P1_MINFILT_LIN_MIP_LIN, - VC4_TEX_P1_MINFILT_NEAREST, - VC4_TEX_P1_MINFILT_LINEAR, - }; - static const uint32_t magfilter_map[] = { - [PIPE_TEX_FILTER_NEAREST] = VC4_TEX_P1_MAGFILT_NEAREST, - [PIPE_TEX_FILTER_LINEAR] = VC4_TEX_P1_MAGFILT_LINEAR, - }; - - bool either_nearest = - (sampler->mag_img_filter == PIPE_TEX_MIPFILTER_NEAREST || - sampler->min_img_filter == PIPE_TEX_MIPFILTER_NEAREST); - - cl_aligned_u32(&vc4->uniforms, - VC4_SET_FIELD(rsc->vc4_format >> 4, VC4_TEX_P1_TYPE4) | - VC4_SET_FIELD(texture->texture->height0 & 2047, - VC4_TEX_P1_HEIGHT) | - VC4_SET_FIELD(texture->texture->width0 & 2047, - VC4_TEX_P1_WIDTH) | - VC4_SET_FIELD(magfilter_map[sampler->mag_img_filter], - VC4_TEX_P1_MAGFILT) | - VC4_SET_FIELD(minfilter_map[sampler->min_mip_filter * 2 + - sampler->min_img_filter], - VC4_TEX_P1_MINFILT) | - 
VC4_SET_FIELD(translate_wrap(sampler->wrap_s, either_nearest), - VC4_TEX_P1_WRAP_S) | - VC4_SET_FIELD(translate_wrap(sampler->wrap_t, either_nearest), - VC4_TEX_P1_WRAP_T)); -} - -static void -write_texture_p2(struct vc4_context *vc4, - struct vc4_texture_stateobj *texstate, - uint32_t data) -{ - uint32_t unit = data & 0xffff; - struct pipe_sampler_view *texture = texstate->textures[unit]; - struct vc4_resource *rsc = vc4_resource(texture->texture); - - cl_aligned_u32(&vc4->uniforms, - VC4_SET_FIELD(VC4_TEX_P2_PTYPE_CUBE_MAP_STRIDE, - VC4_TEX_P2_PTYPE) | - VC4_SET_FIELD(rsc->cube_map_stride >> 12, VC4_TEX_P2_CMST) | - VC4_SET_FIELD((data >> 16) & 1, VC4_TEX_P2_BSLOD)); -} - - -#define SWIZ(x,y,z,w) { \ - UTIL_FORMAT_SWIZZLE_##x, \ - UTIL_FORMAT_SWIZZLE_##y, \ - UTIL_FORMAT_SWIZZLE_##z, \ - UTIL_FORMAT_SWIZZLE_##w \ -} - -static void -write_texture_border_color(struct vc4_context *vc4, - struct vc4_texture_stateobj *texstate, - uint32_t unit) -{ - struct pipe_sampler_state *sampler = texstate->samplers[unit]; - struct pipe_sampler_view *texture = texstate->textures[unit]; - struct vc4_resource *rsc = vc4_resource(texture->texture); - union util_color uc; - - const struct util_format_description *tex_format_desc = - util_format_description(texture->format); - - float border_color[4]; - for (int i = 0; i < 4; i++) - border_color[i] = sampler->border_color.f[i]; - if (util_format_is_srgb(texture->format)) { - for (int i = 0; i < 3; i++) - border_color[i] = - util_format_linear_to_srgb_float(border_color[i]); - } - - /* Turn the border color into the layout of channels that it would - * have when stored as texture contents. - */ - float storage_color[4]; - util_format_unswizzle_4f(storage_color, - border_color, - tex_format_desc->swizzle); - - /* Now, pack so that when the vc4_format-sampled texture contents are - * replaced with our border color, the vc4_get_format_swizzle() - * swizzling will get the right channels. 
- */ - if (util_format_is_depth_or_stencil(texture->format)) { - uc.ui[0] = util_pack_z(PIPE_FORMAT_Z24X8_UNORM, - sampler->border_color.f[0]) << 8; - } else { - switch (rsc->vc4_format) { - default: - case VC4_TEXTURE_TYPE_RGBA8888: - util_pack_color(storage_color, - PIPE_FORMAT_R8G8B8A8_UNORM, &uc); - break; - case VC4_TEXTURE_TYPE_RGBA4444: - util_pack_color(storage_color, - PIPE_FORMAT_A8B8G8R8_UNORM, &uc); - break; - case VC4_TEXTURE_TYPE_RGB565: - util_pack_color(storage_color, - PIPE_FORMAT_B8G8R8A8_UNORM, &uc); - break; - case VC4_TEXTURE_TYPE_ALPHA: - uc.ui[0] = float_to_ubyte(storage_color[0]) << 24; - break; - case VC4_TEXTURE_TYPE_LUMALPHA: - uc.ui[0] = ((float_to_ubyte(storage_color[1]) << 24) | - (float_to_ubyte(storage_color[0]) << 0)); - break; - } - } - - cl_aligned_u32(&vc4->uniforms, uc.ui[0]); -} - -static uint32_t -get_texrect_scale(struct vc4_texture_stateobj *texstate, - enum quniform_contents contents, - uint32_t data) -{ - struct pipe_sampler_view *texture = texstate->textures[data]; - uint32_t dim; - - if (contents == QUNIFORM_TEXRECT_SCALE_X) - dim = texture->texture->width0; - else - dim = texture->texture->height0; - - return fui(1.0f / dim); -} - -static struct vc4_bo * -vc4_upload_ubo(struct vc4_context *vc4, struct vc4_compiled_shader *shader, - const uint32_t *gallium_uniforms) -{ - if (!shader->ubo_size) - return NULL; - - struct vc4_bo *ubo = vc4_bo_alloc(vc4->screen, shader->ubo_size, "ubo"); - uint32_t *data = vc4_bo_map(ubo); - for (uint32_t i = 0; i < shader->num_ubo_ranges; i++) { - memcpy(data + shader->ubo_ranges[i].dst_offset, - gallium_uniforms + shader->ubo_ranges[i].src_offset, - shader->ubo_ranges[i].size); - } - - return ubo; -} - -void -vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader, - struct vc4_constbuf_stateobj *cb, - struct vc4_texture_stateobj *texstate) -{ - struct vc4_shader_uniform_info *uinfo = &shader->uniforms; - const uint32_t *gallium_uniforms = cb->cb[0].user_buffer; - 
struct vc4_bo *ubo = vc4_upload_ubo(vc4, shader, gallium_uniforms); - - cl_ensure_space(&vc4->uniforms, (uinfo->count + - uinfo->num_texture_samples) * 4); - - cl_start_shader_reloc(&vc4->uniforms, uinfo->num_texture_samples); - - for (int i = 0; i < uinfo->count; i++) { - - switch (uinfo->contents[i]) { - case QUNIFORM_CONSTANT: - cl_aligned_u32(&vc4->uniforms, uinfo->data[i]); - break; - case QUNIFORM_UNIFORM: - cl_aligned_u32(&vc4->uniforms, - gallium_uniforms[uinfo->data[i]]); - break; - case QUNIFORM_VIEWPORT_X_SCALE: - cl_aligned_f(&vc4->uniforms, vc4->viewport.scale[0] * 16.0f); - break; - case QUNIFORM_VIEWPORT_Y_SCALE: - cl_aligned_f(&vc4->uniforms, vc4->viewport.scale[1] * 16.0f); - break; - - case QUNIFORM_VIEWPORT_Z_OFFSET: - cl_aligned_f(&vc4->uniforms, vc4->viewport.translate[2]); - break; - case QUNIFORM_VIEWPORT_Z_SCALE: - cl_aligned_f(&vc4->uniforms, vc4->viewport.scale[2]); - break; - - case QUNIFORM_USER_CLIP_PLANE: - cl_aligned_f(&vc4->uniforms, - vc4->clip.ucp[uinfo->data[i] / 4][uinfo->data[i] % 4]); - break; - - case QUNIFORM_TEXTURE_CONFIG_P0: - write_texture_p0(vc4, texstate, uinfo->data[i]); - break; - - case QUNIFORM_TEXTURE_CONFIG_P1: - write_texture_p1(vc4, texstate, uinfo->data[i]); - break; - - case QUNIFORM_TEXTURE_CONFIG_P2: - write_texture_p2(vc4, texstate, uinfo->data[i]); - break; - - case QUNIFORM_UBO_ADDR: - cl_aligned_reloc(vc4, &vc4->uniforms, ubo, 0); - break; - - case QUNIFORM_TEXTURE_BORDER_COLOR: - write_texture_border_color(vc4, texstate, uinfo->data[i]); - break; - - case QUNIFORM_TEXRECT_SCALE_X: - case QUNIFORM_TEXRECT_SCALE_Y: - cl_aligned_u32(&vc4->uniforms, - get_texrect_scale(texstate, - uinfo->contents[i], - uinfo->data[i])); - break; - - case QUNIFORM_BLEND_CONST_COLOR: - cl_aligned_f(&vc4->uniforms, - CLAMP(vc4->blend_color.color[uinfo->data[i]], 0, 1)); - break; - - case QUNIFORM_STENCIL: - cl_aligned_u32(&vc4->uniforms, - vc4->zsa->stencil_uniforms[uinfo->data[i]] | - (uinfo->data[i] <= 1 ? 
- (vc4->stencil_ref.ref_value[uinfo->data[i]] << 8) : - 0)); - break; - - case QUNIFORM_ALPHA_REF: - cl_aligned_f(&vc4->uniforms, - vc4->zsa->base.alpha.ref_value); - break; - } -#if 0 - uint32_t written_val = *(uint32_t *)(vc4->uniforms.next - 4); - fprintf(stderr, "%p: %d / 0x%08x (%f)\n", - shader, i, written_val, uif(written_val)); -#endif - } -} - static void vc4_fp_state_bind(struct pipe_context *pctx, void *hwcso) { diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c index 1c96ef4795f..254140a72f5 100644 --- a/src/gallium/drivers/vc4/vc4_qir.c +++ b/src/gallium/drivers/vc4/vc4_qir.c @@ -96,10 +96,6 @@ static const struct qir_op_info qir_op_info[] = { [QOP_TEX_B] = { "tex_b", 0, 2 }, [QOP_TEX_DIRECT] = { "tex_direct", 0, 2 }, [QOP_TEX_RESULT] = { "tex_result", 1, 0, true }, - [QOP_R4_UNPACK_A] = { "r4_unpack_a", 1, 1 }, - [QOP_R4_UNPACK_B] = { "r4_unpack_b", 1, 1 }, - [QOP_R4_UNPACK_C] = { "r4_unpack_c", 1, 1 }, - [QOP_R4_UNPACK_D] = { "r4_unpack_d", 1, 1 }, [QOP_UNPACK_8A_F] = { "unpack_8a_f", 1, 1 }, [QOP_UNPACK_8B_F] = { "unpack_8b_f", 1, 1 }, [QOP_UNPACK_8C_F] = { "unpack_8c_f", 1, 1 }, @@ -234,20 +230,6 @@ qir_writes_r4(struct qinst *inst) } } -bool -qir_reads_r4(struct qinst *inst) -{ - switch (inst->op) { - case QOP_R4_UNPACK_A: - case QOP_R4_UNPACK_B: - case QOP_R4_UNPACK_C: - case QOP_R4_UNPACK_D: - return true; - default: - return false; - } -} - static void qir_print_reg(struct vc4_compile *c, struct qreg reg, bool write) { diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h index 732cfd0b306..cade795c12a 100644 --- a/src/gallium/drivers/vc4/vc4_qir.h +++ b/src/gallium/drivers/vc4/vc4_qir.h @@ -36,6 +36,11 @@ #include "util/list.h" #include "util/u_math.h" +#include "vc4_screen.h" +#include "pipe/p_state.h" + +struct nir_builder; + enum qfile { QFILE_NULL, QFILE_TEMP, @@ -155,10 +160,6 @@ enum qop { * the destination */ QOP_TEX_RESULT, - QOP_R4_UNPACK_A, - QOP_R4_UNPACK_B, - 
QOP_R4_UNPACK_C, - QOP_R4_UNPACK_D }; struct queued_qpu_inst { @@ -243,7 +244,11 @@ enum quniform_contents { QUNIFORM_TEXTURE_BORDER_COLOR, - QUNIFORM_BLEND_CONST_COLOR, + QUNIFORM_BLEND_CONST_COLOR_X, + QUNIFORM_BLEND_CONST_COLOR_Y, + QUNIFORM_BLEND_CONST_COLOR_Z, + QUNIFORM_BLEND_CONST_COLOR_W, + QUNIFORM_STENCIL, QUNIFORM_ALPHA_REF, @@ -280,6 +285,52 @@ struct vc4_compiler_ubo_range { bool used; }; +struct vc4_key { + struct vc4_uncompiled_shader *shader_state; + struct { + enum pipe_format format; + unsigned compare_mode:1; + unsigned compare_func:3; + unsigned wrap_s:3; + unsigned wrap_t:3; + uint8_t swizzle[4]; + } tex[VC4_MAX_TEXTURE_SAMPLERS]; + uint8_t ucp_enables; +}; + +struct vc4_fs_key { + struct vc4_key base; + enum pipe_format color_format; + bool depth_enabled; + bool stencil_enabled; + bool stencil_twoside; + bool stencil_full_writemasks; + bool is_points; + bool is_lines; + bool alpha_test; + bool point_coord_upper_left; + bool light_twoside; + uint8_t alpha_test_func; + uint8_t logicop_func; + uint32_t point_sprite_mask; + + struct pipe_rt_blend_state blend; +}; + +struct vc4_vs_key { + struct vc4_key base; + + /** + * This is a proxy for the array of FS input semantics, which is + * larger than we would want to put in the key. + */ + uint64_t compiled_fs_id; + + enum pipe_format attr_formats[8]; + bool is_coord; + bool per_vertex_point_size; +}; + struct vc4_compile { struct vc4_context *vc4; nir_shader *s; @@ -369,6 +420,16 @@ struct vc4_compile { uint32_t variant_id; }; +/* Special nir_load_input intrinsic index for loading the current TLB + * destination color. + */ +#define VC4_NIR_TLB_COLOR_READ_INPUT 2000000000 + +/* Special offset for nir_load_uniform values to get a QUNIFORM_* + * state-dependent value. 
+ */ +#define VC4_NIR_STATE_UNIFORM_OFFSET 2000000000 + struct vc4_compile *qir_compile_init(void); void qir_compile_destroy(struct vc4_compile *c); struct qinst *qir_inst(enum qop op, struct qreg dst, @@ -393,7 +454,6 @@ bool qir_is_multi_instruction(struct qinst *inst); bool qir_is_tex(struct qinst *inst); bool qir_depends_on_flags(struct qinst *inst); bool qir_writes_r4(struct qinst *inst); -bool qir_reads_r4(struct qinst *inst); bool qir_src_needs_a_file(struct qinst *inst); struct qreg qir_follow_movs(struct vc4_compile *c, struct qreg reg); @@ -409,6 +469,12 @@ bool qir_opt_cse(struct vc4_compile *c); bool qir_opt_dead_code(struct vc4_compile *c); bool qir_opt_small_immediates(struct vc4_compile *c); bool qir_opt_vpm_writes(struct vc4_compile *c); +void vc4_nir_lower_blend(struct vc4_compile *c); +void vc4_nir_lower_io(struct vc4_compile *c); +nir_ssa_def *vc4_nir_get_state_uniform(struct nir_builder *b, + enum quniform_contents contents); +nir_ssa_def *vc4_nir_get_swizzled_channel(struct nir_builder *b, + nir_ssa_def **srcs, int swiz); void qir_lower_uniforms(struct vc4_compile *c); void qpu_schedule_instructions(struct vc4_compile *c); @@ -523,27 +589,12 @@ QIR_ALU0(FRAG_W) QIR_ALU0(FRAG_REV_FLAG) QIR_ALU0(TEX_RESULT) QIR_ALU0(TLB_COLOR_READ) +QIR_NODST_1(TLB_COLOR_WRITE) QIR_NODST_1(TLB_Z_WRITE) QIR_NODST_1(TLB_DISCARD_SETUP) QIR_NODST_1(TLB_STENCIL_SETUP) static inline struct qreg -qir_R4_UNPACK(struct vc4_compile *c, struct qreg r4, int i) -{ - struct qreg t = qir_get_temp(c); - qir_emit(c, qir_inst(QOP_R4_UNPACK_A + i, t, r4, c->undef)); - return t; -} - -static inline struct qreg -qir_SEL_X_0_COND(struct vc4_compile *c, int i) -{ - struct qreg t = qir_get_temp(c); - qir_emit(c, qir_inst(QOP_R4_UNPACK_A + i, t, c->undef, c->undef)); - return t; -} - -static inline struct qreg qir_UNPACK_8_F(struct vc4_compile *c, struct qreg src, int i) { struct qreg t = qir_get_temp(c); diff --git a/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c 
b/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c index 910c89dca79..f087c3b81b5 100644 --- a/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c +++ b/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c @@ -52,7 +52,7 @@ static void add_uniform(struct hash_table *ht, struct qreg reg) { struct hash_entry *entry; - void *key = (void *)(uintptr_t)reg.index; + void *key = (void *)(uintptr_t)(reg.index + 1); entry = _mesa_hash_table_search(ht, key); if (entry) { @@ -66,7 +66,7 @@ static void remove_uniform(struct hash_table *ht, struct qreg reg) { struct hash_entry *entry; - void *key = (void *)(uintptr_t)reg.index; + void *key = (void *)(uintptr_t)(reg.index + 1); entry = _mesa_hash_table_search(ht, key); assert(entry); @@ -122,7 +122,7 @@ qir_lower_uniforms(struct vc4_compile *c) struct hash_entry *entry; hash_table_foreach(ht, entry) { uint32_t count = (uintptr_t)entry->data; - uint32_t index = (uintptr_t)entry->key; + uint32_t index = (uintptr_t)entry->key - 1; if (count > max_count) { max_count = count; max_index = index; diff --git a/src/gallium/drivers/vc4/vc4_qpu.h b/src/gallium/drivers/vc4/vc4_qpu.h index c9ab6344589..fbb90ba12a0 100644 --- a/src/gallium/drivers/vc4/vc4_qpu.h +++ b/src/gallium/drivers/vc4/vc4_qpu.h @@ -122,23 +122,23 @@ static inline struct qpu_reg qpu_r3(void) { return qpu_rn(3); } static inline struct qpu_reg qpu_r4(void) { return qpu_rn(4); } static inline struct qpu_reg qpu_r5(void) { return qpu_rn(5); } -uint64_t qpu_NOP(void); -uint64_t qpu_a_MOV(struct qpu_reg dst, struct qpu_reg src); -uint64_t qpu_m_MOV(struct qpu_reg dst, struct qpu_reg src); +uint64_t qpu_NOP(void) ATTRIBUTE_CONST; +uint64_t qpu_a_MOV(struct qpu_reg dst, struct qpu_reg src) ATTRIBUTE_CONST; +uint64_t qpu_m_MOV(struct qpu_reg dst, struct qpu_reg src) ATTRIBUTE_CONST; uint64_t qpu_a_alu2(enum qpu_op_add op, struct qpu_reg dst, - struct qpu_reg src0, struct qpu_reg src1); + struct qpu_reg src0, struct qpu_reg src1) ATTRIBUTE_CONST; uint64_t qpu_m_alu2(enum qpu_op_mul op, 
struct qpu_reg dst, - struct qpu_reg src0, struct qpu_reg src1); -uint64_t qpu_merge_inst(uint64_t a, uint64_t b); -uint64_t qpu_load_imm_ui(struct qpu_reg dst, uint32_t val); -uint64_t qpu_set_sig(uint64_t inst, uint32_t sig); -uint64_t qpu_set_cond_add(uint64_t inst, uint32_t cond); -uint64_t qpu_set_cond_mul(uint64_t inst, uint32_t cond); -uint32_t qpu_encode_small_immediate(uint32_t i); - -bool qpu_waddr_is_tlb(uint32_t waddr); -bool qpu_inst_is_tlb(uint64_t inst); -int qpu_num_sf_accesses(uint64_t inst); + struct qpu_reg src0, struct qpu_reg src1) ATTRIBUTE_CONST; +uint64_t qpu_merge_inst(uint64_t a, uint64_t b) ATTRIBUTE_CONST; +uint64_t qpu_load_imm_ui(struct qpu_reg dst, uint32_t val) ATTRIBUTE_CONST; +uint64_t qpu_set_sig(uint64_t inst, uint32_t sig) ATTRIBUTE_CONST; +uint64_t qpu_set_cond_add(uint64_t inst, uint32_t cond) ATTRIBUTE_CONST; +uint64_t qpu_set_cond_mul(uint64_t inst, uint32_t cond) ATTRIBUTE_CONST; +uint32_t qpu_encode_small_immediate(uint32_t i) ATTRIBUTE_CONST; + +bool qpu_waddr_is_tlb(uint32_t waddr) ATTRIBUTE_CONST; +bool qpu_inst_is_tlb(uint64_t inst) ATTRIBUTE_CONST; +int qpu_num_sf_accesses(uint64_t inst) ATTRIBUTE_CONST; void qpu_serialize_one_inst(struct vc4_compile *c, uint64_t inst); static inline uint64_t diff --git a/src/gallium/drivers/vc4/vc4_qpu_disasm.c b/src/gallium/drivers/vc4/vc4_qpu_disasm.c index 55e0e6139b5..00aeb300a9b 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_disasm.c +++ b/src/gallium/drivers/vc4/vc4_qpu_disasm.c @@ -225,7 +225,7 @@ static const char *qpu_condflags[] = { }; #define DESC(array, index) \ - ((index > ARRAY_SIZE(array) || !(array)[index]) ? \ + ((index >= ARRAY_SIZE(array) || !(array)[index]) ? \ "???" 
: (array)[index]) static const char * diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c index 99afe4b8798..f324056258c 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_emit.c +++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c @@ -234,6 +234,7 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) case QFILE_VPM: assert((int)qinst->src[i].index >= last_vpm_read_index); + (void)last_vpm_read_index; last_vpm_read_index = qinst->src[i].index; src[i] = qpu_ra(QPU_R_VPM); break; @@ -319,7 +320,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) abort(); } - queue(c, qpu_a_MOV(dst, qpu_r4())); + if (dst.mux != QPU_MUX_R4) + queue(c, qpu_a_MOV(dst, qpu_r4())); break; @@ -402,6 +404,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) *last_inst(c) = qpu_set_sig(*last_inst(c), QPU_SIG_COLOR_LOAD); + if (dst.mux != QPU_MUX_R4) + queue(c, qpu_a_MOV(dst, qpu_r4())); break; case QOP_TLB_COLOR_WRITE: @@ -451,21 +455,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) queue(c, qpu_NOP()); *last_inst(c) = qpu_set_sig(*last_inst(c), QPU_SIG_LOAD_TMU0); - - break; - - case QOP_R4_UNPACK_A: - case QOP_R4_UNPACK_B: - case QOP_R4_UNPACK_C: - case QOP_R4_UNPACK_D: - assert(src[0].mux == QPU_MUX_R4); - queue(c, qpu_a_MOV(dst, src[0])); - *last_inst(c) |= QPU_PM; - *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A + - (qinst->op - - QOP_R4_UNPACK_A), - QPU_UNPACK); - + if (dst.mux != QPU_MUX_R4) + queue(c, qpu_a_MOV(dst, qpu_r4())); break; case QOP_UNPACK_8A_F: @@ -474,20 +465,30 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) case QOP_UNPACK_8D_F: case QOP_UNPACK_16A_F: case QOP_UNPACK_16B_F: { - assert(src[0].mux == QPU_MUX_A); - - /* Since we're setting the pack bits, if the - * destination is in A it would get re-packed. - */ - queue(c, qpu_a_FMAX((dst.mux == QPU_MUX_A ? 
- qpu_rb(31) : dst), - src[0], src[0])); - *last_inst(c) |= QPU_SET_FIELD(unpack_map[qinst->op - - QOP_UNPACK_8A_F], - QPU_UNPACK); + if (src[0].mux == QPU_MUX_R4) { + queue(c, qpu_a_MOV(dst, src[0])); + *last_inst(c) |= QPU_PM; + *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A + + (qinst->op - + QOP_UNPACK_8A_F), + QPU_UNPACK); + } else { + assert(src[0].mux == QPU_MUX_A); - if (dst.mux == QPU_MUX_A) { - queue(c, qpu_a_MOV(dst, qpu_rb(31))); + /* Since we're setting the pack bits, if the + * destination is in A it would get re-packed. + */ + queue(c, qpu_a_FMAX((dst.mux == QPU_MUX_A ? + qpu_rb(31) : dst), + src[0], src[0])); + *last_inst(c) |= + QPU_SET_FIELD(unpack_map[qinst->op - + QOP_UNPACK_8A_F], + QPU_UNPACK); + + if (dst.mux == QPU_MUX_A) { + queue(c, qpu_a_MOV(dst, qpu_rb(31))); + } } } break; diff --git a/src/gallium/drivers/vc4/vc4_qpu_validate.c b/src/gallium/drivers/vc4/vc4_qpu_validate.c index 8471edbf62c..9cf6841f41c 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_validate.c +++ b/src/gallium/drivers/vc4/vc4_qpu_validate.c @@ -23,6 +23,13 @@ #include "vc4_qpu.h" +#ifdef NDEBUG +/* Since most of our code is used in assert()s, don't warn about dead code. 
*/ +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wunused-function" +#endif + static bool writes_reg(uint64_t inst, uint32_t w) { diff --git a/src/gallium/drivers/vc4/vc4_register_allocate.c b/src/gallium/drivers/vc4/vc4_register_allocate.c index 3b0b890b66a..a29db1f3abe 100644 --- a/src/gallium/drivers/vc4/vc4_register_allocate.c +++ b/src/gallium/drivers/vc4/vc4_register_allocate.c @@ -116,6 +116,8 @@ vc4_alloc_reg_set(struct vc4_context *vc4) vc4->regs = ra_alloc_reg_set(vc4, ARRAY_SIZE(vc4_regs)); vc4->reg_class_any = ra_alloc_reg_class(vc4->regs); + vc4->reg_class_r4_or_a = ra_alloc_reg_class(vc4->regs); + vc4->reg_class_a = ra_alloc_reg_class(vc4->regs); for (uint32_t i = 0; i < ARRAY_SIZE(vc4_regs); i++) { /* Reserve ra31/rb31 for spilling fixup_raddr_conflict() in * vc4_qpu_emit.c @@ -126,15 +128,18 @@ vc4_alloc_reg_set(struct vc4_context *vc4) /* R4 can't be written as a general purpose register. (it's * TMU_NOSWAP as a write address). */ - if (vc4_regs[i].mux == QPU_MUX_R4) + if (vc4_regs[i].mux == QPU_MUX_R4) { + ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a, i); continue; + } ra_class_add_reg(vc4->regs, vc4->reg_class_any, i); } - vc4->reg_class_a = ra_alloc_reg_class(vc4->regs); - for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i += 2) + for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i += 2) { ra_class_add_reg(vc4->regs, vc4->reg_class_a, i); + ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a, i); + } ra_set_finalize(vc4->regs, NULL); } @@ -153,6 +158,10 @@ node_to_temp_priority(const void *in_a, const void *in_b) return a->priority - b->priority; } +#define CLASS_BIT_A (1 << 0) +#define CLASS_BIT_B_OR_ACC (1 << 1) +#define CLASS_BIT_R4 (1 << 2) + /** * Returns a mapping from QFILE_TEMP indices to struct qpu_regs. 
* @@ -165,6 +174,7 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) uint32_t temp_to_node[c->num_temps]; uint32_t def[c->num_temps]; uint32_t use[c->num_temps]; + uint8_t class_bits[c->num_temps]; struct qpu_reg *temp_registers = calloc(c->num_temps, sizeof(*temp_registers)); memset(def, 0, sizeof(def)); @@ -181,10 +191,6 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) struct ra_graph *g = ra_alloc_interference_graph(vc4->regs, c->num_temps); - for (uint32_t i = 0; i < c->num_temps; i++) { - ra_set_node_class(g, i, vc4->reg_class_any); - } - /* Compute the live ranges so we can figure out interference. */ uint32_t ip = 0; @@ -223,8 +229,33 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) temp_to_node[map[i].temp] = i; } - /* Figure out our register classes and preallocated registers*/ + /* Figure out our register classes and preallocated registers. We + * start with any temp being able to be in any file, then instructions + * incrementally remove bits that the temp definitely can't be in. + */ + memset(class_bits, + CLASS_BIT_A | CLASS_BIT_B_OR_ACC | CLASS_BIT_R4, + sizeof(class_bits)); + + ip = 0; list_for_each_entry(struct qinst, inst, &c->instructions, link) { + if (qir_writes_r4(inst)) { + /* This instruction writes r4 (and optionally moves + * its result to a temp), so nothing else can be + * stored in r4 across it. + */ + for (int i = 0; i < c->num_temps; i++) { + if (def[i] < ip && use[i] > ip) + class_bits[i] &= ~CLASS_BIT_R4; + } + } else { + /* R4 can't be written as a general purpose + * register. (it's TMU_NOSWAP as a write address). 
+ */ + if (inst->dst.file == QFILE_TEMP) + class_bits[inst->dst.index] &= ~CLASS_BIT_R4; + } + switch (inst->op) { case QOP_FRAG_Z: ra_set_node_reg(g, temp_to_node[inst->dst.index], @@ -236,17 +267,9 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) AB_INDEX + QPU_R_FRAG_PAYLOAD_ZW * 2); break; - case QOP_TEX_RESULT: - case QOP_TLB_COLOR_READ: - assert(vc4_regs[ACC_INDEX + 4].mux == QPU_MUX_R4); - ra_set_node_reg(g, temp_to_node[inst->dst.index], - ACC_INDEX + 4); - break; - case QOP_PACK_SCALED: /* The pack flags require an A-file dst register. */ - ra_set_node_class(g, temp_to_node[inst->dst.index], - vc4->reg_class_a); + class_bits[inst->dst.index] &= CLASS_BIT_A; break; default: @@ -254,8 +277,30 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) } if (qir_src_needs_a_file(inst)) { - ra_set_node_class(g, temp_to_node[inst->src[0].index], - vc4->reg_class_a); + class_bits[inst->src[0].index] &= CLASS_BIT_A; + } + ip++; + } + + for (uint32_t i = 0; i < c->num_temps; i++) { + int node = temp_to_node[i]; + + switch (class_bits[i]) { + case CLASS_BIT_A | CLASS_BIT_B_OR_ACC | CLASS_BIT_R4: + case CLASS_BIT_A | CLASS_BIT_B_OR_ACC: + ra_set_node_class(g, node, vc4->reg_class_any); + break; + case CLASS_BIT_A | CLASS_BIT_R4: + ra_set_node_class(g, node, vc4->reg_class_r4_or_a); + break; + case CLASS_BIT_A: + ra_set_node_class(g, node, vc4->reg_class_a); + break; + default: + fprintf(stderr, "temp %d: bad class bits: 0x%x\n", + i, class_bits[i]); + abort(); + break; } } @@ -270,7 +315,11 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) } bool ok = ra_allocate(g); - assert(ok); + if (!ok) { + fprintf(stderr, "Failed to register allocate:\n"); + qir_dump(c); + abort(); + } for (uint32_t i = 0; i < c->num_temps; i++) { temp_registers[i] = vc4_regs[ra_get_node_reg(g, temp_to_node[i])]; diff --git a/src/gallium/drivers/vc4/vc4_resource.c b/src/gallium/drivers/vc4/vc4_resource.c index cab76406055..5d5166fd818 
100644 --- a/src/gallium/drivers/vc4/vc4_resource.c +++ b/src/gallium/drivers/vc4/vc4_resource.c @@ -102,6 +102,12 @@ vc4_resource_transfer_map(struct pipe_context *pctx, if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) { vc4_resource_bo_alloc(rsc); + + /* If it might be bound as one of our vertex buffers, make + * sure we re-emit vertex buffer state. + */ + if (prsc->bind & PIPE_BIND_VERTEX_BUFFER) + vc4->dirty |= VC4_DIRTY_VTXBUF; } else if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) { if (vc4_cl_references_bo(pctx, rsc->bo)) { if ((usage & PIPE_TRANSFER_DISCARD_RANGE) && @@ -110,6 +116,8 @@ vc4_resource_transfer_map(struct pipe_context *pctx, prsc->height0 == box->height && prsc->depth0 == box->depth) { vc4_resource_bo_alloc(rsc); + if (prsc->bind & PIPE_BIND_VERTEX_BUFFER) + vc4->dirty |= VC4_DIRTY_VTXBUF; } else { vc4_flush(pctx); } diff --git a/src/gallium/drivers/vc4/vc4_resource.h b/src/gallium/drivers/vc4/vc4_resource.h index ab8f5d3cd55..87571b75e8b 100644 --- a/src/gallium/drivers/vc4/vc4_resource.h +++ b/src/gallium/drivers/vc4/vc4_resource.h @@ -82,19 +82,19 @@ struct vc4_resource { struct pipe_resource *shadow_parent; }; -static INLINE struct vc4_resource * +static inline struct vc4_resource * vc4_resource(struct pipe_resource *prsc) { return (struct vc4_resource *)prsc; } -static INLINE struct vc4_surface * +static inline struct vc4_surface * vc4_surface(struct pipe_surface *psurf) { return (struct vc4_surface *)psurf; } -static INLINE struct vc4_transfer * +static inline struct vc4_transfer * vc4_transfer(struct pipe_transfer *ptrans) { return (struct vc4_transfer *)ptrans; diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c index f63bead0fbb..2dee1d40e5f 100644 --- a/src/gallium/drivers/vc4/vc4_screen.c +++ b/src/gallium/drivers/vc4/vc4_screen.c @@ -176,6 +176,10 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: 
case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: + case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: + case PIPE_CAP_TEXTURE_FLOAT_LINEAR: + case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: + case PIPE_CAP_DEPTH_BOUNDS_TEST: return 0; /* Stream output. */ @@ -489,6 +493,12 @@ vc4_screen_bo_get_handle(struct pipe_screen *pscreen, { whandle->stride = stride; + /* If we're passing some reference to our BO out to some other part of + * the system, then we can't do any optimizations about only us being + * the ones seeing it (like BO caching or shadow update avoidance). + */ + bo->private = false; + switch (whandle->type) { case DRM_API_HANDLE_TYPE_SHARED: return vc4_bo_flink(bo, &whandle->handle); diff --git a/src/gallium/drivers/vc4/vc4_simulator.c b/src/gallium/drivers/vc4/vc4_simulator.c index b58013dd2ee..7cfd236349d 100644 --- a/src/gallium/drivers/vc4/vc4_simulator.c +++ b/src/gallium/drivers/vc4/vc4_simulator.c @@ -74,11 +74,12 @@ vc4_simulator_pin_bos(struct drm_device *dev, struct vc4_exec_info *exec) struct vc4_bo **bos = vc4->bo_pointers.base; exec->bo_count = args->bo_handle_count; - exec->bo = calloc(exec->bo_count, sizeof(struct vc4_bo_exec_state)); + exec->bo = calloc(exec->bo_count, sizeof(void *)); for (int i = 0; i < exec->bo_count; i++) { struct vc4_bo *bo = bos[i]; struct drm_gem_cma_object *obj = vc4_wrap_bo_with_cma(dev, bo); + struct drm_vc4_bo *drm_bo = to_vc4_bo(&obj->base); #if 0 fprintf(stderr, "bo hindex %d: %s\n", i, bo->name); #endif @@ -86,7 +87,16 @@ vc4_simulator_pin_bos(struct drm_device *dev, struct vc4_exec_info *exec) vc4_bo_map(bo); memcpy(obj->vaddr, bo->map, bo->size); - exec->bo[i].bo = obj; + exec->bo[i] = obj; + + /* The kernel does this validation at shader create ioctl + * time. 
+ */ + if (strcmp(bo->name, "code") == 0) { + drm_bo->validated_shader = vc4_validate_shader(obj); + if (!drm_bo->validated_shader) + abort(); + } } return 0; } @@ -95,7 +105,7 @@ static int vc4_simulator_unpin_bos(struct vc4_exec_info *exec) { for (int i = 0; i < exec->bo_count; i++) { - struct drm_gem_cma_object *obj = exec->bo[i].bo; + struct drm_gem_cma_object *obj = exec->bo[i]; struct vc4_bo *bo = to_vc4_bo(&obj->base)->bo; memcpy(bo->map, obj->vaddr, bo->size); diff --git a/src/gallium/drivers/vc4/vc4_simulator_validate.h b/src/gallium/drivers/vc4/vc4_simulator_validate.h index 2bb36b253bb..68ace0216aa 100644 --- a/src/gallium/drivers/vc4/vc4_simulator_validate.h +++ b/src/gallium/drivers/vc4/vc4_simulator_validate.h @@ -78,6 +78,7 @@ struct drm_gem_cma_object { struct drm_vc4_bo { struct drm_gem_cma_object base; struct vc4_bo *bo; + struct vc4_validated_shader_info *validated_shader; struct list_head unref_head; }; diff --git a/src/gallium/drivers/vc4/vc4_state.c b/src/gallium/drivers/vc4/vc4_state.c index 4a1d4c3a4d6..8a759c2ca4c 100644 --- a/src/gallium/drivers/vc4/vc4_state.c +++ b/src/gallium/drivers/vc4/vc4_state.c @@ -107,7 +107,7 @@ vc4_create_rasterizer_state(struct pipe_context *pctx, /* Workaround: HW-2726 PTB does not handle zero-size points (BCM2835, * BCM21553). */ - so->point_size = MAX2(cso->point_size, .125); + so->point_size = MAX2(cso->point_size, .125f); if (cso->front_ccw) so->config_bits[0] |= VC4_CONFIG_BITS_CW_PRIMITIVES; @@ -461,11 +461,64 @@ vc4_get_stage_tex(struct vc4_context *vc4, unsigned shader) } } +static uint32_t translate_wrap(uint32_t p_wrap, bool using_nearest) +{ + switch (p_wrap) { + case PIPE_TEX_WRAP_REPEAT: + return 0; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + return 1; + case PIPE_TEX_WRAP_MIRROR_REPEAT: + return 2; + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + return 3; + case PIPE_TEX_WRAP_CLAMP: + return (using_nearest ? 
1 : 3); + default: + fprintf(stderr, "Unknown wrap mode %d\n", p_wrap); + assert(!"not reached"); + return 0; + } +} + static void * vc4_create_sampler_state(struct pipe_context *pctx, const struct pipe_sampler_state *cso) { - return vc4_generic_cso_state_create(cso, sizeof(*cso)); + static const uint8_t minfilter_map[6] = { + VC4_TEX_P1_MINFILT_NEAR_MIP_NEAR, + VC4_TEX_P1_MINFILT_LIN_MIP_NEAR, + VC4_TEX_P1_MINFILT_NEAR_MIP_LIN, + VC4_TEX_P1_MINFILT_LIN_MIP_LIN, + VC4_TEX_P1_MINFILT_NEAREST, + VC4_TEX_P1_MINFILT_LINEAR, + }; + static const uint32_t magfilter_map[] = { + [PIPE_TEX_FILTER_NEAREST] = VC4_TEX_P1_MAGFILT_NEAREST, + [PIPE_TEX_FILTER_LINEAR] = VC4_TEX_P1_MAGFILT_LINEAR, + }; + bool either_nearest = + (cso->mag_img_filter == PIPE_TEX_MIPFILTER_NEAREST || + cso->min_img_filter == PIPE_TEX_MIPFILTER_NEAREST); + struct vc4_sampler_state *so = CALLOC_STRUCT(vc4_sampler_state); + + if (!so) + return NULL; + + memcpy(so, cso, sizeof(*cso)); + + so->texture_p1 = + (VC4_SET_FIELD(magfilter_map[cso->mag_img_filter], + VC4_TEX_P1_MAGFILT) | + VC4_SET_FIELD(minfilter_map[cso->min_mip_filter * 2 + + cso->min_img_filter], + VC4_TEX_P1_MINFILT) | + VC4_SET_FIELD(translate_wrap(cso->wrap_s, either_nearest), + VC4_TEX_P1_WRAP_S) | + VC4_SET_FIELD(translate_wrap(cso->wrap_t, either_nearest), + VC4_TEX_P1_WRAP_T)); + + return so; } static void @@ -499,13 +552,13 @@ static struct pipe_sampler_view * vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc, const struct pipe_sampler_view *cso) { - struct pipe_sampler_view *so = malloc(sizeof(*so)); + struct vc4_sampler_view *so = malloc(sizeof(*so)); struct vc4_resource *rsc = vc4_resource(prsc); if (!so) return NULL; - *so = *cso; + so->base = *cso; pipe_reference(NULL, &prsc->reference); @@ -516,18 +569,19 @@ vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc, * Also, Raspberry Pi doesn't support sampling from raster textures, * so we also have to copy to a temporary then. 
*/ - if (so->u.tex.first_level || + if (cso->u.tex.first_level || rsc->vc4_format == VC4_TEXTURE_TYPE_RGBA32R) { struct vc4_resource *shadow_parent = vc4_resource(prsc); struct pipe_resource tmpl = shadow_parent->base.b; struct vc4_resource *clone; tmpl.bind = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET; - tmpl.width0 = u_minify(tmpl.width0, so->u.tex.first_level); - tmpl.height0 = u_minify(tmpl.height0, so->u.tex.first_level); - tmpl.last_level = so->u.tex.last_level - so->u.tex.first_level; + tmpl.width0 = u_minify(tmpl.width0, cso->u.tex.first_level); + tmpl.height0 = u_minify(tmpl.height0, cso->u.tex.first_level); + tmpl.last_level = cso->u.tex.last_level - cso->u.tex.first_level; prsc = vc4_resource_create(pctx->screen, &tmpl); + rsc = vc4_resource(prsc); clone = vc4_resource(prsc); clone->shadow_parent = &shadow_parent->base.b; /* Flag it as needing update of the contents from the parent. */ @@ -535,11 +589,23 @@ vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc, assert(clone->vc4_format != VC4_TEXTURE_TYPE_RGBA32R); } - so->texture = prsc; - so->reference.count = 1; - so->context = pctx; - - return so; + so->base.texture = prsc; + so->base.reference.count = 1; + so->base.context = pctx; + + so->texture_p0 = + (VC4_SET_FIELD(rsc->slices[0].offset >> 12, VC4_TEX_P0_OFFSET) | + VC4_SET_FIELD(rsc->vc4_format & 15, VC4_TEX_P0_TYPE) | + VC4_SET_FIELD(cso->u.tex.last_level - + cso->u.tex.first_level, VC4_TEX_P0_MIPLVLS) | + VC4_SET_FIELD(cso->target == PIPE_TEXTURE_CUBE, + VC4_TEX_P0_CMMODE)); + so->texture_p1 = + (VC4_SET_FIELD(rsc->vc4_format >> 4, VC4_TEX_P1_TYPE4) | + VC4_SET_FIELD(prsc->height0 & 2047, VC4_TEX_P1_HEIGHT) | + VC4_SET_FIELD(prsc->width0 & 2047, VC4_TEX_P1_WIDTH)); + + return &so->base; } static void diff --git a/src/gallium/drivers/vc4/vc4_tiling.c b/src/gallium/drivers/vc4/vc4_tiling.c index f9801c9cefd..cf86eb0fa31 100644 --- a/src/gallium/drivers/vc4/vc4_tiling.c +++ b/src/gallium/drivers/vc4/vc4_tiling.c @@ 
-127,13 +127,10 @@ vc4_store_utile(void *dst, void *src, uint32_t src_stride, uint32_t cpp) static void check_box_utile_alignment(const struct pipe_box *box, int cpp) { - uint32_t utile_w = vc4_utile_width(cpp); - uint32_t utile_h = vc4_utile_height(cpp); - - assert(!(box->x & (utile_w - 1))); - assert(!(box->y & (utile_h - 1))); - assert(!(box->width & (utile_w - 1))); - assert(!(box->height & (utile_h - 1))); + assert(!(box->x & (vc4_utile_width(cpp) - 1))); + assert(!(box->y & (vc4_utile_height(cpp) - 1))); + assert(!(box->width & (vc4_utile_width(cpp) - 1))); + assert(!(box->height & (vc4_utile_height(cpp) - 1))); } static void diff --git a/src/gallium/drivers/vc4/vc4_tiling.h b/src/gallium/drivers/vc4/vc4_tiling.h index b5d10da3417..b90bba70200 100644 --- a/src/gallium/drivers/vc4/vc4_tiling.h +++ b/src/gallium/drivers/vc4/vc4_tiling.h @@ -24,9 +24,9 @@ #ifndef VC4_TILING_H #define VC4_TILING_H -uint32_t vc4_utile_width(int cpp); -uint32_t vc4_utile_height(int cpp); -bool vc4_size_is_lt(uint32_t width, uint32_t height, int cpp); +uint32_t vc4_utile_width(int cpp) ATTRIBUTE_CONST; +uint32_t vc4_utile_height(int cpp) ATTRIBUTE_CONST; +bool vc4_size_is_lt(uint32_t width, uint32_t height, int cpp) ATTRIBUTE_CONST; void vc4_load_utile(void *dst, void *src, uint32_t dst_stride, uint32_t cpp); void vc4_store_utile(void *dst, void *src, uint32_t src_stride, uint32_t cpp); void vc4_load_tiled_image(void *dst, uint32_t dst_stride, diff --git a/src/gallium/drivers/vc4/vc4_uniforms.c b/src/gallium/drivers/vc4/vc4_uniforms.c new file mode 100644 index 00000000000..85d6998205e --- /dev/null +++ b/src/gallium/drivers/vc4/vc4_uniforms.c @@ -0,0 +1,344 @@ +/* + * Copyright © 2014-2015 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, 
publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/* vc4_uniforms.c: CPU-side emission of a shader's uniform stream.  At draw
 * time, vc4_write_uniforms() walks the per-shader list of
 * (contents, data) uniform slots produced at compile time and appends the
 * corresponding 32-bit values (constants, viewport transforms, texture
 * config words, relocations, ...) to the vc4->uniforms command list.
 */

#include "util/u_pack_color.h"
#include "util/format_srgb.h"

#include "vc4_context.h"
#include "vc4_qir.h"

/* Emits texture config parameter 0 for a sampling operation: a relocation
 * against the texture's BO, offset by the sampler view's precomputed P0 bits
 * (sview->texture_p0).
 */
static void
write_texture_p0(struct vc4_context *vc4,
                 struct vc4_cl_out **uniforms,
                 struct vc4_texture_stateobj *texstate,
                 uint32_t unit)
{
        struct vc4_sampler_view *sview =
                vc4_sampler_view(texstate->textures[unit]);
        struct vc4_resource *rsc = vc4_resource(sview->base.texture);

        cl_reloc(vc4, &vc4->uniforms, uniforms, rsc->bo, sview->texture_p0);
}

/* Emits texture config parameter 1: the OR of the sampler view's P1 bits
 * (format/size state) and the sampler state's P1 bits (filter/wrap state),
 * both precomputed when the CSOs were created.
 */
static void
write_texture_p1(struct vc4_context *vc4,
                 struct vc4_cl_out **uniforms,
                 struct vc4_texture_stateobj *texstate,
                 uint32_t unit)
{
        struct vc4_sampler_view *sview =
                vc4_sampler_view(texstate->textures[unit]);
        struct vc4_sampler_state *sampler =
                vc4_sampler_state(texstate->samplers[unit]);

        cl_aligned_u32(uniforms, sview->texture_p1 | sampler->texture_p1);
}

/* Emits texture config parameter 2 (cube map stride variant).
 *
 * "data" is packed by the compiler: the sampler unit index is in the low 16
 * bits and bit 16 is forwarded into the P2 BSLOD field (presumably an
 * explicit-LOD flag -- confirm against the compiler's P2 emission).  The cube
 * map stride is stored in 4096-byte units (>> 12).
 */
static void
write_texture_p2(struct vc4_context *vc4,
                 struct vc4_cl_out **uniforms,
                 struct vc4_texture_stateobj *texstate,
                 uint32_t data)
{
        uint32_t unit = data & 0xffff;
        struct pipe_sampler_view *texture = texstate->textures[unit];
        struct vc4_resource *rsc = vc4_resource(texture->texture);

        cl_aligned_u32(uniforms,
                       VC4_SET_FIELD(VC4_TEX_P2_PTYPE_CUBE_MAP_STRIDE,
                                     VC4_TEX_P2_PTYPE) |
                       VC4_SET_FIELD(rsc->cube_map_stride >> 12, VC4_TEX_P2_CMST) |
                       VC4_SET_FIELD((data >> 16) & 1, VC4_TEX_P2_BSLOD));
}


/* Convenience initializer for a util_format swizzle array.
 * NOTE(review): not referenced anywhere in this file's visible code --
 * possibly left over from an earlier revision.
 */
#define SWIZ(x,y,z,w) { \
        UTIL_FORMAT_SWIZZLE_##x, \
        UTIL_FORMAT_SWIZZLE_##y, \
        UTIL_FORMAT_SWIZZLE_##z, \
        UTIL_FORMAT_SWIZZLE_##w \
}

/* Emits the border color uniform for a sampler unit, packed into the byte
 * layout the hardware expects for the texture's vc4_format.
 *
 * The color is first converted to sRGB if the view's format is sRGB, then
 * unswizzled into storage-channel order, then packed per vc4_format so that
 * the vc4_get_format_swizzle() applied at sampling time yields the correct
 * channels.
 */
static void
write_texture_border_color(struct vc4_context *vc4,
                           struct vc4_cl_out **uniforms,
                           struct vc4_texture_stateobj *texstate,
                           uint32_t unit)
{
        struct pipe_sampler_state *sampler = texstate->samplers[unit];
        struct pipe_sampler_view *texture = texstate->textures[unit];
        struct vc4_resource *rsc = vc4_resource(texture->texture);
        union util_color uc;

        const struct util_format_description *tex_format_desc =
                util_format_description(texture->format);

        float border_color[4];
        for (int i = 0; i < 4; i++)
                border_color[i] = sampler->border_color.f[i];
        if (util_format_is_srgb(texture->format)) {
                /* Only RGB is sRGB-encoded; alpha stays linear. */
                for (int i = 0; i < 3; i++)
                        border_color[i] =
                                util_format_linear_to_srgb_float(border_color[i]);
        }

        /* Turn the border color into the layout of channels that it would
         * have when stored as texture contents.
         */
        float storage_color[4];
        util_format_unswizzle_4f(storage_color,
                                 border_color,
                                 tex_format_desc->swizzle);

        /* Now, pack so that when the vc4_format-sampled texture contents are
         * replaced with our border color, the vc4_get_format_swizzle()
         * swizzling will get the right channels.
         */
        if (util_format_is_depth_or_stencil(texture->format)) {
                /* Depth border: pack Z24 and shift into the high 24 bits. */
                uc.ui[0] = util_pack_z(PIPE_FORMAT_Z24X8_UNORM,
                                       sampler->border_color.f[0]) << 8;
        } else {
                switch (rsc->vc4_format) {
                default:
                case VC4_TEXTURE_TYPE_RGBA8888:
                        util_pack_color(storage_color,
                                        PIPE_FORMAT_R8G8B8A8_UNORM, &uc);
                        break;
                case VC4_TEXTURE_TYPE_RGBA4444:
                        util_pack_color(storage_color,
                                        PIPE_FORMAT_A8B8G8R8_UNORM, &uc);
                        break;
                case VC4_TEXTURE_TYPE_RGB565:
                        util_pack_color(storage_color,
                                        PIPE_FORMAT_B8G8R8A8_UNORM, &uc);
                        break;
                case VC4_TEXTURE_TYPE_ALPHA:
                        uc.ui[0] = float_to_ubyte(storage_color[0]) << 24;
                        break;
                case VC4_TEXTURE_TYPE_LUMALPHA:
                        uc.ui[0] = ((float_to_ubyte(storage_color[1]) << 24) |
                                    (float_to_ubyte(storage_color[0]) << 0));
                        break;
                }
        }

        cl_aligned_u32(uniforms, uc.ui[0]);
}

/* Returns the float bits (fui) of 1/width or 1/height of the texture bound
 * to unit "data" -- the scale factor used to normalize RECT-style texel
 * coordinates for the unit-coordinate hardware sampler.
 */
static uint32_t
get_texrect_scale(struct vc4_texture_stateobj *texstate,
                  enum quniform_contents contents,
                  uint32_t data)
{
        struct pipe_sampler_view *texture = texstate->textures[data];
        uint32_t dim;

        if (contents == QUNIFORM_TEXRECT_SCALE_X)
                dim = texture->texture->width0;
        else
                dim = texture->texture->height0;

        return fui(1.0f / dim);
}

/* Allocates a BO and copies into it the UBO ranges that the compiled shader
 * actually reads from the gallium constant buffer.  Returns NULL when the
 * shader uses no UBO data; otherwise the caller owns a reference and must
 * unreference it after the QUNIFORM_UBO_ADDR relocation is emitted.
 *
 * NOTE(review): dst_offset/src_offset index uint32_t pointers (dword units)
 * while memcpy's "size" is in bytes -- confirm the units match how the
 * compiler fills in ubo_ranges.
 */
static struct vc4_bo *
vc4_upload_ubo(struct vc4_context *vc4,
               struct vc4_compiled_shader *shader,
               const uint32_t *gallium_uniforms)
{
        if (!shader->ubo_size)
                return NULL;

        struct vc4_bo *ubo = vc4_bo_alloc(vc4->screen, shader->ubo_size, "ubo");
        uint32_t *data = vc4_bo_map(ubo);
        for (uint32_t i = 0; i < shader->num_ubo_ranges; i++) {
                memcpy(data + shader->ubo_ranges[i].dst_offset,
                       gallium_uniforms + shader->ubo_ranges[i].src_offset,
                       shader->ubo_ranges[i].size);
        }

        return ubo;
}

/* Appends one 32-bit value per compiled uniform slot to vc4->uniforms, in
 * the order the shader's QPU code will consume them.  Texture P0 slots are
 * emitted as relocations (counted up front via cl_start_shader_reloc so the
 * kernel validator can patch them).
 */
void
vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader,
                   struct vc4_constbuf_stateobj *cb,
                   struct vc4_texture_stateobj *texstate)
{
        struct vc4_shader_uniform_info *uinfo = &shader->uniforms;
        const uint32_t *gallium_uniforms = cb->cb[0].user_buffer;
        struct vc4_bo *ubo = vc4_upload_ubo(vc4, shader, gallium_uniforms);

        /* Reserve 4 bytes per uniform, plus 4 per texture-sample relocation
         * record.
         */
        cl_ensure_space(&vc4->uniforms, (uinfo->count +
                                         uinfo->num_texture_samples) * 4);

        struct vc4_cl_out *uniforms =
                cl_start_shader_reloc(&vc4->uniforms,
                                      uinfo->num_texture_samples);

        for (int i = 0; i < uinfo->count; i++) {

                switch (uinfo->contents[i]) {
                case QUNIFORM_CONSTANT:
                        /* data[] holds the literal 32-bit value. */
                        cl_aligned_u32(&uniforms, uinfo->data[i]);
                        break;
                case QUNIFORM_UNIFORM:
                        /* data[] is a dword index into constant buffer 0. */
                        cl_aligned_u32(&uniforms,
                                       gallium_uniforms[uinfo->data[i]]);
                        break;
                case QUNIFORM_VIEWPORT_X_SCALE:
                        /* 16.0 converts NDC to the hardware's 12.4
                         * fixed-point screen coordinates -- TODO confirm.
                         */
                        cl_aligned_f(&uniforms, vc4->viewport.scale[0] * 16.0f);
                        break;
                case QUNIFORM_VIEWPORT_Y_SCALE:
                        cl_aligned_f(&uniforms, vc4->viewport.scale[1] * 16.0f);
                        break;

                case QUNIFORM_VIEWPORT_Z_OFFSET:
                        cl_aligned_f(&uniforms, vc4->viewport.translate[2]);
                        break;
                case QUNIFORM_VIEWPORT_Z_SCALE:
                        cl_aligned_f(&uniforms, vc4->viewport.scale[2]);
                        break;

                case QUNIFORM_USER_CLIP_PLANE:
                        /* data[] encodes plane * 4 + component. */
                        cl_aligned_f(&uniforms,
                                     vc4->clip.ucp[uinfo->data[i] / 4][uinfo->data[i] % 4]);
                        break;

                case QUNIFORM_TEXTURE_CONFIG_P0:
                        write_texture_p0(vc4, &uniforms, texstate,
                                         uinfo->data[i]);
                        break;

                case QUNIFORM_TEXTURE_CONFIG_P1:
                        write_texture_p1(vc4, &uniforms, texstate,
                                         uinfo->data[i]);
                        break;

                case QUNIFORM_TEXTURE_CONFIG_P2:
                        write_texture_p2(vc4, &uniforms, texstate,
                                         uinfo->data[i]);
                        break;

                case QUNIFORM_UBO_ADDR:
                        cl_aligned_reloc(vc4, &vc4->uniforms, &uniforms, ubo, 0);
                        break;

                case QUNIFORM_TEXTURE_BORDER_COLOR:
                        write_texture_border_color(vc4, &uniforms,
                                                   texstate, uinfo->data[i]);
                        break;

                case QUNIFORM_TEXRECT_SCALE_X:
                case QUNIFORM_TEXRECT_SCALE_Y:
                        cl_aligned_u32(&uniforms,
                                       get_texrect_scale(texstate,
                                                         uinfo->contents[i],
                                                         uinfo->data[i]));
                        break;

                case QUNIFORM_BLEND_CONST_COLOR_X:
                case QUNIFORM_BLEND_CONST_COLOR_Y:
                case QUNIFORM_BLEND_CONST_COLOR_Z:
                case QUNIFORM_BLEND_CONST_COLOR_W:
                        cl_aligned_f(&uniforms,
                                     CLAMP(vc4->blend_color.color[uinfo->contents[i] -
                                                                  QUNIFORM_BLEND_CONST_COLOR_X],
                                           0, 1));
                        break;

                case QUNIFORM_STENCIL:
                        /* data[] selects front (0) / back (1) / an extra
                         * config word (>1); the stencil ref value is merged
                         * into bits 8-15 for the first two.
                         */
                        cl_aligned_u32(&uniforms,
                                       vc4->zsa->stencil_uniforms[uinfo->data[i]] |
                                       (uinfo->data[i] <= 1 ?
                                        (vc4->stencil_ref.ref_value[uinfo->data[i]] << 8) :
                                        0));
                        break;

                case QUNIFORM_ALPHA_REF:
                        cl_aligned_f(&uniforms,
                                     vc4->zsa->base.alpha.ref_value);
                        break;
                }
#if 0
                uint32_t written_val = *((uint32_t *)uniforms - 1);
                fprintf(stderr, "%p: %d / 0x%08x (%f)\n",
                        shader, i, written_val, uif(written_val));
#endif
        }

        cl_end(&vc4->uniforms, uniforms);

        vc4_bo_unreference(&ubo);
}

/* Precomputes which VC4_DIRTY_* state changes require re-emitting this
 * shader's uniform stream, by mapping each uniform slot's contents back to
 * the context state it reads.  Stored on the shader so the draw path can
 * skip vc4_write_uniforms() work when nothing relevant changed.
 */
void
vc4_set_shader_uniform_dirty_flags(struct vc4_compiled_shader *shader)
{
        uint32_t dirty = 0;

        for (int i = 0; i < shader->uniforms.count; i++) {
                switch (shader->uniforms.contents[i]) {
                case QUNIFORM_CONSTANT:
                        /* Baked into the stream; depends on no state. */
                        break;
                case QUNIFORM_UNIFORM:
                case QUNIFORM_UBO_ADDR:
                        dirty |= VC4_DIRTY_CONSTBUF;
                        break;

                case QUNIFORM_VIEWPORT_X_SCALE:
                case QUNIFORM_VIEWPORT_Y_SCALE:
                case QUNIFORM_VIEWPORT_Z_OFFSET:
                case QUNIFORM_VIEWPORT_Z_SCALE:
                        dirty |= VC4_DIRTY_VIEWPORT;
                        break;

                case QUNIFORM_USER_CLIP_PLANE:
                        dirty |= VC4_DIRTY_CLIP;
                        break;

                case QUNIFORM_TEXTURE_CONFIG_P0:
                case QUNIFORM_TEXTURE_CONFIG_P1:
                case QUNIFORM_TEXTURE_CONFIG_P2:
                case QUNIFORM_TEXTURE_BORDER_COLOR:
                case QUNIFORM_TEXRECT_SCALE_X:
                case QUNIFORM_TEXRECT_SCALE_Y:
                        dirty |= VC4_DIRTY_TEXSTATE;
                        break;

                case QUNIFORM_BLEND_CONST_COLOR_X:
                case QUNIFORM_BLEND_CONST_COLOR_Y:
                case QUNIFORM_BLEND_CONST_COLOR_Z:
                case QUNIFORM_BLEND_CONST_COLOR_W:
                        dirty |= VC4_DIRTY_BLEND_COLOR;
                        break;

                case QUNIFORM_STENCIL:
                case QUNIFORM_ALPHA_REF:
                        dirty |= VC4_DIRTY_ZSA;
                        break;
                }
        }

        shader->uniform_dirty_bits = dirty;
}