v3d: Rename the driver files from "vc5" to "v3d".

author: Eric Anholt <[email protected]> 2018-05-01 12:24:48 -0700
committer: Eric Anholt <[email protected]> 2018-05-16 21:19:07 +0100
commit: 8c47ebbd232704ab048eab2572e2b2a44f38957a (patch)
tree: 8946780fc424b3aa39e0b32ac875047605770a49 /src/gallium/drivers/v3d
parent: c4c488a2aeb24c0f468664c0cacd0d01111a4e46 (diff)
36 files changed, 11283 insertions, 0 deletions
diff --git a/src/gallium/drivers/v3d/.editorconfig b/src/gallium/drivers/v3d/.editorconfig
new file mode 100644
index 00000000000..5a9f3c041a4
--- /dev/null
+++ b/src/gallium/drivers/v3d/.editorconfig
@@ -0,0 +1,3 @@
+[*.{c,h,cpp}]
+indent_style = space
+indent_size = 8
diff --git a/src/gallium/drivers/v3d/Automake.inc b/src/gallium/drivers/v3d/Automake.inc
new file mode 100644
index 00000000000..7cf8ae7cd8b
--- /dev/null
+++ b/src/gallium/drivers/v3d/Automake.inc
@@ -0,0 +1,14 @@
+# Hooks the v3d driver into whichever Gallium target includes this fragment.
+# Nothing below takes effect unless the HAVE_GALLIUM_V3D conditional is true.
+if HAVE_GALLIUM_V3D
+
+TARGET_DRIVERS += v3d
+TARGET_CPPFLAGS += -DGALLIUM_V3D
+# Libtool archives the target must link: the DRM winsys, the driver itself
+# and the shared Broadcom library.
+TARGET_LIB_DEPS += \
+	$(top_builddir)/src/gallium/winsys/v3d/drm/libv3ddrm.la \
+	$(top_builddir)/src/gallium/drivers/v3d/libv3d.la \
+	$(top_builddir)/src/broadcom/libbroadcom.la
+
+# libbroadcom_cle.la is only appended here when vc4 is NOT being built.
+# NOTE(review): presumably the vc4 fragment adds the same archive itself, so
+# this guard avoids listing it twice -- confirm against vc4's Automake.inc.
+if !HAVE_GALLIUM_VC4
+TARGET_LIB_DEPS += $(top_builddir)/src/broadcom/cle/libbroadcom_cle.la
+endif
+
+endif
diff --git a/src/gallium/drivers/v3d/Makefile.am b/src/gallium/drivers/v3d/Makefile.am
new file mode 100644
index 00000000000..2b4c364c24e
--- /dev/null
+++ b/src/gallium/drivers/v3d/Makefile.am
@@ -0,0 +1,56 @@
+# Copyright © 2014 Broadcom
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+include Makefile.sources
+include $(top_srcdir)/src/gallium/Automake.inc
+
+# Compiler flags shared by every library built in this directory.  The
+# trailing $() expands to nothing; it only lets every real entry end with a
+# backslash so lines can be added or removed without touching neighbours.
+AM_CFLAGS = \
+	-I$(top_builddir)/src/compiler/nir \
+	-I$(top_builddir)/src/broadcom \
+	$(LIBDRM_CFLAGS) \
+	$(V3D_SIMULATOR_CFLAGS) \
+	$(GALLIUM_DRIVER_CFLAGS) \
+	$(VALGRIND_CFLAGS) \
+	$()
+
+# Convenience libraries: the version-independent core plus one build of the
+# per-version sources for each supported V3D_VERSION.
+noinst_LTLIBRARIES = \
+	libv3d.la \
+	libv3d_v33.la \
+	libv3d_v41.la \
+	$()
+
+# The same source list is compiled twice, once per hardware version.
+libv3d_v33_la_SOURCES = $(V3D_PER_VERSION_SOURCES)
+libv3d_v33_la_CFLAGS = $(AM_CFLAGS) -DV3D_VERSION=33
+
+libv3d_v41_la_SOURCES = $(V3D_PER_VERSION_SOURCES)
+libv3d_v41_la_CFLAGS = $(AM_CFLAGS) -DV3D_VERSION=41
+
+libv3d_la_SOURCES = $(C_SOURCES)
+
+# Use the same empty $() list terminator as every other list in this file
+# (this one previously ended in $(NULL)).
+libv3d_la_LDFLAGS = \
+	$(V3D_SIMULATOR_LIBS) \
+	$()
+# Fold both per-version builds into the core driver library.
+libv3d_la_LIBADD = \
+	libv3d_v33.la \
+	libv3d_v41.la \
+	$()
+
+EXTRA_DIST = meson.build
diff --git a/src/gallium/drivers/v3d/Makefile.sources b/src/gallium/drivers/v3d/Makefile.sources
new file mode 100644
index 00000000000..c81ccb42013
--- /dev/null
+++ b/src/gallium/drivers/v3d/Makefile.sources
@@ -0,0 +1,36 @@
+# Version-independent driver sources; Makefile.am builds these into libv3d.la
+# via libv3d_la_SOURCES.
+C_SOURCES := \
+	v3d_blit.c \
+	v3d_bufmgr.c \
+	v3d_bufmgr.h \
+	v3d_cl.c \
+	v3d_cl.h \
+	v3d_context.c \
+	v3d_context.h \
+	v3d_fence.c \
+	v3d_formats.c \
+	v3d_format_table.h \
+	v3d_job.c \
+	v3d_program.c \
+	v3d_query.c \
+	v3d_resource.c \
+	v3d_resource.h \
+	v3d_screen.c \
+	v3d_screen.h \
+	v3d_simulator.c \
+	v3d_simulator_wrapper.cpp \
+	v3d_simulator_wrapper.h \
+	v3d_tiling.c \
+	v3d_tiling.h \
+	v3d_uniforms.c \
+	$()
+
+# Sources that Makefile.am compiles once per hardware version (into
+# libv3d_v33.la and libv3d_v41.la, with -DV3D_VERSION=33 / 41 respectively).
+V3D_PER_VERSION_SOURCES = \
+	v3dx_context.h \
+	v3dx_draw.c \
+	v3dx_emit.c \
+	v3dx_format_table.c \
+	v3dx_job.c \
+	v3dx_rcl.c \
+	v3dx_simulator.c \
+	v3dx_state.c \
+	$()
diff --git a/src/gallium/drivers/v3d/meson.build b/src/gallium/drivers/v3d/meson.build
new file mode 100644
index 00000000000..38021515eda
--- /dev/null
+++ b/src/gallium/drivers/v3d/meson.build
@@ -0,0 +1,96 @@
+# Copyright © 2017 Broadcom
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Sources compiled once into the version-independent 'v3d' static library.
+files_libv3d = files(
+  'v3d_blit.c',
+  'v3d_bufmgr.c',
+  'v3d_bufmgr.h',
+  'v3d_cl.c',
+  'v3d_cl.h',
+  'v3d_context.c',
+  'v3d_context.h',
+  'v3d_fence.c',
+  'v3d_formats.c',
+  'v3d_job.c',
+  'v3d_program.c',
+  'v3d_query.c',
+  'v3d_resource.c',
+  'v3d_resource.h',
+  'v3d_screen.c',
+  'v3d_screen.h',
+  'v3d_simulator.c',
+  'v3d_simulator_wrapper.cpp',
+  'v3d_tiling.c',
+  'v3d_tiling.h',
+  'v3d_uniforms.c',
+)
+
+# Sources compiled once for every entry of v3d_versions, each time with a
+# different -DV3D_VERSION value.
+files_per_version = files(
+  'v3dx_draw.c',
+  'v3dx_emit.c',
+  'v3dx_format_table.c',
+  'v3dx_job.c',
+  'v3dx_rcl.c',
+  'v3dx_simulator.c',
+  'v3dx_state.c',
+)
+
+# The simulator library (pkg-config name 'v3dv3') is optional.  It has to be
+# looked up with required : false: dependency() is a hard requirement by
+# default, which would abort configuration on machines without the simulator
+# and make the found() check below pointless.  USE_V3D_SIMULATOR is only
+# defined when the library is actually present.
+v3dv3_c_args = []
+dep_v3dv3 = dependency('v3dv3', required : false)
+if dep_v3dv3.found()
+  v3dv3_c_args = '-DUSE_V3D_SIMULATOR'
+endif
+
+# Hardware versions the per-version sources are built for.
+v3d_versions = ['33', '41']
+
+# One static library per version: same sources, different V3D_VERSION define.
+per_version_libs = []
+foreach ver : v3d_versions
+  per_version_libs += static_library(
+    'v3d-v@0@'.format(ver),
+    [files_per_version, v3d_xml_pack, nir_opcodes_h, nir_builder_opcodes_h],
+    include_directories : [
+      inc_src, inc_include, inc_gallium, inc_gallium_aux, inc_broadcom,
+      inc_gallium_drivers, inc_drm_uapi,
+    ],
+    c_args : [c_vis_args, v3dv3_c_args, '-DV3D_VERSION=@0@'.format(ver)],
+    cpp_args : [cpp_vis_args],
+    dependencies : [dep_v3dv3, dep_libdrm, dep_valgrind],
+  )
+endforeach
+
+# Version-independent part of the driver.  The per-version libraries built
+# above are linked in, and the simulator define is passed to both the C and
+# the C++ compiler (the source list contains one .cpp file).
+libv3d = static_library(
+  'v3d',
+  [files_libv3d, v3d_xml_pack],
+  include_directories : [
+    inc_src, inc_include, inc_gallium, inc_gallium_aux, inc_broadcom,
+    inc_gallium_drivers, inc_drm_uapi,
+  ],
+  c_args : [c_vis_args, v3dv3_c_args],
+  cpp_args : [cpp_vis_args, v3dv3_c_args],
+  dependencies : [dep_v3dv3, dep_libdrm, dep_valgrind, idep_nir_headers],
+  link_with: per_version_libs,
+)
+
+# What a Gallium target pulls in to include this driver: the GALLIUM_V3D
+# define plus the driver, winsys and Broadcom support libraries.
+driver_v3d = declare_dependency(
+  compile_args : '-DGALLIUM_V3D',
+  link_with : [libv3d, libv3dwinsys, libbroadcom_cle, libbroadcom_v3d],
+  dependencies : idep_nir,
+)
diff --git a/src/gallium/drivers/v3d/v3d_blit.c b/src/gallium/drivers/v3d/v3d_blit.c
new file mode 100644
index 00000000000..7c67d4561ba
--- /dev/null
+++ b/src/gallium/drivers/v3d/v3d_blit.c
@@ -0,0 +1,302 @@
+/*
+ * Copyright © 2015-2017 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "util/u_format.h"
+#include "util/u_surface.h"
+#include "util/u_blitter.h"
+#include "v3d_context.h"
+
+#if 0
+/* Creates a pipe_surface for one miplevel (layer 0 only) of prsc, keeping
+ * the resource's own format.
+ */
+static struct pipe_surface *
+vc5_get_blit_surface(struct pipe_context *pctx,
+                     struct pipe_resource *prsc, unsigned level)
+{
+        struct pipe_surface surf_templ;
+
+        /* Start from an all-zero template so unset fields are 0. */
+        memset(&surf_templ, 0, sizeof(surf_templ));
+
+        surf_templ.u.tex.first_layer = 0;
+        surf_templ.u.tex.last_layer = 0;
+        surf_templ.u.tex.level = level;
+        surf_templ.format = prsc->format;
+
+        return pctx->create_surface(pctx, prsc, &surf_templ);
+}
+
+/* Returns true when any of the bits selected by (tile_size - 1) are set in
+ * size.  The callers in this file pass 32 or 64 as tile_size.
+ */
+static bool
+is_tile_unaligned(unsigned size, unsigned tile_size)
+{
+        const unsigned low_bits = size & (tile_size - 1);
+
+        return low_bits != 0;
+}
+
+/* Tries to perform the blit as a tile load/store job rather than a draw.
+ *
+ * Returns false, without having created a job, when any of the checks
+ * below rejects the blit; returns true once a job has been submitted.
+ * This function sits inside an "#if 0" region and is currently not built.
+ */
+static bool
+vc5_tile_blit(struct pipe_context *pctx, const struct pipe_blit_info *info)
+{
+        struct vc5_context *vc5 = vc5_context(pctx);
+        bool msaa = (info->src.resource->nr_samples > 1 ||
+                     info->dst.resource->nr_samples > 1);
+        int tile_width = msaa ? 32 : 64;
+        int tile_height = msaa ? 32 : 64;
+
+        if (util_format_is_depth_or_stencil(info->dst.resource->format))
+                return false;
+
+        if (info->scissor_enable)
+                return false;
+
+        if ((info->mask & PIPE_MASK_RGBA) == 0)
+                return false;
+
+        /* Source and destination boxes must coincide exactly (no offset,
+         * no scaling).
+         */
+        if (info->dst.box.x != info->src.box.x ||
+            info->dst.box.y != info->src.box.y ||
+            info->dst.box.width != info->src.box.width ||
+            info->dst.box.height != info->src.box.height) {
+                return false;
+        }
+
+        /* The box origin must be tile aligned; its width/height may only be
+         * unaligned when the box reaches the edge of the destination level.
+         */
+        int dst_surface_width = u_minify(info->dst.resource->width0,
+                                         info->dst.level);
+        int dst_surface_height = u_minify(info->dst.resource->height0,
+                                         info->dst.level);
+        if (is_tile_unaligned(info->dst.box.x, tile_width) ||
+            is_tile_unaligned(info->dst.box.y, tile_height) ||
+            (is_tile_unaligned(info->dst.box.width, tile_width) &&
+             info->dst.box.x + info->dst.box.width != dst_surface_width) ||
+            (is_tile_unaligned(info->dst.box.height, tile_height) &&
+             info->dst.box.y + info->dst.box.height != dst_surface_height)) {
+                return false;
+        }
+
+        /* VC5_PACKET_LOAD_TILE_BUFFER_GENERAL uses the
+         * VC5_PACKET_TILE_RENDERING_MODE_CONFIG's width (determined by our
+         * destination surface) to determine the stride.  This may be wrong
+         * when reading from texture miplevels > 0, which are stored in
+         * POT-sized areas.  For MSAA, the tile addresses are computed
+         * explicitly by the RCL, but still use the destination width to
+         * determine the stride (which could be fixed by explicitly supplying
+         * it in the ABI).
+         */
+        struct vc5_resource *rsc = vc5_resource(info->src.resource);
+
+        uint32_t stride;
+
+        if (info->src.resource->nr_samples > 1)
+                stride = align(dst_surface_width, 32) * 4 * rsc->cpp;
+        /* XXX else if (rsc->slices[info->src.level].tiling == VC5_TILING_FORMAT_T)
+           stride = align(dst_surface_width * rsc->cpp, 128); */
+        else
+                stride = align(dst_surface_width * rsc->cpp, 16);
+
+        if (stride != rsc->slices[info->src.level].stride)
+                return false;
+
+        if (info->dst.resource->format != info->src.resource->format)
+                return false;
+
+        /* Debug trace, compiled in but never executed. */
+        if (false) {
+                fprintf(stderr, "RCL blit from %d,%d to %d,%d (%d,%d)\n",
+                        info->src.box.x,
+                        info->src.box.y,
+                        info->dst.box.x,
+                        info->dst.box.y,
+                        info->dst.box.width,
+                        info->dst.box.height);
+        }
+
+        struct pipe_surface *dst_surf =
+                vc5_get_blit_surface(pctx, info->dst.resource, info->dst.level);
+        struct pipe_surface *src_surf =
+                vc5_get_blit_surface(pctx, info->src.resource, info->src.level);
+
+        vc5_flush_jobs_reading_resource(vc5, info->src.resource);
+
+        struct vc5_job *job = vc5_get_job(vc5, dst_surf, NULL);
+        pipe_surface_reference(&job->color_read, src_surf);
+
+        /* If we're resolving from MSAA to single sample, we still need to run
+         * the engine in MSAA mode for the load.
+         *
+         * NOTE(review): tile_width, tile_height and msaa are assigned again,
+         * unconditionally, a few lines further down.
+         */
+        if (!job->msaa && info->src.resource->nr_samples > 1) {
+                job->msaa = true;
+                job->tile_width = 32;
+                job->tile_height = 32;
+        }
+
+        job->draw_min_x = info->dst.box.x;
+        job->draw_min_y = info->dst.box.y;
+        job->draw_max_x = info->dst.box.x + info->dst.box.width;
+        job->draw_max_y = info->dst.box.y + info->dst.box.height;
+        job->draw_width = dst_surf->width;
+        job->draw_height = dst_surf->height;
+
+        job->tile_width = tile_width;
+        job->tile_height = tile_height;
+        job->msaa = msaa;
+        job->needs_flush = true;
+        job->resolve |= PIPE_CLEAR_COLOR;
+
+        vc5_job_submit(vc5, job);
+
+        /* Drop the local surface references taken above. */
+        pipe_surface_reference(&dst_surf, NULL);
+        pipe_surface_reference(&src_surf, NULL);
+
+        return true;
+}
+#endif
+
+/* Hands the context's currently bound state to u_blitter so it can be
+ * put back after a blitter operation.  Must be called before every
+ * util_blitter_* operation issued on vc5->blitter.
+ *
+ * Stream-output targets are saved exactly once; an earlier revision of this
+ * function repeated that call with identical arguments at the end.
+ */
+void
+vc5_blitter_save(struct vc5_context *vc5)
+{
+        util_blitter_save_fragment_constant_buffer_slot(vc5->blitter,
+                                                        vc5->constbuf[PIPE_SHADER_FRAGMENT].cb);
+        util_blitter_save_vertex_buffer_slot(vc5->blitter, vc5->vertexbuf.vb);
+        util_blitter_save_vertex_elements(vc5->blitter, vc5->vtx);
+        util_blitter_save_vertex_shader(vc5->blitter, vc5->prog.bind_vs);
+        util_blitter_save_so_targets(vc5->blitter, vc5->streamout.num_targets,
+                                     vc5->streamout.targets);
+        util_blitter_save_rasterizer(vc5->blitter, vc5->rasterizer);
+        util_blitter_save_viewport(vc5->blitter, &vc5->viewport);
+        util_blitter_save_scissor(vc5->blitter, &vc5->scissor);
+        util_blitter_save_fragment_shader(vc5->blitter, vc5->prog.bind_fs);
+        util_blitter_save_blend(vc5->blitter, vc5->blend);
+        util_blitter_save_depth_stencil_alpha(vc5->blitter, vc5->zsa);
+        util_blitter_save_stencil_ref(vc5->blitter, &vc5->stencil_ref);
+        util_blitter_save_sample_mask(vc5->blitter, vc5->sample_mask);
+        util_blitter_save_framebuffer(vc5->blitter, &vc5->framebuffer);
+        util_blitter_save_fragment_sampler_states(vc5->blitter,
+                        vc5->fragtex.num_samplers,
+                        (void **)vc5->fragtex.samplers);
+        util_blitter_save_fragment_sampler_views(vc5->blitter,
+                        vc5->fragtex.num_textures, vc5->fragtex.textures);
+}
+
+/* Performs the blit with u_blitter (i.e. as a draw).  Returns false, after
+ * logging the two formats to stderr, when u_blitter reports the blit as
+ * unsupported; returns true once the blit has been issued.
+ */
+static bool
+vc5_render_blit(struct pipe_context *ctx, struct pipe_blit_info *info)
+{
+        struct vc5_context *vc5 = vc5_context(ctx);
+        const bool supported =
+                util_blitter_is_blit_supported(vc5->blitter, info);
+
+        if (supported) {
+                /* u_blitter needs the bound state saved first. */
+                vc5_blitter_save(vc5);
+                util_blitter_blit(vc5->blitter, info);
+        } else {
+                fprintf(stderr, "blit unsupported %s -> %s\n",
+                    util_format_short_name(info->src.resource->format),
+                    util_format_short_name(info->dst.resource->format));
+        }
+
+        return supported;
+}
+
+/* Implement stencil blits by reinterpreting the stencil data as an RGBA8888
+ * or R8 texture.
+ *
+ * When a resource has a separate stencil resource, that one is blitted as
+ * R8; otherwise the resource itself is viewed as RGBA8888.  Only the R
+ * channel is written (PIPE_MASK_R).
+ */
+static void
+vc5_stencil_blit(struct pipe_context *ctx, const struct pipe_blit_info *info)
+{
+        struct vc5_context *vc5 = vc5_context(ctx);
+        struct vc5_resource *src = vc5_resource(info->src.resource);
+        struct vc5_resource *dst = vc5_resource(info->dst.resource);
+        enum pipe_format src_format, dst_format;
+
+        if (src->separate_stencil) {
+                src = src->separate_stencil;
+                src_format = PIPE_FORMAT_R8_UNORM;
+        } else {
+                src_format = PIPE_FORMAT_RGBA8888_UNORM;
+        }
+
+        if (dst->separate_stencil) {
+                dst = dst->separate_stencil;
+                dst_format = PIPE_FORMAT_R8_UNORM;
+        } else {
+                dst_format = PIPE_FORMAT_RGBA8888_UNORM;
+        }
+
+        /* Initialize the surface. */
+        struct pipe_surface dst_tmpl = {
+                .u.tex = {
+                        .level = info->dst.level,
+                        .first_layer = info->dst.box.z,
+                        .last_layer = info->dst.box.z,
+                },
+                .format = dst_format,
+        };
+        struct pipe_surface *dst_surf =
+                ctx->create_surface(ctx, &dst->base, &dst_tmpl);
+
+        /* Initialize the sampler view.
+         *
+         * The layer range covers the minified depth for 3D textures and the
+         * array size for everything else.  The test has to look at the
+         * resource's target: the bare PIPE_TEXTURE_3D enum value is a
+         * non-zero constant, so testing it directly always picked the 3D
+         * branch.
+         */
+        struct pipe_sampler_view src_tmpl = {
+                .target = src->base.target,
+                .format = src_format,
+                .u.tex = {
+                        .first_level = info->src.level,
+                        .last_level = info->src.level,
+                        .first_layer = 0,
+                        .last_layer = (src->base.target == PIPE_TEXTURE_3D ?
+                                       u_minify(src->base.depth0,
+                                                info->src.level) - 1 :
+                                       src->base.array_size - 1),
+                },
+                .swizzle_r = PIPE_SWIZZLE_X,
+                .swizzle_g = PIPE_SWIZZLE_Y,
+                .swizzle_b = PIPE_SWIZZLE_Z,
+                .swizzle_a = PIPE_SWIZZLE_W,
+        };
+        struct pipe_sampler_view *src_view =
+                ctx->create_sampler_view(ctx, &src->base, &src_tmpl);
+
+        vc5_blitter_save(vc5);
+        util_blitter_blit_generic(vc5->blitter, dst_surf, &info->dst.box,
+                                  src_view, &info->src.box,
+                                  src->base.width0, src->base.height0,
+                                  PIPE_MASK_R,
+                                  PIPE_TEX_FILTER_NEAREST,
+                                  info->scissor_enable ? &info->scissor : NULL,
+                                  info->alpha_blend);
+
+        /* Drop the temporary surface and view created above. */
+        pipe_surface_reference(&dst_surf, NULL);
+        pipe_sampler_view_reference(&src_view, NULL);
+}
+
+/* pipe_context blit entry point.
+ * Scaling, format conversion, up- and downsampling (resolve) are allowed.
+ *
+ * Stencil is handled first by its own path; whatever remains in the mask
+ * afterwards goes through the render (u_blitter) path.
+ */
+void
+vc5_blit(struct pipe_context *pctx, const struct pipe_blit_info *blit_info)
+{
+        /* Private copy so the caller's const blit description stays intact
+         * while the mask is edited.
+         */
+        struct pipe_blit_info remaining = *blit_info;
+        const bool wants_stencil = (remaining.mask & PIPE_MASK_S) != 0;
+
+        if (wants_stencil) {
+                vc5_stencil_blit(pctx, blit_info);
+                remaining.mask &= ~PIPE_MASK_S;
+        }
+
+#if 0
+        if (vc5_tile_blit(pctx, blit_info))
+                return;
+#endif
+
+        vc5_render_blit(pctx, &remaining);
+}
diff --git a/src/gallium/drivers/v3d/v3d_bufmgr.c b/src/gallium/drivers/v3d/v3d_bufmgr.c
new file mode 100644
index 00000000000..ef2a5fa07be
--- /dev/null
+++ b/src/gallium/drivers/v3d/v3d_bufmgr.c
@@ -0,0 +1,552 @@
+/*
+ * Copyright © 2014-2017 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <errno.h>
+#include <err.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <xf86drm.h>
+#include <xf86drmMode.h>
+
+#include "util/u_hash_table.h"
+#include "util/u_memory.h"
+#include "util/ralloc.h"
+
+#include "v3d_context.h"
+#include "v3d_screen.h"
+
+/* VG(x) expands to x when HAVE_VALGRIND is defined and to nothing otherwise,
+ * so Valgrind client requests can be written inline without #ifdefs.
+ */
+#ifdef HAVE_VALGRIND
+#include <valgrind.h>
+#include <memcheck.h>
+#define VG(x) x
+#else
+#define VG(x)
+#endif
+
+/* When true, the allocation and free paths in this file print BO statistics
+ * to stderr.  Compile-time switch; nothing visible here sets it at runtime.
+ */
+static bool dump_stats = false;
+
+/* Forward declaration: vc5_bo_alloc() uses this before its definition. */
+static void
+vc5_bo_cache_free_all(struct vc5_bo_cache *cache);
+
+/* Prints BO accounting for the screen to stderr: totals for all BOs, totals
+ * for the cached ones and, when the cache is not empty, the free times of
+ * the entries at the front and back of the time list plus the current
+ * CLOCK_MONOTONIC second.
+ */
+static void
+vc5_bo_dump_stats(struct vc5_screen *screen)
+{
+        struct vc5_bo_cache *cache = &screen->bo_cache;
+
+        fprintf(stderr, "  BOs allocated:   %d\n", screen->bo_count);
+        fprintf(stderr, "  BOs size:        %dkb\n", screen->bo_size / 1024);
+        fprintf(stderr, "  BOs cached:      %d\n", cache->bo_count);
+        fprintf(stderr, "  BOs cached size: %dkb\n", cache->bo_size / 1024);
+
+        if (!list_empty(&cache->time_list)) {
+                struct vc5_bo *first = LIST_ENTRY(struct vc5_bo,
+                                                  cache->time_list.next,
+                                                  time_list);
+                struct vc5_bo *last = LIST_ENTRY(struct vc5_bo,
+                                                  cache->time_list.prev,
+                                                  time_list);
+
+                fprintf(stderr, "  oldest cache time: %ld\n",
+                        (long)first->free_time);
+                fprintf(stderr, "  newest cache time: %ld\n",
+                        (long)last->free_time);
+
+                struct timespec time;
+                clock_gettime(CLOCK_MONOTONIC, &time);
+                /* time_t is not necessarily long; cast so the argument
+                 * matches %ld, as the two prints above already do.
+                 */
+                fprintf(stderr, "  now:               %ld\n",
+                        (long)time.tv_sec);
+        }
+}
+
+/* Takes bo out of the cache: adjusts the cache totals and unlinks the BO
+ * from both the per-size list and the time-ordered list.
+ */
+static void
+vc5_bo_remove_from_cache(struct vc5_bo_cache *cache, struct vc5_bo *bo)
+{
+        cache->bo_size -= bo->size;
+        cache->bo_count--;
+
+        list_del(&bo->size_list);
+        list_del(&bo->time_list);
+}
+
+/* Tries to reuse a cached BO from the size bucket matching `size`.
+ *
+ * Returns NULL when the bucket does not exist, is empty, or its first BO is
+ * still busy.  On success the BO is removed from the cache, gets a fresh
+ * reference count of 1 and is renamed to `name`.
+ */
+static struct vc5_bo *
+vc5_bo_from_cache(struct vc5_screen *screen, uint32_t size, const char *name)
+{
+        struct vc5_bo_cache *cache = &screen->bo_cache;
+        uint32_t bucket = size / 4096 - 1;
+        struct vc5_bo *found = NULL;
+
+        if (bucket >= cache->size_list_size)
+                return NULL;
+
+        mtx_lock(&cache->lock);
+        if (!list_empty(&cache->size_list[bucket])) {
+                struct vc5_bo *candidate =
+                        LIST_ENTRY(struct vc5_bo,
+                                   cache->size_list[bucket].next, size_list);
+
+                /* Only hand the BO out once it has gone idle.  A busy BO
+                 * makes us report a miss so the caller allocates a new one,
+                 * since we assume the user will CPU map it and fill it
+                 * right away.
+                 */
+                if (vc5_bo_wait(candidate, 0, NULL)) {
+                        pipe_reference_init(&candidate->reference, 1);
+                        vc5_bo_remove_from_cache(cache, candidate);
+                        candidate->name = name;
+                        found = candidate;
+                }
+        }
+        mtx_unlock(&cache->lock);
+
+        return found;
+}
+
+/* Allocates a BO of at least `size` bytes (rounded up to 4096), labelled
+ * `name`.
+ *
+ * A matching idle BO from the cache is preferred.  Otherwise a new BO is
+ * created through DRM_IOCTL_V3D_CREATE_BO; if that fails while the cache
+ * still holds BOs, the cache is emptied and creation is retried once.
+ * Returns NULL when allocation fails.
+ */
+struct vc5_bo *
+vc5_bo_alloc(struct vc5_screen *screen, uint32_t size, const char *name)
+{
+        struct vc5_bo *bo;
+        int ret;
+
+        size = align(size, 4096);
+
+        bo = vc5_bo_from_cache(screen, size, name);
+        if (bo) {
+                if (dump_stats) {
+                        fprintf(stderr, "Allocated %s %dkb from cache:\n",
+                                name, size / 1024);
+                        vc5_bo_dump_stats(screen);
+                }
+                return bo;
+        }
+
+        bo = CALLOC_STRUCT(vc5_bo);
+        if (!bo)
+                return NULL;
+
+        pipe_reference_init(&bo->reference, 1);
+        bo->screen = screen;
+        bo->size = size;
+        bo->name = name;
+        bo->private = true;
+
+        /* Must be declared before the retry label: a declaration placed
+         * after it is re-initialised to false on every "goto retry", which
+         * defeats the retry-only-once guard.
+         */
+        bool cleared_and_retried = false;
+
+ retry:
+        ;
+
+        struct drm_v3d_create_bo create = {
+                .size = size
+        };
+
+        ret = vc5_ioctl(screen->fd, DRM_IOCTL_V3D_CREATE_BO, &create);
+        if (ret != 0) {
+                if (!list_empty(&screen->bo_cache.time_list) &&
+                    !cleared_and_retried) {
+                        cleared_and_retried = true;
+                        vc5_bo_cache_free_all(&screen->bo_cache);
+                        goto retry;
+                }
+
+                free(bo);
+                return NULL;
+        }
+
+        /* Only trust the ioctl's outputs once it has succeeded. */
+        bo->handle = create.handle;
+        bo->offset = create.offset;
+
+        screen->bo_count++;
+        screen->bo_size += bo->size;
+        if (dump_stats) {
+                fprintf(stderr, "Allocated %s %dkb:\n", name, size / 1024);
+                vc5_bo_dump_stats(screen);
+        }
+
+        return bo;
+}
+
+/* Called when the last reference to bo goes away: takes the BO cache lock
+ * and hands the BO to the locked variant together with the current
+ * CLOCK_MONOTONIC time in seconds.
+ */
+void
+vc5_bo_last_unreference(struct vc5_bo *bo)
+{
+        /* Look the cache up before the call below; bo is not touched
+         * afterwards.
+         */
+        struct vc5_bo_cache *cache = &bo->screen->bo_cache;
+        struct timespec now;
+
+        clock_gettime(CLOCK_MONOTONIC, &now);
+
+        mtx_lock(&cache->lock);
+        vc5_bo_last_unreference_locked_timed(bo, now.tv_sec);
+        mtx_unlock(&cache->lock);
+}
+
+/* Destroys a BO for good: releases its CPU mapping (if any), closes the GEM
+ * handle, updates the screen's totals and frees the struct.
+ */
+static void
+vc5_bo_free(struct vc5_bo *bo)
+{
+        struct vc5_screen *screen = bo->screen;
+        struct drm_gem_close close_args;
+
+        if (bo->map) {
+                /* Under the simulator, "winsys" BOs get their map from
+                 * malloc() rather than mmap().
+                 */
+                bool malloced_map = using_vc5_simulator && bo->name &&
+                                    strcmp(bo->name, "winsys") == 0;
+
+                if (malloced_map) {
+                        free(bo->map);
+                } else {
+                        munmap(bo->map, bo->size);
+                        VG(VALGRIND_FREELIKE_BLOCK(bo->map, 0));
+                }
+        }
+
+        memset(&close_args, 0, sizeof(close_args));
+        close_args.handle = bo->handle;
+        if (vc5_ioctl(screen->fd, DRM_IOCTL_GEM_CLOSE, &close_args) != 0)
+                fprintf(stderr, "close object %d: %s\n", bo->handle, strerror(errno));
+
+        screen->bo_size -= bo->size;
+        screen->bo_count--;
+
+        if (dump_stats) {
+                /* Cached BOs have a NULL name; print nothing for them. */
+                const char *label = bo->name ? bo->name : "";
+                const char *separator = bo->name ? " " : "";
+
+                fprintf(stderr, "Freed %s%s%dkb:\n",
+                        label, separator, bo->size / 1024);
+                vc5_bo_dump_stats(screen);
+        }
+
+        free(bo);
+}
+
+/* Frees cached BOs whose free_time is more than two seconds older than
+ * `time` (both in seconds).
+ *
+ * BOs are appended to the tail of time_list as they are cached, so the
+ * walk stops at the first entry that is still fresh.  With dump_stats
+ * enabled, the "Freeing"/"Freed" banners are only printed when at least
+ * one BO is actually released.
+ */
+static void
+free_stale_bos(struct vc5_screen *screen, time_t time)
+{
+        struct vc5_bo_cache *cache = &screen->bo_cache;
+        bool freed_any = false;
+
+        list_for_each_entry_safe(struct vc5_bo, bo, &cache->time_list,
+                                 time_list) {
+                /* Everything after the first fresh BO is fresh as well. */
+                if (time - bo->free_time <= 2)
+                        break;
+
+                if (dump_stats && !freed_any) {
+                        fprintf(stderr, "Freeing stale BOs:\n");
+                        vc5_bo_dump_stats(screen);
+                }
+                freed_any = true;
+
+                vc5_bo_remove_from_cache(cache, bo);
+                vc5_bo_free(bo);
+        }
+
+        if (dump_stats && freed_any) {
+                fprintf(stderr, "Freed stale BOs:\n");
+                vc5_bo_dump_stats(screen);
+        }
+}
+
+/* Empties the BO cache under the cache lock, destroying every cached BO. */
+static void
+vc5_bo_cache_free_all(struct vc5_bo_cache *cache)
+{
+        mtx_lock(&cache->lock);
+
+        /* Each pass unlinks and frees the current head of the time list,
+         * so the loop ends once the list has drained.
+         */
+        while (!list_empty(&cache->time_list)) {
+                struct vc5_bo *victim = LIST_ENTRY(struct vc5_bo,
+                                                   cache->time_list.next,
+                                                   time_list);
+
+                vc5_bo_remove_from_cache(cache, victim);
+                vc5_bo_free(victim);
+        }
+
+        mtx_unlock(&cache->lock);
+}
+
+/* Disposes of a BO whose last reference was dropped.
+ *
+ * vc5_bo_last_unreference() calls this with the BO cache lock held.  `time`
+ * is recorded as the BO's free time and is also the reference point for
+ * expiring stale cache entries.
+ *
+ * BOs that are not private are freed immediately.  Private BOs are put into
+ * the cache: onto the size list indexed by (size / 4096 - 1), which is grown
+ * on demand, and onto the tail of the time list.
+ */
+void
+vc5_bo_last_unreference_locked_timed(struct vc5_bo *bo, time_t time)
+{
+        struct vc5_screen *screen = bo->screen;
+        struct vc5_bo_cache *cache = &screen->bo_cache;
+        uint32_t page_index = bo->size / 4096 - 1;
+
+        if (!bo->private) {
+                vc5_bo_free(bo);
+                return;
+        }
+
+        if (cache->size_list_size <= page_index) {
+                /* NOTE(review): the allocation result is not checked and the
+                 * previous array is not released here; the new array is
+                 * allocated with screen as its ralloc context.
+                 */
+                struct list_head *new_list =
+                        ralloc_array(screen, struct list_head, page_index + 1);
+
+                /* Move old list contents over (since the array has moved, and
+                 * therefore the pointers to the list heads have to change).
+                 */
+                for (int i = 0; i < cache->size_list_size; i++) {
+                        struct list_head *old_head = &cache->size_list[i];
+                        if (list_empty(old_head))
+                                list_inithead(&new_list[i]);
+                        else {
+                                /* Splice the new head in where the old head
+                                 * was: copy its links, then repoint both
+                                 * neighbours at the new head.
+                                 */
+                                new_list[i].next = old_head->next;
+                                new_list[i].prev = old_head->prev;
+                                new_list[i].next->prev = &new_list[i];
+                                new_list[i].prev->next = &new_list[i];
+                        }
+                }
+                /* Buckets that did not exist before start out empty. */
+                for (int i = cache->size_list_size; i < page_index + 1; i++)
+                        list_inithead(&new_list[i]);
+
+                cache->size_list = new_list;
+                cache->size_list_size = page_index + 1;
+        }
+
+        bo->free_time = time;
+        list_addtail(&bo->size_list, &cache->size_list[page_index]);
+        list_addtail(&bo->time_list, &cache->time_list);
+        cache->bo_count++;
+        cache->bo_size += bo->size;
+        if (dump_stats) {
+                fprintf(stderr, "Freed %s %dkb to cache:\n",
+                        bo->name, bo->size / 1024);
+                vc5_bo_dump_stats(screen);
+        }
+        /* Cached BOs carry no name until they are handed out again. */
+        bo->name = NULL;
+
+        free_stale_bos(screen, time);
+}
+
+/* Returns the vc5_bo wrapping an imported GEM handle, creating it on first
+ * use.
+ *
+ * Imported BOs are tracked in screen->bo_handles, keyed by handle: a handle
+ * that is already known just gains a reference.  A new BO is named
+ * "winsys", marked as not private, and has its offset queried with
+ * DRM_IOCTL_V3D_GET_BO_OFFSET.
+ *
+ * Returns NULL on allocation or ioctl failure.  Every exit goes through
+ * "done" so that bo_handles_mutex is released on the error paths too.
+ */
+static struct vc5_bo *
+vc5_bo_open_handle(struct vc5_screen *screen,
+                   uint32_t winsys_stride,
+                   uint32_t handle, uint32_t size)
+{
+        struct vc5_bo *bo;
+
+        assert(size);
+
+        mtx_lock(&screen->bo_handles_mutex);
+
+        bo = util_hash_table_get(screen->bo_handles, (void*)(uintptr_t)handle);
+        if (bo) {
+                pipe_reference(NULL, &bo->reference);
+                goto done;
+        }
+
+        bo = CALLOC_STRUCT(vc5_bo);
+        if (!bo)
+                goto done;
+
+        pipe_reference_init(&bo->reference, 1);
+        bo->screen = screen;
+        bo->handle = handle;
+        bo->size = size;
+        bo->name = "winsys";
+        bo->private = false;
+
+#ifdef USE_V3D_SIMULATOR
+        vc5_simulator_open_from_handle(screen->fd, winsys_stride,
+                                       bo->handle, bo->size);
+        bo->map = malloc(bo->size);
+#endif
+
+        struct drm_v3d_get_bo_offset get = {
+                .handle = handle,
+        };
+        int ret = vc5_ioctl(screen->fd, DRM_IOCTL_V3D_GET_BO_OFFSET, &get);
+        if (ret) {
+                fprintf(stderr, "Failed to get BO offset: %s\n",
+                        strerror(errno));
+                /* bo->map is only non-NULL in simulator builds, where it was
+                 * malloc()ed above; free(NULL) is a no-op otherwise.
+                 */
+                free(bo->map);
+                free(bo);
+                bo = NULL;
+                goto done;
+        }
+        bo->offset = get.offset;
+        assert(bo->offset != 0);
+
+        util_hash_table_set(screen->bo_handles, (void *)(uintptr_t)handle, bo);
+
+done:
+        mtx_unlock(&screen->bo_handles_mutex);
+        return bo;
+}
+
+/**
+ * Opens the BO identified by a GEM name via DRM_IOCTL_GEM_OPEN.
+ *
+ * Returns the result of vc5_bo_open_handle() for the opened handle, or NULL
+ * if the ioctl fails.
+ */
+struct vc5_bo *
+vc5_bo_open_name(struct vc5_screen *screen, uint32_t name,
+                 uint32_t winsys_stride)
+{
+        struct drm_gem_open open_args = {
+                .name = name
+        };
+
+        if (vc5_ioctl(screen->fd, DRM_IOCTL_GEM_OPEN, &open_args)) {
+                fprintf(stderr, "Failed to open bo %d: %s\n",
+                        name, strerror(errno));
+                return NULL;
+        }
+
+        return vc5_bo_open_handle(screen, winsys_stride,
+                                  open_args.handle, open_args.size);
+}
+
+/**
+ * Imports a dmabuf file descriptor as a vc5_bo.
+ *
+ * The fd is converted to a GEM handle with drmPrimeFDToHandle() and its size
+ * is taken from lseek(fd, 0, SEEK_END).  Returns NULL if either step fails.
+ */
+struct vc5_bo *
+vc5_bo_open_dmabuf(struct vc5_screen *screen, int fd, uint32_t winsys_stride)
+{
+        uint32_t handle;
+
+        if (drmPrimeFDToHandle(screen->fd, fd, &handle)) {
+                fprintf(stderr, "Failed to get vc5 handle for dmabuf %d\n", fd);
+                return NULL;
+        }
+
+        /* Determine the size of the bo we were handed. */
+        int size = lseek(fd, 0, SEEK_END);
+        if (size == -1) {
+                fprintf(stderr, "Couldn't get size of dmabuf fd %d.\n", fd);
+                return NULL;
+        }
+
+        return vc5_bo_open_handle(screen, winsys_stride, handle, size);
+}
+
+/**
+ * Exports the BO as a dmabuf file descriptor (opened with O_CLOEXEC).
+ *
+ * On success the BO is marked non-private and recorded in the screen's
+ * handle table.  Returns the new fd, or -1 if the export fails.
+ */
+int
+vc5_bo_get_dmabuf(struct vc5_bo *bo)
+{
+        struct vc5_screen *screen = bo->screen;
+        int fd;
+
+        if (drmPrimeHandleToFD(screen->fd, bo->handle, O_CLOEXEC, &fd) != 0) {
+                fprintf(stderr, "Failed to export gem bo %d to dmabuf\n",
+                        bo->handle);
+                return -1;
+        }
+
+        mtx_lock(&screen->bo_handles_mutex);
+        bo->private = false;
+        util_hash_table_set(screen->bo_handles,
+                            (void *)(uintptr_t)bo->handle, bo);
+        mtx_unlock(&screen->bo_handles_mutex);
+
+        return fd;
+}
+
+/**
+ * Publishes the BO under a GEM name via DRM_IOCTL_GEM_FLINK.
+ *
+ * On success the BO is marked non-private, the name is stored in *name and
+ * true is returned.  On failure false is returned and *name is not written.
+ *
+ * The BO belongs to the caller and is reference counted, so it is left
+ * untouched on failure: freeing it here would bypass the refcount and leave
+ * the caller holding a dangling pointer.
+ */
+bool
+vc5_bo_flink(struct vc5_bo *bo, uint32_t *name)
+{
+        struct drm_gem_flink flink = {
+                .handle = bo->handle,
+        };
+        int ret = vc5_ioctl(bo->screen->fd, DRM_IOCTL_GEM_FLINK, &flink);
+        if (ret) {
+                fprintf(stderr, "Failed to flink bo %d: %s\n",
+                        bo->handle, strerror(errno));
+                return false;
+        }
+
+        bo->private = false;
+        *name = flink.name;
+
+        return true;
+}
+
+/**
+ * Issues DRM_IOCTL_V3D_WAIT_BO for a handle with the given timeout.
+ *
+ * Returns 0 unless the ioctl returns -1, in which case -errno is returned.
+ */
+static int vc5_wait_bo_ioctl(int fd, uint32_t handle, uint64_t timeout_ns)
+{
+        struct drm_v3d_wait_bo wait = {
+                .handle = handle,
+                .timeout_ns = timeout_ns,
+        };
+
+        if (vc5_ioctl(fd, DRM_IOCTL_V3D_WAIT_BO, &wait) == -1)
+                return -errno;
+
+        return 0;
+}
+
+/**
+ * Waits up to timeout_ns for the BO.
+ *
+ * Returns true when the wait succeeds and false when it times out (-ETIME).
+ * Any other error is reported on stderr and aborts the process.
+ *
+ * With V3D_DEBUG_PERF set and both a non-zero timeout and a reason given, a
+ * zero-timeout probe is made first so that a wait that would block can be
+ * logged together with its reason.
+ */
+bool
+vc5_bo_wait(struct vc5_bo *bo, uint64_t timeout_ns, const char *reason)
+{
+        int fd = bo->screen->fd;
+
+        if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF) && timeout_ns && reason &&
+            vc5_wait_bo_ioctl(fd, bo->handle, 0) == -ETIME) {
+                fprintf(stderr, "Blocking on %s BO for %s\n",
+                        bo->name, reason);
+        }
+
+        int ret = vc5_wait_bo_ioctl(fd, bo->handle, timeout_ns);
+        if (ret == 0)
+                return true;
+
+        if (ret != -ETIME) {
+                fprintf(stderr, "wait failed: %d\n", ret);
+                abort();
+        }
+
+        return false;
+}
+
+/**
+ * Returns a CPU mapping of the BO without waiting on it.
+ *
+ * The mapping is created on first use (DRM_IOCTL_V3D_MMAP_BO to obtain the
+ * offset, then mmap() on the screen fd) and cached in bo->map.  A failure of
+ * either step aborts the process.
+ */
+void *
+vc5_bo_map_unsynchronized(struct vc5_bo *bo)
+{
+        if (bo->map)
+                return bo->map;
+
+        struct drm_v3d_mmap_bo args;
+        memset(&args, 0, sizeof(args));
+        args.handle = bo->handle;
+
+        int ret = vc5_ioctl(bo->screen->fd, DRM_IOCTL_V3D_MMAP_BO, &args);
+        uint64_t offset = args.offset;
+        if (ret != 0) {
+                fprintf(stderr, "map ioctl failure\n");
+                abort();
+        }
+
+        bo->map = mmap(NULL, bo->size, PROT_READ | PROT_WRITE, MAP_SHARED,
+                       bo->screen->fd, offset);
+        if (bo->map == MAP_FAILED) {
+                fprintf(stderr, "mmap of bo %d (offset 0x%016llx, size %d) failed\n",
+                        bo->handle, (long long)offset, bo->size);
+                abort();
+        }
+        VG(VALGRIND_MALLOCLIKE_BLOCK(bo->map, bo->size, 0, false));
+
+        return bo->map;
+}
+
+/**
+ * Maps the BO and then waits on it with PIPE_TIMEOUT_INFINITE before
+ * returning the mapping.  Aborts the process if the wait reports failure.
+ */
+void *
+vc5_bo_map(struct vc5_bo *bo)
+{
+        void *ptr = vc5_bo_map_unsynchronized(bo);
+
+        if (!vc5_bo_wait(bo, PIPE_TIMEOUT_INFINITE, "bo map")) {
+                fprintf(stderr, "BO wait for map failed\n");
+                abort();
+        }
+
+        return ptr;
+}
+
+/**
+ * Frees everything held in the screen's BO cache and, when dump_stats is
+ * set, prints the BO statistics afterwards.
+ */
+void
+vc5_bufmgr_destroy(struct pipe_screen *pscreen)
+{
+        struct vc5_screen *screen = vc5_screen(pscreen);
+
+        vc5_bo_cache_free_all(&screen->bo_cache);
+
+        if (!dump_stats)
+                return;
+
+        fprintf(stderr, "BO stats after screen destroy:\n");
+        vc5_bo_dump_stats(screen);
+}
diff --git a/src/gallium/drivers/v3d/v3d_bufmgr.h b/src/gallium/drivers/v3d/v3d_bufmgr.h
new file mode 100644
index 00000000000..4519a206026
--- /dev/null
+++ b/src/gallium/drivers/v3d/v3d_bufmgr.h
@@ -0,0 +1,140 @@
+/*
+ * Copyright © 2014 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef VC5_BUFMGR_H
+#define VC5_BUFMGR_H
+
+#include <stdint.h>
+#include "util/u_hash_table.h"
+#include "util/u_inlines.h"
+#include "util/list.h"
+#include "v3d_screen.h"
+
+struct vc5_context;
+
+/** A reference-counted wrapper around one GEM buffer object. */
+struct vc5_bo {
+        /** Reference count, manipulated through pipe_reference(). */
+        struct pipe_reference reference;
+        /** Screen the BO belongs to (provides the DRM fd and handle table). */
+        struct vc5_screen *screen;
+        /** Cached CPU mapping, or NULL if the BO has not been mapped yet. */
+        void *map;
+        /** Debug label (e.g. "winsys", "CL"); printed in stats and messages. */
+        const char *name;
+        /** GEM handle of the BO on the screen's fd. */
+        uint32_t handle;
+        /** Size of the BO in bytes. */
+        uint32_t size;
+
+        /* Address of the BO in our page tables. */
+        uint32_t offset;
+
+        /** Entry in the linked list of buffers freed, by age. */
+        struct list_head time_list;
+        /** Entry in the per-page-count linked list of buffers freed (by age). */
+        struct list_head size_list;
+        /** Approximate second when the bo was freed. */
+        time_t free_time;
+        /**
+         * Whether only our process has a reference to the BO (meaning that
+         * it's safe to reuse it in the BO cache).
+         */
+        bool private;
+};
+
+/* Allocates a BO; "name" is the debug label stored in vc5_bo::name. */
+struct vc5_bo *vc5_bo_alloc(struct vc5_screen *screen, uint32_t size,
+                            const char *name);
+/* Called by the unreference helpers once the refcount has dropped to zero. */
+void vc5_bo_last_unreference(struct vc5_bo *bo);
+/* As above, with the caller supplying the time recorded as the free time. */
+void vc5_bo_last_unreference_locked_timed(struct vc5_bo *bo, time_t time);
+/* Opens a BO from a GEM (flink) name; returns NULL on failure. */
+struct vc5_bo *vc5_bo_open_name(struct vc5_screen *screen, uint32_t name,
+                                uint32_t winsys_stride);
+/* Imports a BO from a dmabuf fd; returns NULL on failure. */
+struct vc5_bo *vc5_bo_open_dmabuf(struct vc5_screen *screen, int fd,
+                                  uint32_t winsys_stride);
+/* Publishes the BO under a GEM name stored in *name; returns success. */
+bool vc5_bo_flink(struct vc5_bo *bo, uint32_t *name);
+/* Exports the BO as a dmabuf fd; returns -1 on failure. */
+int vc5_bo_get_dmabuf(struct vc5_bo *bo);
+
+/**
+ * Makes *old_bo point at new_bo, moving one reference from the BO previously
+ * held in *old_bo to new_bo and releasing the old BO if that was its last
+ * reference.
+ */
+static inline void
+vc5_bo_set_reference(struct vc5_bo **old_bo, struct vc5_bo *new_bo)
+{
+        struct vc5_bo *prev = *old_bo;
+
+        if (pipe_reference(&prev->reference, &new_bo->reference))
+                vc5_bo_last_unreference(prev);
+
+        *old_bo = new_bo;
+}
+
+/** Takes an additional reference on the BO and returns it. */
+static inline struct vc5_bo *
+vc5_bo_reference(struct vc5_bo *bo)
+{
+        struct pipe_reference *ref = &bo->reference;
+
+        pipe_reference(NULL, ref);
+        return bo;
+}
+
+/**
+ * Drops the reference held in *bo (if any) and sets *bo to NULL.
+ *
+ * For non-private BOs the drop happens under bo_handles_mutex, and the BO is
+ * removed from the screen's handle table when its last reference goes away.
+ */
+static inline void
+vc5_bo_unreference(struct vc5_bo **bo)
+{
+        struct vc5_bo *b = *bo;
+
+        if (!b)
+                return;
+
+        if (!b->private) {
+                struct vc5_screen *screen = b->screen;
+
+                mtx_lock(&screen->bo_handles_mutex);
+
+                if (pipe_reference(&b->reference, NULL)) {
+                        util_hash_table_remove(screen->bo_handles,
+                                               (void *)(uintptr_t)b->handle);
+                        vc5_bo_last_unreference(b);
+                }
+
+                mtx_unlock(&screen->bo_handles_mutex);
+        } else {
+                /* Avoid the mutex for private BOs */
+                if (pipe_reference(&b->reference, NULL))
+                        vc5_bo_last_unreference(b);
+        }
+
+        *bo = NULL;
+}
+
+/**
+ * Drops the reference held in *bo (if any) and sets *bo to NULL, passing
+ * "time" on to vc5_bo_last_unreference_locked_timed() when this was the
+ * last reference.
+ */
+static inline void
+vc5_bo_unreference_locked_timed(struct vc5_bo **bo, time_t time)
+{
+        struct vc5_bo *b = *bo;
+
+        if (!b)
+                return;
+
+        if (pipe_reference(&b->reference, NULL))
+                vc5_bo_last_unreference_locked_timed(b, time);
+
+        *bo = NULL;
+}
+
+/* Maps the BO for CPU access after waiting on it. */
+void *
+vc5_bo_map(struct vc5_bo *bo);
+
+/* Maps the BO for CPU access without waiting on it. */
+void *
+vc5_bo_map_unsynchronized(struct vc5_bo *bo);
+
+/* Waits on the BO; returns false on timeout.  "reason" is used for logging. */
+bool
+vc5_bo_wait(struct vc5_bo *bo, uint64_t timeout_ns, const char *reason);
+
+/* NOTE(review): no definition is visible alongside this header's users in
+ * this part of the change -- confirm it is still implemented. */
+bool
+vc5_wait_seqno(struct vc5_screen *screen, uint64_t seqno, uint64_t timeout_ns,
+               const char *reason);
+
+/* Frees the screen's BO cache. */
+void
+vc5_bufmgr_destroy(struct pipe_screen *pscreen);
+
+#endif /* VC5_BUFMGR_H */
+
diff --git a/src/gallium/drivers/v3d/v3d_cl.c b/src/gallium/drivers/v3d/v3d_cl.c
new file mode 100644
index 00000000000..2ffb7ea9a2c
--- /dev/null
+++ b/src/gallium/drivers/v3d/v3d_cl.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright © 2014-2017 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "util/u_math.h"
+#include "util/ralloc.h"
+#include "v3d_context.h"
+/* The branching packets are the same across V3D versions. */
+#define V3D_VERSION 33
+#include "broadcom/common/v3d_macros.h"
+#include "broadcom/cle/v3dx_pack.h"
+
+/**
+ * Resets a command list to the empty state (no storage, zero size) and
+ * associates it with the job.  cl->bo is not touched here.
+ */
+void
+vc5_init_cl(struct vc5_job *job, struct vc5_cl *cl)
+{
+        cl->job = job;
+        cl->size = 0;
+        cl->base = NULL;
+        cl->next = NULL;
+}
+
+/**
+ * Makes sure "space" bytes are available in the CL at the given alignment.
+ *
+ * If the aligned write offset still fits, the write pointer is moved there
+ * and the offset is returned.  Otherwise the current BO is released, a new
+ * one of align(space, 4096) bytes is allocated and mapped, and 0 is returned.
+ */
+uint32_t
+vc5_cl_ensure_space(struct vc5_cl *cl, uint32_t space, uint32_t alignment)
+{
+        uint32_t offset = align(cl_offset(cl), alignment);
+
+        if (offset + space > cl->size) {
+                /* Does not fit: start over in a fresh BO. */
+                vc5_bo_unreference(&cl->bo);
+                cl->bo = vc5_bo_alloc(cl->job->vc5->screen,
+                                      align(space, 4096), "CL");
+                cl->base = vc5_bo_map(cl->bo);
+                cl->size = cl->bo->size;
+                cl->next = cl->base;
+
+                return 0;
+        }
+
+        cl->next = cl->base + offset;
+        return offset;
+}
+
+/**
+ * Makes sure "space" bytes plus a BRANCH packet fit in the current CL BO.
+ *
+ * When they do not, a new 4096-byte BO is allocated.  If the CL already has
+ * a BO, a BRANCH to the new BO is emitted at the current position and the
+ * old BO's reference is dropped; otherwise the new BO is the first one of
+ * this CL and is added to the job directly.  The CL then continues writing
+ * at the start of the new BO.
+ */
+void
+vc5_cl_ensure_space_with_branch(struct vc5_cl *cl, uint32_t space)
+{
+        if (cl_offset(cl) + space + cl_packet_length(BRANCH) <= cl->size)
+                return;
+
+        struct vc5_bo *new_bo = vc5_bo_alloc(cl->job->vc5->screen, 4096, "CL");
+        assert(space <= new_bo->size);
+
+        /* Chain to the new BO from the old one. */
+        if (cl->bo) {
+                cl_emit(cl, BRANCH, branch) {
+                        branch.address = cl_address(new_bo, 0);
+                }
+                vc5_bo_unreference(&cl->bo);
+        } else {
+                /* Root the first RCL/BCL BO in the job.  cl->bo is NULL on
+                 * this path, so it is new_bo that has to be added.
+                 */
+                vc5_job_add_bo(cl->job, new_bo);
+        }
+
+        cl->bo = new_bo;
+        cl->base = vc5_bo_map(cl->bo);
+        cl->size = cl->bo->size;
+        cl->next = cl->base;
+}
+
+/** Releases the CL's reference on its BO (leaving cl->bo NULL). */
+void
+vc5_destroy_cl(struct vc5_cl *cl)
+{
+        vc5_bo_unreference(&cl->bo);
+}
diff --git a/src/gallium/drivers/v3d/v3d_cl.h b/src/gallium/drivers/v3d/v3d_cl.h
new file mode 100644
index 00000000000..7025b5a672b
--- /dev/null
+++ b/src/gallium/drivers/v3d/v3d_cl.h
@@ -0,0 +1,279 @@
+/*
+ * Copyright © 2014-2017 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef VC5_CL_H
+#define VC5_CL_H
+
+#include <stdint.h>
+
+#include "util/u_math.h"
+#include "util/macros.h"
+
+struct vc5_bo;
+struct vc5_job;
+struct vc5_cl;
+
+/**
+ * Undefined structure, used for typechecking that you're passing the pointers
+ * to these functions correctly.
+ */
+struct vc5_cl_out;
+
+/** A reference to a BO used in the CL packing functions */
+struct vc5_cl_reloc {
+        /** BO being referenced; may be NULL, in which case only offset is used. */
+        struct vc5_bo *bo;
+        /** Offset added to the BO's own offset when the address is packed. */
+        uint32_t offset;
+};
+
+/* Declared early because __gen_emit_reloc below names it; defined at the
+ * end of this header.
+ */
+static inline void cl_pack_emit_reloc(struct vc5_cl *cl, const struct vc5_cl_reloc *);
+
+/* Hooks naming the user-data type, the address type, the address-to-value
+ * conversion and the reloc callback (presumably consumed by the generated
+ * pack headers -- confirm against broadcom/cle/v3dx_pack.h).
+ */
+#define __gen_user_data struct vc5_cl
+#define __gen_address_type struct vc5_cl_reloc
+#define __gen_address_offset(reloc) (((reloc)->bo ? (reloc)->bo->offset : 0) + \
+                                     (reloc)->offset)
+#define __gen_emit_reloc cl_pack_emit_reloc
+
+/** A command list being built: a write cursor into a mapped BO. */
+struct vc5_cl {
+        /** CPU address of the start of the current BO's mapping. */
+        void *base;
+        /** Job that BOs referenced from this CL are added to. */
+        struct vc5_job *job;
+        /** Current write position within the mapping. */
+        struct vc5_cl_out *next;
+        /** BO currently backing the CL, or NULL before the first allocation. */
+        struct vc5_bo *bo;
+        /** Size in bytes of the current BO. */
+        uint32_t size;
+};
+
+/* Resets a CL to the empty state and binds it to a job. */
+void vc5_init_cl(struct vc5_job *job, struct vc5_cl *cl);
+/* Drops the CL's reference on its BO. */
+void vc5_destroy_cl(struct vc5_cl *cl);
+void vc5_dump_cl(void *cl, uint32_t size, bool is_render);
+uint32_t vc5_gem_hindex(struct vc5_job *job, struct vc5_bo *bo);
+
+/* Packed single-member wrappers used by put_unaligned_16/32 to store values
+ * at addresses that may not be naturally aligned.
+ */
+struct PACKED unaligned_16 { uint16_t x; };
+struct PACKED unaligned_32 { uint32_t x; };
+
+/** Returns the byte distance from the start of the CL to its write pointer. */
+static inline uint32_t cl_offset(struct vc5_cl *cl)
+{
+        char *start = (char *)cl->base;
+        char *cursor = (char *)cl->next;
+
+        return cursor - start;
+}
+
+/** Returns a reloc describing the CL's current write position in its BO. */
+static inline struct vc5_cl_reloc cl_get_address(struct vc5_cl *cl)
+{
+        struct vc5_cl_reloc here = {
+                .bo = cl->bo,
+                .offset = cl_offset(cl),
+        };
+
+        return here;
+}
+
+/** Moves the CL output pointer forward by n bytes. */
+static inline void
+cl_advance(struct vc5_cl_out **cl, uint32_t n)
+{
+        char *bytes = (char *)(*cl);
+
+        *cl = (struct vc5_cl_out *)(bytes + n);
+}
+
+/** Returns the CL's current write pointer, to be advanced by the caller and
+ * handed back through cl_end().
+ */
+static inline struct vc5_cl_out *
+cl_start(struct vc5_cl *cl)
+{
+        return cl->next;
+}
+
+/** Stores the updated write pointer back into the CL and asserts that it has
+ * not moved past the end of the CL's storage.
+ */
+static inline void
+cl_end(struct vc5_cl *cl, struct vc5_cl_out *next)
+{
+        cl->next = next;
+        assert(cl_offset(cl) <= cl->size);
+}
+
+
+/** Stores a 32-bit value at ptr via the packed unaligned_32 wrapper. */
+static inline void
+put_unaligned_32(struct vc5_cl_out *ptr, uint32_t val)
+{
+        struct unaligned_32 *dst = (void *)ptr;
+
+        dst->x = val;
+}
+
+/** Stores a 16-bit value at ptr via the packed unaligned_16 wrapper. */
+static inline void
+put_unaligned_16(struct vc5_cl_out *ptr, uint16_t val)
+{
+        struct unaligned_16 *dst = (void *)ptr;
+
+        dst->x = val;
+}
+
+/** Writes one byte at the output pointer and advances it by 1. */
+static inline void
+cl_u8(struct vc5_cl_out **cl, uint8_t n)
+{
+        uint8_t *dst = (uint8_t *)(*cl);
+
+        *dst = n;
+        cl_advance(cl, 1);
+}
+
+/** Writes a 16-bit value (alignment not required) and advances by 2. */
+static inline void
+cl_u16(struct vc5_cl_out **cl, uint16_t n)
+{
+        struct vc5_cl_out *dst = *cl;
+
+        put_unaligned_16(dst, n);
+        cl_advance(cl, 2);
+}
+
+/** Writes a 32-bit value (alignment not required) and advances by 4. */
+static inline void
+cl_u32(struct vc5_cl_out **cl, uint32_t n)
+{
+        struct vc5_cl_out *dst = *cl;
+
+        put_unaligned_32(dst, n);
+        cl_advance(cl, 4);
+}
+
+/** Writes a 32-bit value with a plain uint32_t store and advances by 4. */
+static inline void
+cl_aligned_u32(struct vc5_cl_out **cl, uint32_t n)
+{
+        uint32_t *dst = (uint32_t *)(*cl);
+
+        *dst = n;
+        cl_advance(cl, 4);
+}
+
+/**
+ * Writes the address of (bo + offset) as an aligned 32-bit value and adds
+ * the BO to the CL's job.
+ */
+static inline void
+cl_aligned_reloc(struct vc5_cl *cl,
+                 struct vc5_cl_out **cl_out,
+                 struct vc5_bo *bo, uint32_t offset)
+{
+        uint32_t address = bo->offset + offset;
+
+        cl_aligned_u32(cl_out, address);
+        vc5_job_add_bo(cl->job, bo);
+}
+
+/** Writes a host pointer at the output pointer and advances past it. */
+static inline void
+cl_ptr(struct vc5_cl_out **cl, void *ptr)
+{
+        struct vc5_cl_out **slot = (struct vc5_cl_out **)(*cl);
+
+        *slot = ptr;
+        cl_advance(cl, sizeof(void *));
+}
+
+/** Writes a float as the 32-bit value given by fui(), via cl_u32(). */
+static inline void
+cl_f(struct vc5_cl_out **cl, float f)
+{
+        uint32_t bits = fui(f);
+
+        cl_u32(cl, bits);
+}
+
+/** Writes a float as the 32-bit value given by fui(), via cl_aligned_u32(). */
+static inline void
+cl_aligned_f(struct vc5_cl_out **cl, float f)
+{
+        uint32_t bits = fui(f);
+
+        cl_aligned_u32(cl, bits);
+}
+
+/**
+ * Builds the (BO, offset) pair that the pack functions take for address
+ * fields.
+ */
+static inline struct vc5_cl_reloc
+cl_address(struct vc5_bo *bo, uint32_t offset)
+{
+        return (struct vc5_cl_reloc){
+                .bo = bo,
+                .offset = offset,
+        };
+}
+
+/* Ensures room for "size" bytes at the given alignment; returns the offset
+ * at which to write (0 when a new BO was started).
+ */
+uint32_t vc5_cl_ensure_space(struct vc5_cl *cl, uint32_t size, uint32_t align);
+/* Ensures room for "size" bytes plus a BRANCH packet, chaining to a new BO
+ * when the current one is too small.
+ */
+void vc5_cl_ensure_space_with_branch(struct vc5_cl *cl, uint32_t size);
+
+/* Map a packet name to its V3DX()-wrapped header initializer, length
+ * constant, pack function and unpacked struct tag.
+ */
+#define cl_packet_header(packet) V3DX(packet ## _header)
+#define cl_packet_length(packet) V3DX(packet ## _length)
+#define cl_packet_pack(packet)   V3DX(packet ## _pack)
+#define cl_packet_struct(packet) V3DX(packet)
+
+/**
+ * Reserves "size" bytes at the output pointer: returns the old position and
+ * advances the pointer past the reserved region.
+ */
+static inline void *
+cl_get_emit_space(struct vc5_cl_out **cl, size_t size)
+{
+        void *reserved = *cl;
+
+        cl_advance(cl, size);
+
+        return reserved;
+}
+
+/* Macro for setting up an emit of a CL struct.  A temporary unpacked struct
+ * named by the third argument is created and initialized with the packet's
+ * header; you get to set its fields in the block that follows:
+ *
+ * cl_emit(bcl, FLAT_SHADE_FLAGS, flags) {
+ *     flags.flat_shade_flags = 1 << 2;
+ * }
+ *
+ * or default values only can be emitted with just:
+ *
+ * cl_emit(bcl, FLAT_SHADE_FLAGS, flags);
+ *
+ * The trick here is that we make a for loop that will execute the body
+ * (either the block or the ';' after the macro invocation) exactly once.
+ * The loop's increment expression then packs the struct at the CL's write
+ * pointer, advances it by the packet length, and clears _loop_terminate so
+ * the loop ends.
+ */
+#define cl_emit(cl, packet, name)                                \
+        for (struct cl_packet_struct(packet) name = {            \
+                cl_packet_header(packet)                         \
+        },                                                       \
+        *_loop_terminate = &name;                                \
+        __builtin_expect(_loop_terminate != NULL, 1);            \
+        ({                                                       \
+                struct vc5_cl_out *cl_out = cl_start(cl);        \
+                cl_packet_pack(packet)(cl, (uint8_t *)cl_out, &name); \
+                cl_advance(&cl_out, cl_packet_length(packet));   \
+                cl_end(cl, cl_out);                              \
+                _loop_terminate = NULL;                          \
+        }))                                                      \
+
+/* Like cl_emit(), but the packet is first packed into a temporary buffer and
+ * each byte is ORed with the corresponding byte of "prepacked" before being
+ * written to the CL.  "prepacked" must provide at least
+ * cl_packet_length(packet) bytes.
+ */
+#define cl_emit_with_prepacked(cl, packet, prepacked, name)      \
+        for (struct cl_packet_struct(packet) name = {            \
+                cl_packet_header(packet)                         \
+        },                                                       \
+        *_loop_terminate = &name;                                \
+        __builtin_expect(_loop_terminate != NULL, 1);            \
+        ({                                                       \
+                struct vc5_cl_out *cl_out = cl_start(cl);        \
+                uint8_t packed[cl_packet_length(packet)];         \
+                cl_packet_pack(packet)(cl, packed, &name);       \
+                for (int _i = 0; _i < cl_packet_length(packet); _i++) \
+                        ((uint8_t *)cl_out)[_i] = packed[_i] | (prepacked)[_i]; \
+                cl_advance(&cl_out, cl_packet_length(packet));   \
+                cl_end(cl, cl_out);                              \
+                _loop_terminate = NULL;                          \
+        }))                                                      \
+
+/* Copies an already-packed packet into the CL at its write pointer and
+ * advances the pointer.  "packet" is a pointer to the packed bytes; the
+ * number of bytes copied is sizeof(*(packet)).  The argument is
+ * parenthesized so that expressions such as "&foo" or "p + 1" expand
+ * correctly.  No bounds check is made against the CL's size.
+ */
+#define cl_emit_prepacked(cl, packet) do {                       \
+        memcpy((cl)->next, (packet), sizeof(*(packet)));         \
+        cl_advance(&(cl)->next, sizeof(*(packet)));              \
+} while (0)
+
+/* Same for-loop construction as cl_emit(), but packs into the caller's
+ * buffer "packed" instead of a CL: the pack function is called with a NULL
+ * CL, and nothing is advanced.  Under Valgrind the packed bytes are checked
+ * for being fully defined.
+ */
+#define v3dx_pack(packed, packet, name)                          \
+        for (struct cl_packet_struct(packet) name = {            \
+                cl_packet_header(packet)                         \
+        },                                                       \
+        *_loop_terminate = &name;                                \
+        __builtin_expect(_loop_terminate != NULL, 1);            \
+        ({                                                       \
+                cl_packet_pack(packet)(NULL, (uint8_t *)packed, &name); \
+                VG(VALGRIND_CHECK_MEM_IS_DEFINED((uint8_t *)packed, \
+                                                 cl_packet_length(packet))); \
+                _loop_terminate = NULL;                          \
+        }))                                                      \
+
+/**
+ * Reloc hook invoked by the XML-generated pack functions when they fill in
+ * an address field.
+ *
+ * Since we have a private address space as of VC5, our BOs can have lifelong
+ * offsets, and all the kernel needs to know is which BOs need to be paged in
+ * for this exec.  So the only work here is adding the BO, if there is one,
+ * to the CL's job.
+ */
+static inline void
+cl_pack_emit_reloc(struct vc5_cl *cl, const struct vc5_cl_reloc *reloc)
+{
+        if (!reloc->bo)
+                return;
+
+        vc5_job_add_bo(cl->job, reloc->bo);
+}
+
+#endif /* VC5_CL_H */
diff --git a/src/gallium/drivers/v3d/v3d_context.c b/src/gallium/drivers/v3d/v3d_context.c
new file mode 100644
index 00000000000..cb37eba3841
--- /dev/null
+++ b/src/gallium/drivers/v3d/v3d_context.c
@@ -0,0 +1,183 @@
+/*
+ * Copyright © 2014-2017 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <xf86drm.h>
+#include <err.h>
+
+#include "pipe/p_defines.h"
+#include "util/hash_table.h"
+#include "util/ralloc.h"
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+#include "util/u_blitter.h"
+#include "util/u_upload_mgr.h"
+#include "indices/u_primconvert.h"
+#include "pipe/p_screen.h"
+
+#include "v3d_screen.h"
+#include "v3d_context.h"
+#include "v3d_resource.h"
+
+/** Submits every job currently in the context's job table. */
+void
+vc5_flush(struct pipe_context *pctx)
+{
+        struct vc5_context *vc5 = vc5_context(pctx);
+        struct hash_entry *entry;
+
+        hash_table_foreach(vc5->jobs, entry)
+                vc5_job_submit(vc5, entry->data);
+}
+
+/**
+ * pipe_context::flush hook: submits all jobs and, when the caller asks for
+ * one, replaces *fence with a newly created fence.
+ */
+static void
+vc5_pipe_flush(struct pipe_context *pctx, struct pipe_fence_handle **fence,
+               unsigned flags)
+{
+        struct vc5_context *vc5 = vc5_context(pctx);
+
+        vc5_flush(pctx);
+
+        if (!fence)
+                return;
+
+        struct pipe_screen *screen = pctx->screen;
+        struct vc5_fence *new_fence = vc5_fence_create(vc5);
+
+        /* Drop whatever fence the caller passed in before overwriting it. */
+        screen->fence_reference(screen, fence, NULL);
+        *fence = (struct pipe_fence_handle *)new_fence;
+}
+
+/**
+ * pipe_context::invalidate_resource hook.
+ *
+ * Clears the resource's initialized_buffers mask and, if a job in
+ * write_jobs is keyed on the resource and uses it as its depth/stencil
+ * buffer, removes depth and stencil from that job's resolve mask.
+ */
+static void
+vc5_invalidate_resource(struct pipe_context *pctx, struct pipe_resource *prsc)
+{
+        struct vc5_context *vc5 = vc5_context(pctx);
+        struct vc5_resource *rsc = vc5_resource(prsc);
+
+        rsc->initialized_buffers = 0;
+
+        struct hash_entry *entry = _mesa_hash_table_search(vc5->write_jobs,
+                                                           prsc);
+        if (entry) {
+                struct vc5_job *job = entry->data;
+
+                if (job->key.zsbuf && job->key.zsbuf->texture == prsc) {
+                        job->resolve &= ~(PIPE_CLEAR_DEPTH |
+                                          PIPE_CLEAR_STENCIL);
+                }
+        }
+}
+
+/**
+ * pipe_context::destroy hook: flushes outstanding jobs, tears down the
+ * helper objects that were created, and frees the context.
+ *
+ * Also used as the failure path of vc5_context_create(), which is why the
+ * blitter, primconvert and uploader are each checked for NULL first.
+ */
+static void
+vc5_context_destroy(struct pipe_context *pctx)
+{
+        struct vc5_context *vc5 = vc5_context(pctx);
+
+        vc5_flush(pctx);
+
+        if (vc5->blitter)
+                util_blitter_destroy(vc5->blitter);
+
+        if (vc5->primconvert)
+                util_primconvert_destroy(vc5->primconvert);
+
+        if (vc5->uploader)
+                u_upload_destroy(vc5->uploader);
+
+        slab_destroy_child(&vc5->transfer_pool);
+
+        /* NOTE(review): only cbufs[0] is unreferenced here -- confirm no
+         * other color buffer slots can hold references at this point.
+         */
+        pipe_surface_reference(&vc5->framebuffer.cbufs[0], NULL);
+        pipe_surface_reference(&vc5->framebuffer.zsbuf, NULL);
+
+        vc5_program_fini(pctx);
+
+        ralloc_free(vc5);
+}
+
+/**
+ * Creates a vc5 pipe_context on the given screen.
+ *
+ * V3D_DEBUG_SHADERDB is cleared in the global V3D_DEBUG while the context is
+ * being set up and restored before returning, on success and on every
+ * failure path alike.
+ *
+ * Returns the new context, or NULL if allocation, syncobj creation, or
+ * creation of the blitter or primconvert helper fails.
+ */
+struct pipe_context *
+vc5_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags)
+{
+        struct vc5_screen *screen = vc5_screen(pscreen);
+        struct vc5_context *vc5;
+        struct pipe_context *pctx;
+        int ret;
+
+        /* Prevent dumping of the shaders built during context setup. */
+        uint32_t saved_shaderdb_flag = V3D_DEBUG & V3D_DEBUG_SHADERDB;
+        V3D_DEBUG &= ~V3D_DEBUG_SHADERDB;
+
+        vc5 = rzalloc(NULL, struct vc5_context);
+        if (!vc5)
+                goto fail_restore_debug;
+        pctx = &vc5->base;
+
+        vc5->screen = screen;
+
+        ret = drmSyncobjCreate(screen->fd, DRM_SYNCOBJ_CREATE_SIGNALED,
+                               &vc5->out_sync);
+        if (ret) {
+                ralloc_free(vc5);
+                goto fail_restore_debug;
+        }
+
+        pctx->screen = pscreen;
+        pctx->priv = priv;
+        pctx->destroy = vc5_context_destroy;
+        pctx->flush = vc5_pipe_flush;
+        pctx->invalidate_resource = vc5_invalidate_resource;
+
+        if (screen->devinfo.ver >= 41) {
+                v3d41_draw_init(pctx);
+                v3d41_state_init(pctx);
+        } else {
+                v3d33_draw_init(pctx);
+                v3d33_state_init(pctx);
+        }
+        vc5_program_init(pctx);
+        vc5_query_init(pctx);
+        vc5_resource_context_init(pctx);
+
+        vc5_job_init(vc5);
+
+        vc5->fd = screen->fd;
+
+        slab_create_child(&vc5->transfer_pool, &screen->transfer_pool);
+
+        vc5->uploader = u_upload_create_default(&vc5->base);
+        vc5->base.stream_uploader = vc5->uploader;
+        vc5->base.const_uploader = vc5->uploader;
+
+        vc5->blitter = util_blitter_create(pctx);
+        if (!vc5->blitter)
+                goto fail;
+
+        vc5->primconvert = util_primconvert_create(pctx,
+                                                   (1 << PIPE_PRIM_QUADS) - 1);
+        if (!vc5->primconvert)
+                goto fail;
+
+        V3D_DEBUG |= saved_shaderdb_flag;
+
+        vc5->sample_mask = (1 << VC5_MAX_SAMPLES) - 1;
+        vc5->active_queries = true;
+
+        return &vc5->base;
+
+fail:
+        pctx->destroy(pctx);
+fail_restore_debug:
+        /* Do not leave the global shader-db flag cleared after a failure. */
+        V3D_DEBUG |= saved_shaderdb_flag;
+        return NULL;
+}
diff --git a/src/gallium/drivers/v3d/v3d_context.h b/src/gallium/drivers/v3d/v3d_context.h
new file mode 100644
index 00000000000..7c17eccd47e
--- /dev/null
+++ b/src/gallium/drivers/v3d/v3d_context.h
@@ -0,0 +1,565 @@
+/*
+ * Copyright © 2014-2017 Broadcom
+ * Copyright (C) 2012 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef VC5_CONTEXT_H
+#define VC5_CONTEXT_H
+
+#ifdef V3D_VERSION
+#include "broadcom/common/v3d_macros.h"
+#endif
+
+#include <stdio.h>
+
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+#include "util/bitset.h"
+#include "util/slab.h"
+#include "xf86drm.h"
+#include "v3d_drm.h"
+#include "v3d_screen.h"
+
+struct vc5_job;
+struct vc5_bo;
+void vc5_job_add_bo(struct vc5_job *job, struct vc5_bo *bo);
+
+#include "v3d_bufmgr.h"
+#include "v3d_resource.h"
+#include "v3d_cl.h"
+
+/* Compile-time constant telling whether the driver was built with
+ * USE_V3D_SIMULATOR.  vc5_ioctl() tests it to pick between
+ * vc5_simulator_ioctl() and drmIoctl().
+ */
+#ifdef USE_V3D_SIMULATOR
+#define using_vc5_simulator true
+#else
+#define using_vc5_simulator false
+#endif
+
+/* Dirty-state bits.  These are the values stored in the vc5_context "dirty"
+ * bitfield and in vc5_compiled_shader's uniform_dirty_bits.
+ *
+ * Bits 5, 6 and 16 are not assigned to any flag in this list.
+ */
+#define VC5_DIRTY_BLEND         (1 <<  0)
+#define VC5_DIRTY_RASTERIZER    (1 <<  1)
+#define VC5_DIRTY_ZSA           (1 <<  2)
+#define VC5_DIRTY_FRAGTEX       (1 <<  3)
+#define VC5_DIRTY_VERTTEX       (1 <<  4)
+
+#define VC5_DIRTY_BLEND_COLOR   (1 <<  7)
+#define VC5_DIRTY_STENCIL_REF   (1 <<  8)
+#define VC5_DIRTY_SAMPLE_MASK   (1 <<  9)
+#define VC5_DIRTY_FRAMEBUFFER   (1 << 10)
+#define VC5_DIRTY_STIPPLE       (1 << 11)
+#define VC5_DIRTY_VIEWPORT      (1 << 12)
+#define VC5_DIRTY_CONSTBUF      (1 << 13)
+#define VC5_DIRTY_VTXSTATE      (1 << 14)
+#define VC5_DIRTY_VTXBUF        (1 << 15)
+#define VC5_DIRTY_SCISSOR       (1 << 17)
+#define VC5_DIRTY_FLAT_SHADE_FLAGS (1 << 18)
+#define VC5_DIRTY_PRIM_MODE     (1 << 19)
+#define VC5_DIRTY_CLIP          (1 << 20)
+#define VC5_DIRTY_UNCOMPILED_VS (1 << 21)
+#define VC5_DIRTY_UNCOMPILED_FS (1 << 22)
+#define VC5_DIRTY_COMPILED_CS   (1 << 23)
+#define VC5_DIRTY_COMPILED_VS   (1 << 24)
+#define VC5_DIRTY_COMPILED_FS   (1 << 25)
+#define VC5_DIRTY_FS_INPUTS     (1 << 26)
+#define VC5_DIRTY_STREAMOUT     (1 << 27)
+#define VC5_DIRTY_OQ            (1 << 28)
+#define VC5_DIRTY_CENTROID_FLAGS (1 << 29)
+
+#define VC5_MAX_FS_INPUTS 64
+
+/* Driver wrapper around a gallium pipe_sampler_view.  "base" is the first
+ * member, which is what lets vc5_sampler_view() cast a pipe_sampler_view
+ * pointer to this type.
+ */
+struct vc5_sampler_view {
+        struct pipe_sampler_view base;
+        /* NOTE(review): p0/p1 are not described in this header; presumably
+         * precomputed texture parameter words -- confirm against the users.
+         */
+        uint32_t p0;
+        uint32_t p1;
+        /* Precomputed swizzles to pass in to the shader key. */
+        uint8_t swizzle[4];
+
+        /* NOTE(review): presumably the V3D 3.x packed texture state, like the
+         * identically named field of struct vc5_sampler_state -- confirm.
+         */
+        uint8_t texture_shader_state[32];
+        /* V3D 4.x: Texture state struct. */
+        struct vc5_bo *bo;
+};
+
+/* Driver wrapper around a gallium pipe_sampler_state.  "base" is the first
+ * member, which is what lets vc5_sampler_state() cast a pipe_sampler_state
+ * pointer to this type.
+ */
+struct vc5_sampler_state {
+        struct pipe_sampler_state base;
+        /* NOTE(review): p0/p1 are not described in this header; presumably
+         * precomputed sampler parameter words -- confirm against the users.
+         */
+        uint32_t p0;
+        uint32_t p1;
+
+        /* V3D 3.x: Packed texture state. */
+        uint8_t texture_shader_state[32];
+        /* V3D 4.x: Sampler state struct. */
+        struct vc5_bo *bo;
+};
+
+/* A set of bound sampler views and sampler states.  struct vc5_context holds
+ * two of these, "verttex" and "fragtex".
+ */
+struct vc5_texture_stateobj {
+        struct pipe_sampler_view *textures[PIPE_MAX_SAMPLERS];
+        unsigned num_textures;
+        struct pipe_sampler_state *samplers[PIPE_MAX_SAMPLERS];
+        unsigned num_samplers;
+        /* NOTE(review): one reloc per sampler slot; presumably the address of
+         * the texture state emitted for that slot -- confirm.
+         */
+        struct vc5_cl_reloc texture_state[PIPE_MAX_SAMPLERS];
+};
+
+/* Description of a shader's uniform stream.
+ *
+ * NOTE(review): presumably "contents" and "data" are parallel arrays of
+ * "count" entries -- confirm against the code that fills this in.
+ */
+struct vc5_shader_uniform_info {
+        enum quniform_contents *contents;
+        uint32_t *data;
+        uint32_t count;
+};
+
+/* A shader as handed to the driver, before any variant is compiled.  This is
+ * the type of the bind_vs/bind_fs pointers in struct vc5_program_stateobj.
+ */
+struct vc5_uncompiled_shader {
+        /** A name for this program, so you can track it in shader-db output. */
+        uint32_t program_id;
+        /** How many variants of this program were compiled, for shader-db. */
+        uint32_t compiled_variant_count;
+        struct pipe_shader_state base;
+        /* NOTE(review): the tf_* fields look like transform feedback output
+         * bookkeeping (num_tf_outputs presumably counts tf_outputs, and
+         * num_tf_specs the used entries of tf_specs) -- confirm.
+         */
+        uint32_t num_tf_outputs;
+        struct v3d_varying_slot *tf_outputs;
+        uint16_t tf_specs[16];
+        uint16_t tf_specs_psiz[16];
+        uint32_t num_tf_specs;
+
+        /**
+         * Flag for if the NIR in this shader originally came from TGSI.  If
+         * so, we need to do some fixups at compile time, due to missing
+         * information in TGSI that exists in NIR.
+         */
+        bool was_tgsi;
+};
+
+/* One compiled shader variant.  This is the type of the cs/vs/fs pointers in
+ * struct vc5_program_stateobj.
+ */
+struct vc5_compiled_shader {
+        /* NOTE(review): presumably the BO holding the compiled code --
+         * confirm.
+         */
+        struct vc5_bo *bo;
+
+        /* Compiler output for this variant.  The members of the union are
+         * alternative typed views of the same pointer.
+         */
+        union {
+                struct v3d_prog_data *base;
+                struct v3d_vs_prog_data *vs;
+                struct v3d_fs_prog_data *fs;
+        } prog_data;
+
+        /**
+         * VC5_DIRTY_* flags that, when set in vc5->dirty, mean that the
+         * uniforms have to be rewritten (and therefore the shader state
+         * reemitted).
+         */
+        uint32_t uniform_dirty_bits;
+};
+
+/* Shader program state of a context (the "prog" member of
+ * struct vc5_context).
+ */
+struct vc5_program_stateobj {
+        /* Uncompiled vertex/fragment shaders. */
+        struct vc5_uncompiled_shader *bind_vs, *bind_fs;
+        /* Compiled shader variants. */
+        struct vc5_compiled_shader *cs, *vs, *fs;
+
+        /* NOTE(review): presumably scratch memory for register spilling,
+         * sized from spill_size_per_thread -- confirm.
+         */
+        struct vc5_bo *spill_bo;
+        int spill_size_per_thread;
+};
+
+/* Constant buffer bindings.  struct vc5_context keeps one of these per
+ * shader type (constbuf[PIPE_SHADER_TYPES]).
+ */
+struct vc5_constbuf_stateobj {
+        struct pipe_constant_buffer cb[PIPE_MAX_CONSTANT_BUFFERS];
+        /* NOTE(review): presumably one bit per cb[] slot -- confirm. */
+        uint32_t enabled_mask;
+        uint32_t dirty_mask;
+};
+
+/* Vertex buffer bindings (the "vertexbuf" member of struct vc5_context). */
+struct vc5_vertexbuf_stateobj {
+        struct pipe_vertex_buffer vb[PIPE_MAX_ATTRIBS];
+        unsigned count;
+        /* NOTE(review): presumably one bit per vb[] slot -- confirm. */
+        uint32_t enabled_mask;
+        uint32_t dirty_mask;
+};
+
+/* Vertex element state (pointed to by the "vtx" member of
+ * struct vc5_context).
+ */
+struct vc5_vertex_stateobj {
+        struct pipe_vertex_element pipe[VC5_MAX_ATTRIBUTES];
+        unsigned num_elements;
+
+        /* 12 bytes of storage per attribute.
+         * NOTE(review): presumably pre-packed per-attribute hardware records
+         * -- confirm.
+         */
+        uint8_t attrs[12 * VC5_MAX_ATTRIBUTES];
+        struct vc5_bo *default_attribute_values;
+};
+
+/* Stream output targets (the "streamout" member of struct vc5_context). */
+struct vc5_streamout_stateobj {
+        struct pipe_stream_output_target *targets[PIPE_MAX_SO_BUFFERS];
+        unsigned num_targets;
+};
+
+/* Hash table key for vc5->jobs: the color and depth/stencil surfaces being
+ * rendered to.  The members mirror the cbufs/zsbuf fields of struct vc5_job.
+ */
+struct vc5_job_key {
+        struct pipe_surface *cbufs[4];
+        struct pipe_surface *zsbuf;
+};
+
+/* EZ state of a job (ez_state/first_ez_state in struct vc5_job) or of a
+ * depth/stencil/alpha state object.  UNDECIDED is deliberately 0, so it is
+ * the value of zero-initialized state.
+ *
+ * NOTE(review): "EZ" presumably stands for early-Z, with GT_GE/LT_LE naming
+ * the depth test direction -- confirm.
+ */
+enum vc5_ez_state {
+        VC5_EZ_UNDECIDED = 0,
+        VC5_EZ_GT_GE,
+        VC5_EZ_LT_LE,
+        VC5_EZ_DISABLED,
+};
+
+/**
+ * A complete bin/render job.
+ *
+ * This is all of the state necessary to submit a bin/render to the kernel.
+ * We want to be able to have multiple in progress at a time, so that we don't
+ * need to flush an existing CL just to switch to rendering to a new render
+ * target (which would mean reading back from the old render target when
+ * starting to render to it again).
+ */
+struct vc5_job {
+        /** Context this job belongs to. */
+        struct vc5_context *vc5;
+        /* NOTE(review): presumably the binner CL, the render CL and a CL for
+         * indirectly referenced data, respectively -- confirm.
+         */
+        struct vc5_cl bcl;
+        struct vc5_cl rcl;
+        struct vc5_cl indirect;
+        struct vc5_bo *tile_alloc;
+        struct vc5_bo *tile_state;
+        uint32_t shader_rec_count;
+
+        /** Argument block for the kernel's submit-CL request. */
+        struct drm_v3d_submit_cl submit;
+
+        /**
+         * Set of all BOs referenced by the job.  This will be used for making
+         * the list of BOs that the kernel will need to have paged in to
+         * execute our job.
+         */
+        struct set *bos;
+
+        /** Sum of the sizes of the BOs referenced by the job. */
+        uint32_t referenced_size;
+
+        /* NOTE(review): presumably the set of pipe_resources this job writes
+         * (see vc5_job_add_write_resource()) -- confirm.
+         */
+        struct set *write_prscs;
+
+        /* Size of the submit.bo_handles array. */
+        uint32_t bo_handles_size;
+
+        /** @{ Surfaces to submit rendering for. */
+        struct pipe_surface *cbufs[4];
+        struct pipe_surface *zsbuf;
+        /** @} */
+        /** @{
+         * Bounding box of the scissor across all queued drawing.
+         *
+         * Note that the max values are exclusive.
+         */
+        uint32_t draw_min_x;
+        uint32_t draw_min_y;
+        uint32_t draw_max_x;
+        uint32_t draw_max_y;
+        /** @} */
+        /** @{
+         * Width/height of the color framebuffer being rendered to,
+         * for VC5_TILE_RENDERING_MODE_CONFIG.
+         */
+        uint32_t draw_width;
+        uint32_t draw_height;
+        /** @} */
+        /** @{ Tile information, depending on MSAA and float color buffer. */
+        uint32_t draw_tiles_x; /** @< Number of tiles wide for framebuffer. */
+        uint32_t draw_tiles_y; /** @< Number of tiles high for framebuffer. */
+
+        uint32_t tile_width; /** @< Width of a tile. */
+        uint32_t tile_height; /** @< Height of a tile. */
+        /** maximum internal_bpp of all color render targets. */
+        uint32_t internal_bpp;
+
+        /** Whether the current rendering is in a 4X MSAA tile buffer. */
+        bool msaa;
+        /** @} */
+
+        /* Bitmask of PIPE_CLEAR_* of buffers that were cleared before the
+         * first rendering.
+         */
+        uint32_t cleared;
+        /* Bitmask of PIPE_CLEAR_* of buffers that have been rendered to
+         * (either clears or draws).
+         */
+        uint32_t resolve;
+        /* Clear values; clear_color holds four words for each of the four
+         * color buffers.
+         */
+        uint32_t clear_color[4][4];
+        float clear_z;
+        uint8_t clear_s;
+
+        /**
+         * Set if some drawing (triangles, blits, or just a glClear()) has
+         * been done to the FBO, meaning that we need to
+         * DRM_IOCTL_VC5_SUBMIT_CL.
+         */
+        bool needs_flush;
+
+        /**
+         * Set if there is a nonzero address for OCCLUSION_QUERY_COUNTER.  If
+         * so, we need to disable it and flush before ending the CL, to keep
+         * the next tile from starting with it enabled.
+         */
+        bool oq_enabled;
+
+        /**
+         * Set when a packet enabling TF on all further primitives has been
+         * emitted.
+         */
+        bool tf_enabled;
+
+        /**
+         * Current EZ state for drawing. Updated at the start of draw after
+         * we've decided on the shader being rendered.
+         */
+        enum vc5_ez_state ez_state;
+        /**
+         * The first EZ state that was used for drawing with a decided EZ
+         * direction (so either UNDECIDED, GT, or LT).
+         */
+        enum vc5_ez_state first_ez_state;
+
+        /**
+         * Number of draw calls (not counting full buffer clears) queued in
+         * the current job.
+         */
+        uint32_t draw_calls_queued;
+
+        /** Key for this job in the vc5->jobs hash table. */
+        struct vc5_job_key key;
+};
+
+/* Driver context.  "base" must stay the first member: vc5_context() converts
+ * a pipe_context pointer to this type with a plain cast.
+ */
+struct vc5_context {
+        struct pipe_context base;
+
+        /** DRM file descriptor, copied from the screen at context creation. */
+        int fd;
+        struct vc5_screen *screen;
+
+        /** The 3D rendering job for the currently bound FBO. */
+        struct vc5_job *job;
+
+        /* Map from struct vc5_job_key to the job for that FBO.
+         */
+        struct hash_table *jobs;
+
+        /**
+         * Map from vc5_resource to a job writing to that resource.
+         *
+         * Primarily for flushing jobs rendering to textures that are now
+         * being read from.
+         */
+        struct hash_table *write_jobs;
+
+        /* Child of the screen's transfer_pool slab. */
+        struct slab_child_pool transfer_pool;
+        struct blitter_context *blitter;
+
+        /** bitfield of VC5_DIRTY_* */
+        uint32_t dirty;
+
+        struct primconvert_context *primconvert;
+
+        /* NOTE(review): presumably caches of compiled fragment/vertex shader
+         * variants, with the two counters handing out program IDs --
+         * confirm.
+         */
+        struct hash_table *fs_cache, *vs_cache;
+        uint32_t next_uncompiled_program_id;
+        uint64_t next_compiled_program_id;
+
+        struct vc5_compiler_state *compiler_state;
+
+        uint8_t prim_mode;
+
+        /** Maximum index buffer valid for the current shader_rec. */
+        uint32_t max_index;
+
+        /** Sync object that our RCL will update as its out_sync. */
+        uint32_t out_sync;
+
+        /* Upload manager; context creation also installs it as both
+         * base.stream_uploader and base.const_uploader.
+         */
+        struct u_upload_mgr *uploader;
+
+        /** @{ Current pipeline state objects */
+        struct pipe_scissor_state scissor;
+        struct pipe_blend_state *blend;
+        struct vc5_rasterizer_state *rasterizer;
+        struct vc5_depth_stencil_alpha_state *zsa;
+
+        struct vc5_texture_stateobj verttex, fragtex;
+
+        struct vc5_program_stateobj prog;
+
+        struct vc5_vertex_stateobj *vtx;
+
+        /* Blend color, kept both as the gallium state ("f") and as four
+         * 16-bit values ("hf").
+         * NOTE(review): presumably hf is the half-float conversion of f --
+         * confirm.
+         */
+        struct {
+                struct pipe_blend_color f;
+                uint16_t hf[4];
+        } blend_color;
+        struct pipe_stencil_ref stencil_ref;
+        unsigned sample_mask;
+        struct pipe_framebuffer_state framebuffer;
+
+        /* Per render target, whether we should swap the R and B fields in the
+         * shader's color output and in blending.  If render targets disagree
+         * on the R/B swap and use the constant color, then we would need to
+         * fall back to in-shader blending.
+         */
+        uint8_t swap_color_rb;
+
+        /* Per render target, whether we should treat the dst alpha values as
+         * one in blending.
+         *
+         * For RGBX formats, the tile buffer's alpha channel will be
+         * undefined.
+         */
+        uint8_t blend_dst_alpha_one;
+
+        /* Set to true at context creation. */
+        bool active_queries;
+
+        uint32_t tf_prims_generated;
+        uint32_t prims_generated;
+
+        struct pipe_poly_stipple stipple;
+        struct pipe_clip_state clip;
+        struct pipe_viewport_state viewport;
+        struct vc5_constbuf_stateobj constbuf[PIPE_SHADER_TYPES];
+        struct vc5_vertexbuf_stateobj vertexbuf;
+        struct vc5_streamout_stateobj streamout;
+        struct vc5_bo *current_oq;
+        /** @} */
+};
+
+/* Driver wrapper around a gallium pipe_rasterizer_state, carrying values
+ * precomputed from it.  This is the type of the "rasterizer" pointer in
+ * struct vc5_context.
+ */
+struct vc5_rasterizer_state {
+        struct pipe_rasterizer_state base;
+
+        /* VC5_CONFIGURATION_BITS */
+        uint8_t config_bits[3];
+
+        float point_size;
+
+        /**
+         * Half-float (1/8/7 bits) value of polygon offset units for
+         * VC5_PACKET_DEPTH_OFFSET
+         */
+        uint16_t offset_units;
+        /**
+         * Half-float (1/8/7 bits) value of polygon offset scale for
+         * VC5_PACKET_DEPTH_OFFSET
+         */
+        uint16_t offset_factor;
+};
+
+/* Driver wrapper around a gallium pipe_depth_stencil_alpha_state.  This is
+ * the type of the "zsa" pointer in struct vc5_context.
+ */
+struct vc5_depth_stencil_alpha_state {
+        struct pipe_depth_stencil_alpha_state base;
+
+        enum vc5_ez_state ez_state;
+
+        /** Uniforms for stencil state.
+         *
+         * Index 0 is either the front config, or the front-and-back config.
+         * Index 1 is the back config if doing separate back stencil.
+         * Index 2 is the writemask config if it's not a common mask value.
+         */
+        uint32_t stencil_uniforms[3];
+
+        /* NOTE(review): presumably pre-packed front/back stencil
+         * configuration packets (6 bytes each) -- confirm.
+         */
+        uint8_t stencil_front[6];
+        uint8_t stencil_back[6];
+};
+
+/* printf-style performance warning: the arguments are passed to
+ * fprintf(stderr, ...) only when the V3D_DEBUG_PERF bit is set in V3D_DEBUG.
+ * Wrapped in do { } while (0) so it behaves as a single statement.
+ */
+#define perf_debug(...) do {                            \
+        if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF))       \
+                fprintf(stderr, __VA_ARGS__);           \
+} while (0)
+
+/* Converts a gallium pipe_context pointer to the driver context that
+ * contains it.  A plain cast is enough because "base" is the first member of
+ * struct vc5_context.
+ */
+static inline struct vc5_context *
+vc5_context(struct pipe_context *pcontext)
+{
+        struct vc5_context *vc5 = (struct vc5_context *)pcontext;
+
+        return vc5;
+}
+
+/* Converts a gallium pipe_sampler_view pointer to the driver sampler view
+ * that contains it.  A plain cast is enough because "base" is the first
+ * member of struct vc5_sampler_view.
+ */
+static inline struct vc5_sampler_view *
+vc5_sampler_view(struct pipe_sampler_view *psview)
+{
+        struct vc5_sampler_view *sview = (struct vc5_sampler_view *)psview;
+
+        return sview;
+}
+
+/* Converts a gallium pipe_sampler_state pointer to the driver sampler state
+ * that contains it.  A plain cast is enough because "base" is the first
+ * member of struct vc5_sampler_state.
+ */
+static inline struct vc5_sampler_state *
+vc5_sampler_state(struct pipe_sampler_state *psampler)
+{
+        struct vc5_sampler_state *sampler =
+                (struct vc5_sampler_state *)psampler;
+
+        return sampler;
+}
+
+/* Context creation and the per-context program/query setup entry points. */
+struct pipe_context *vc5_context_create(struct pipe_screen *pscreen,
+                                        void *priv, unsigned flags);
+void vc5_program_init(struct pipe_context *pctx);
+void vc5_program_fini(struct pipe_context *pctx);
+void vc5_query_init(struct pipe_context *pctx);
+
+/* Simulator entry points.  vc5_ioctl() routes requests to
+ * vc5_simulator_ioctl() when using_vc5_simulator is true.
+ */
+void vc5_simulator_init(struct vc5_screen *screen);
+void vc5_simulator_destroy(struct vc5_screen *screen);
+int vc5_simulator_flush(struct vc5_context *vc5,
+                        struct drm_v3d_submit_cl *args,
+                        struct vc5_job *job);
+int vc5_simulator_ioctl(int fd, unsigned long request, void *arg);
+void vc5_simulator_open_from_handle(int fd, uint32_t winsys_stride,
+                                    int handle, uint32_t size);
+
+/* Issues a DRM request on fd: through vc5_simulator_ioctl() when the driver
+ * was built for the simulator (using_vc5_simulator), through drmIoctl()
+ * otherwise.  Returns whatever the chosen backend returns.
+ */
+static inline int
+vc5_ioctl(int fd, unsigned long request, void *arg)
+{
+        if (!using_vc5_simulator)
+                return drmIoctl(fd, request, arg);
+
+        return vc5_simulator_ioctl(fd, request, arg);
+}
+
+/* Shader uniform handling. */
+void vc5_set_shader_uniform_dirty_flags(struct vc5_compiled_shader *shader);
+struct vc5_cl_reloc vc5_write_uniforms(struct vc5_context *vc5,
+                                       struct vc5_compiled_shader *shader,
+                                       struct vc5_constbuf_stateobj *cb,
+                                       struct vc5_texture_stateobj *texstate);
+
+/* Flushing and job management.  Note that vc5_job_add_bo() is also declared
+ * near the top of this header, ahead of the v3d_bufmgr.h include.
+ */
+void vc5_flush(struct pipe_context *pctx);
+void vc5_job_init(struct vc5_context *vc5);
+struct vc5_job *vc5_get_job(struct vc5_context *vc5,
+                            struct pipe_surface **cbufs,
+                            struct pipe_surface *zsbuf);
+struct vc5_job *vc5_get_job_for_fbo(struct vc5_context *vc5);
+void vc5_job_add_bo(struct vc5_job *job, struct vc5_bo *bo);
+void vc5_job_add_write_resource(struct vc5_job *job, struct pipe_resource *prsc);
+void vc5_job_submit(struct vc5_context *vc5, struct vc5_job *job);
+void vc5_flush_jobs_writing_resource(struct vc5_context *vc5,
+                                     struct pipe_resource *prsc);
+void vc5_flush_jobs_reading_resource(struct vc5_context *vc5,
+                                     struct pipe_resource *prsc);
+void vc5_update_compiled_shaders(struct vc5_context *vc5, uint8_t prim_mode);
+
+/* Format queries (v3d_formats.c).  Each one dispatches on devinfo->ver to the
+ * v3d41 implementation for version 41 and up and to the v3d33 one otherwise.
+ */
+bool vc5_rt_format_supported(const struct v3d_device_info *devinfo,
+                             enum pipe_format f);
+bool vc5_tex_format_supported(const struct v3d_device_info *devinfo,
+                              enum pipe_format f);
+uint8_t vc5_get_rt_format(const struct v3d_device_info *devinfo, enum pipe_format f);
+uint8_t vc5_get_tex_format(const struct v3d_device_info *devinfo, enum pipe_format f);
+uint8_t vc5_get_tex_return_size(const struct v3d_device_info *devinfo,
+                                enum pipe_format f,
+                                enum pipe_tex_compare compare);
+uint8_t vc5_get_tex_return_channels(const struct v3d_device_info *devinfo,
+                                    enum pipe_format f);
+const uint8_t *vc5_get_format_swizzle(const struct v3d_device_info *devinfo,
+                                      enum pipe_format f);
+void vc5_get_internal_type_bpp_for_output_format(const struct v3d_device_info *devinfo,
+                                                 uint32_t format,
+                                                 uint32_t *type,
+                                                 uint32_t *bpp);
+
+/* Queries and blits. */
+void vc5_init_query_functions(struct vc5_context *vc5);
+void vc5_blit(struct pipe_context *pctx, const struct pipe_blit_info *blit_info);
+void vc5_blitter_save(struct vc5_context *vc5);
+
+/* Fences (v3d_fence.c). */
+struct vc5_fence *vc5_fence_create(struct vc5_context *vc5);
+
+/* Per-version declarations.  If the including file already defines v3dX(),
+ * v3dx_context.h is included once with that definition.  Otherwise it is
+ * included twice, first with v3dX(x) expanding to v3d33_x and then to
+ * v3d41_x, so that the prototypes of both variants are visible.
+ */
+#ifdef v3dX
+#  include "v3dx_context.h"
+#else
+#  define v3dX(x) v3d33_##x
+#  include "v3dx_context.h"
+#  undef v3dX
+
+#  define v3dX(x) v3d41_##x
+#  include "v3dx_context.h"
+#  undef v3dX
+#endif
+
+#endif /* VC5_CONTEXT_H */
diff --git a/src/gallium/drivers/v3d/v3d_fence.c b/src/gallium/drivers/v3d/v3d_fence.c
new file mode 100644
index 00000000000..54bce562403
--- /dev/null
+++ b/src/gallium/drivers/v3d/v3d_fence.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright © 2014 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file v3d_fence.c
+ *
+ * Seqno-based fence management.
+ *
+ * We have two mechanisms for waiting in our kernel API: You can wait on a BO
+ * to have all rendering to it from any process completed, or wait on a
+ * seqno for that particular seqno to be passed.  The fence API we're
+ * implementing is based on waiting for all rendering in the context to have
+ * completed (with no reference to what other processes might be doing with
+ * the same BOs), so we can just use the seqno of the last rendering we'd
+ * fired off as our fence marker.
+ */
+
+#include "util/u_inlines.h"
+
+#include "v3d_context.h"
+#include "v3d_bufmgr.h"
+
+/* A fence: a reference-counted wrapper around one DRM sync object. */
+struct vc5_fence {
+        /* Reference count; initialized to 1 by vc5_fence_create(). */
+        struct pipe_reference reference;
+        /* DRM syncobj handle; destroyed together with the fence. */
+        uint32_t sync;
+};
+
+/**
+ * pipe_screen::fence_reference hook: makes *pp refer to pf.
+ *
+ * The reference counts are updated through pipe_reference().  When it
+ * reports that the fence previously stored in *pp has to be destroyed, that
+ * fence's sync object is destroyed and the fence itself is freed.
+ */
+static void
+vc5_fence_reference(struct pipe_screen *pscreen,
+                    struct pipe_fence_handle **pp,
+                    struct pipe_fence_handle *pf)
+{
+        struct vc5_screen *screen = vc5_screen(pscreen);
+        struct vc5_fence **slot = (struct vc5_fence **)pp;
+        struct vc5_fence *incoming = (struct vc5_fence *)pf;
+        struct vc5_fence *outgoing = *slot;
+
+        if (pipe_reference(&outgoing->reference, &incoming->reference)) {
+                drmSyncobjDestroy(screen->fd, outgoing->sync);
+                free(outgoing);
+        }
+
+        *slot = incoming;
+}
+
+/**
+ * pipe_screen::fence_finish hook: waits on the fence's sync object.
+ *
+ * drmSyncobjWait() reports success as 0 and failure (including timeout) as a
+ * negative value, so its result must be converted to the boolean this hook
+ * returns rather than returned as-is.
+ *
+ * NOTE(review): timeout_ns is forwarded unchanged.  drmSyncobjWait() expects
+ * an absolute timeout while gallium passes a relative one -- confirm whether
+ * a conversion is needed here.
+ *
+ * \param pscreen     screen providing the DRM fd.
+ * \param ctx         unused.
+ * \param pf          fence to wait on (a struct vc5_fence).
+ * \param timeout_ns  timeout handed to drmSyncobjWait().
+ * \return true if the wait completed successfully, false otherwise.
+ */
+static boolean
+vc5_fence_finish(struct pipe_screen *pscreen,
+                 struct pipe_context *ctx,
+                 struct pipe_fence_handle *pf,
+                 uint64_t timeout_ns)
+{
+        struct vc5_screen *screen = vc5_screen(pscreen);
+        struct vc5_fence *f = (struct vc5_fence *)pf;
+        int ret;
+
+        ret = drmSyncobjWait(screen->fd, &f->sync, 1, timeout_ns, 0, NULL);
+
+        return ret == 0;
+}
+
+/**
+ * Creates a fence that takes over the context's current out_sync object.
+ *
+ * A new sync object, created in the signaled state, is installed as
+ * vc5->out_sync in its place.  The returned fence starts with a reference
+ * count of 1.
+ *
+ * \return the new fence, or NULL if either the allocation or the sync object
+ *         creation failed (the context is left untouched in that case).
+ */
+struct vc5_fence *
+vc5_fence_create(struct vc5_context *vc5)
+{
+        struct vc5_fence *fence = calloc(1, sizeof(struct vc5_fence));
+        uint32_t replacement_sync;
+
+        if (fence == NULL)
+                return NULL;
+
+        /* Make a new sync object for the context. */
+        if (drmSyncobjCreate(vc5->fd, DRM_SYNCOBJ_CREATE_SIGNALED,
+                             &replacement_sync) != 0) {
+                free(fence);
+                return NULL;
+        }
+
+        /* Hand the old out_sync to the fence and give the context the
+         * fresh one.
+         */
+        fence->sync = vc5->out_sync;
+        vc5->out_sync = replacement_sync;
+        pipe_reference_init(&fence->reference, 1);
+
+        return fence;
+}
+
+/* Installs this file's fence hooks (fence_finish and fence_reference) on the
+ * screen.
+ */
+void
+vc5_fence_init(struct vc5_screen *screen)
+{
+        screen->base.fence_finish = vc5_fence_finish;
+        screen->base.fence_reference = vc5_fence_reference;
+}
diff --git a/src/gallium/drivers/v3d/v3d_format_table.h b/src/gallium/drivers/v3d/v3d_format_table.h
new file mode 100644
index 00000000000..8b8011351a1
--- /dev/null
+++ b/src/gallium/drivers/v3d/v3d_format_table.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright © 2014-2018 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/* Include guard: without it, a second inclusion in the same translation unit
+ * would redefine struct vc5_format.
+ */
+#ifndef V3D_FORMAT_TABLE_H
+#define V3D_FORMAT_TABLE_H
+
+/* rt_type value of a format that cannot be used as a render target. */
+#define V3D_OUTPUT_IMAGE_FORMAT_NO 255
+
+#include <stdbool.h>
+#include <stdint.h>
+
+/* One entry of the per-version pipe_format description tables. */
+struct vc5_format {
+        /** Set if the pipe format is defined in the table. */
+        bool present;
+
+        /**
+         * One of V3D33_OUTPUT_IMAGE_FORMAT_*, or
+         * V3D_OUTPUT_IMAGE_FORMAT_NO
+         */
+        uint8_t rt_type;
+
+        /** One of V3D33_TEXTURE_DATA_FORMAT_*. */
+        uint8_t tex_type;
+
+        /**
+         * Swizzle to apply to the RGBA shader output for storing to the tile
+         * buffer, to the RGBA tile buffer to produce shader input (for
+         * blending), and for turning the rgba8888 texture sampler return
+         * value into shader rgba values.
+         */
+        uint8_t swizzle[4];
+
+        /* Whether the return value is 16F/I/UI or 32F/I/UI. */
+        uint8_t return_size;
+
+        /* If return_size == 32, how many channels are returned by texturing.
+         * 16 always returns 2 pairs of 16 bit values.
+         */
+        uint8_t return_channels;
+};
+
+#endif /* V3D_FORMAT_TABLE_H */
diff --git a/src/gallium/drivers/v3d/v3d_formats.c b/src/gallium/drivers/v3d/v3d_formats.c
new file mode 100644
index 00000000000..8424b368cf4
--- /dev/null
+++ b/src/gallium/drivers/v3d/v3d_formats.c
@@ -0,0 +1,144 @@
+/*
+ * Copyright © 2014-2017 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * @file v3d_formats.c
+ *
+ * Contains the table and accessors for VC5 texture and render target format
+ * support.
+ *
+ * The hardware has limited support for texture formats, and extremely limited
+ * support for render target formats.  As a result, we emulate other formats
+ * in our shader code, and this stores the table for doing so.
+ */
+
+#include "util/macros.h"
+
+#include "v3d_context.h"
+#include "v3d_format_table.h"
+
+/* Returns the format description for f from the table matching the device
+ * version: the v3d41 one for version 41 and up, the v3d33 one otherwise.
+ * The result is whatever the selected v3dXX_get_format_desc() returns; the
+ * callers in this file treat NULL as "format not supported".
+ */
+static const struct vc5_format *
+get_format(const struct v3d_device_info *devinfo, enum pipe_format f)
+{
+        return devinfo->ver >= 41 ? v3d41_get_format_desc(f)
+                                  : v3d33_get_format_desc(f);
+}
+
+/* Returns true if f has a format description on this device and its render
+ * target type is not V3D_OUTPUT_IMAGE_FORMAT_NO.
+ */
+bool
+vc5_rt_format_supported(const struct v3d_device_info *devinfo,
+                        enum pipe_format f)
+{
+        const struct vc5_format *desc = get_format(devinfo, f);
+
+        return desc != NULL && desc->rt_type != V3D_OUTPUT_IMAGE_FORMAT_NO;
+}
+
+/* Returns the render target type of f, or 0 if f has no format description
+ * on this device.
+ */
+uint8_t
+vc5_get_rt_format(const struct v3d_device_info *devinfo, enum pipe_format f)
+{
+        const struct vc5_format *desc = get_format(devinfo, f);
+
+        return desc ? desc->rt_type : 0;
+}
+
+/* Returns true if f has a format description on this device. */
+bool
+vc5_tex_format_supported(const struct v3d_device_info *devinfo,
+                         enum pipe_format f)
+{
+        return get_format(devinfo, f) != NULL;
+}
+
+/* Returns the texture data type of f, or 0 if f has no format description on
+ * this device.
+ */
+uint8_t
+vc5_get_tex_format(const struct v3d_device_info *devinfo, enum pipe_format f)
+{
+        const struct vc5_format *desc = get_format(devinfo, f);
+
+        return desc ? desc->tex_type : 0;
+}
+
+/* Returns the texture return size for f: 0 if f has no format description on
+ * this device, 16 when compare is PIPE_TEX_COMPARE_R_TO_TEXTURE, and the
+ * description's return_size otherwise.
+ */
+uint8_t
+vc5_get_tex_return_size(const struct v3d_device_info *devinfo,
+                        enum pipe_format f, enum pipe_tex_compare compare)
+{
+        const struct vc5_format *desc = get_format(devinfo, f);
+
+        if (desc == NULL)
+                return 0;
+
+        return compare == PIPE_TEX_COMPARE_R_TO_TEXTURE ? 16
+                                                        : desc->return_size;
+}
+
+/* Returns the return_channels value of f's format description, or 0 if f
+ * has no format description on this device.
+ */
+uint8_t
+vc5_get_tex_return_channels(const struct v3d_device_info *devinfo,
+                            enum pipe_format f)
+{
+        const struct vc5_format *desc = get_format(devinfo, f);
+
+        return desc ? desc->return_channels : 0;
+}
+
+/* Returns the 4-entry swizzle of f's format description.  For a format with
+ * no description on this device, a static {0, 1, 2, 3} swizzle is returned
+ * instead, so the result is never NULL.
+ */
+const uint8_t *
+vc5_get_format_swizzle(const struct v3d_device_info *devinfo, enum pipe_format f)
+{
+        static const uint8_t default_swizzle[] = {0, 1, 2, 3};
+        const struct vc5_format *desc = get_format(devinfo, f);
+
+        return desc ? desc->swizzle : default_swizzle;
+}
+
+/**
+ * Looks up the internal type and bpp used for an output image format.
+ *
+ * Forwards to the v3d41 implementation for device version 41 and up, and to
+ * the v3d33 one otherwise.  The helpers are called as plain statements: a
+ * "return <expression>;" is not valid ISO C in a function returning void.
+ *
+ * \param devinfo  device info; only its version is read here.
+ * \param format   output image format value, passed through.
+ * \param type     passed through to the per-version helper.
+ * \param bpp      passed through to the per-version helper.
+ */
+void
+vc5_get_internal_type_bpp_for_output_format(const struct v3d_device_info *devinfo,
+                                            uint32_t format,
+                                            uint32_t *type,
+                                            uint32_t *bpp)
+{
+        if (devinfo->ver >= 41) {
+                v3d41_get_internal_type_bpp_for_output_format(format,
+                                                              type, bpp);
+        } else {
+                v3d33_get_internal_type_bpp_for_output_format(format,
+                                                              type, bpp);
+        }
+}
diff --git a/src/gallium/drivers/v3d/v3d_job.c b/src/gallium/drivers/v3d/v3d_job.c
new file mode 100644
index 00000000000..85c64df34ca
--- /dev/null
+++ b/src/gallium/drivers/v3d/v3d_job.c
@@ -0,0 +1,452 @@
+/*
+ * Copyright © 2014-2017 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file v3d_job.c
+ *
+ * Functions for submitting V3D render jobs to the kernel.
+ */
+
+#include <xf86drm.h>
+#include "v3d_context.h"
+/* The OQ/semaphore packets are the same across V3D versions. */
+#define V3D_VERSION 33
+#include "broadcom/cle/v3dx_pack.h"
+#include "broadcom/common/v3d_macros.h"
+#include "util/hash_table.h"
+#include "util/ralloc.h"
+#include "util/set.h"
+#include "broadcom/clif/clif_dump.h"
+
+/* Searches ht for key and hands whatever the search returned to
+ * _mesa_hash_table_remove().
+ */
+static void
+remove_from_ht(struct hash_table *ht, void *key)
+{
+        _mesa_hash_table_remove(ht, _mesa_hash_table_search(ht, key));
+}
+
+/**
+ * Tears down a job: drops the BO references it holds, removes it from the
+ * context's job and write-job tables, releases its surfaces, command lists
+ * and tile BOs, and finally frees the job allocation itself.
+ *
+ * If the job is the context's currently bound job, vc5->job is cleared.
+ */
+static void
+vc5_job_free(struct vc5_context *vc5, struct vc5_job *job)
+{
+        struct set_entry *entry;
+
+        /* Every BO in the set was referenced when it was added to the job. */
+        set_foreach(job->bos, entry) {
+                struct vc5_bo *bo = (struct vc5_bo *)entry->key;
+                vc5_bo_unreference(&bo);
+        }
+
+        remove_from_ht(vc5->jobs, &job->key);
+
+        /* write_prscs is only created on demand, so it may still be NULL. */
+        if (job->write_prscs) {
+                set_foreach(job->write_prscs, entry)
+                        remove_from_ht(vc5->write_jobs, (void *)entry->key);
+        }
+
+        /* Stop tracking this job as the writer of its color buffers... */
+        for (int i = 0; i < VC5_MAX_DRAW_BUFFERS; i++) {
+                if (!job->cbufs[i])
+                        continue;
+
+                remove_from_ht(vc5->write_jobs, job->cbufs[i]->texture);
+                pipe_surface_reference(&job->cbufs[i], NULL);
+        }
+
+        /* ...and of its depth/stencil buffer. */
+        if (job->zsbuf) {
+                remove_from_ht(vc5->write_jobs, job->zsbuf->texture);
+                pipe_surface_reference(&job->zsbuf, NULL);
+        }
+
+        if (vc5->job == job)
+                vc5->job = NULL;
+
+        vc5_destroy_cl(&job->bcl);
+        vc5_destroy_cl(&job->rcl);
+        vc5_destroy_cl(&job->indirect);
+        vc5_bo_unreference(&job->tile_alloc);
+        vc5_bo_unreference(&job->tile_state);
+
+        ralloc_free(job);
+}
+
+/**
+ * Allocates a zero-initialized job as a ralloc child of the context and
+ * sets up its command lists, draw bounds and BO set.
+ */
+static struct vc5_job *
+vc5_job_create(struct vc5_context *vc5)
+{
+        struct vc5_job *new_job = rzalloc(vc5, struct vc5_job);
+
+        new_job->vc5 = vc5;
+
+        vc5_init_cl(new_job, &new_job->bcl);
+        vc5_init_cl(new_job, &new_job->rcl);
+        vc5_init_cl(new_job, &new_job->indirect);
+
+        /* Start with an inverted (empty) draw rectangle: min at all-ones,
+         * max at zero.
+         */
+        new_job->draw_min_x = ~0;
+        new_job->draw_min_y = ~0;
+        new_job->draw_max_x = 0;
+        new_job->draw_max_y = 0;
+
+        /* The BO set is keyed by pointer and is a ralloc child of the job. */
+        new_job->bos = _mesa_set_create(new_job, _mesa_hash_pointer,
+                                        _mesa_key_pointer_equal);
+
+        return new_job;
+}
+
+/**
+ * Adds a BO to the job's set of referenced BOs and appends its handle to the
+ * handle array carried in job->submit.
+ *
+ * NULL BOs and BOs that are already in the job's set are ignored.
+ */
+void
+vc5_job_add_bo(struct vc5_job *job, struct vc5_bo *bo)
+{
+        if (!bo || _mesa_set_search(job->bos, bo))
+                return;
+
+        /* The job takes its own reference for as long as the BO is in the
+         * set.
+         */
+        vc5_bo_reference(bo);
+        _mesa_set_add(job->bos, bo);
+        job->referenced_size += bo->size;
+
+        /* submit.bo_handles stores the array's address as an integer. */
+        uint32_t *handles = (void *)(uintptr_t)job->submit.bo_handles;
+
+        /* Out of room: grow the array, doubling from a minimum of 4. */
+        if (job->submit.bo_handle_count >= job->bo_handles_size) {
+                job->bo_handles_size = MAX2(4, job->bo_handles_size * 2);
+                handles = reralloc(job, handles, uint32_t,
+                                   job->bo_handles_size);
+                job->submit.bo_handles = (uintptr_t)(void *)handles;
+        }
+
+        handles[job->submit.bo_handle_count] = bo->handle;
+        job->submit.bo_handle_count++;
+}
+
+/**
+ * Records that the job writes prsc: the resource is added to the job's
+ * write_prscs set (created on first use) and the context's write_jobs table
+ * is pointed at this job for that resource.
+ */
+void
+vc5_job_add_write_resource(struct vc5_job *job, struct pipe_resource *prsc)
+{
+        if (job->write_prscs == NULL) {
+                job->write_prscs = _mesa_set_create(job, _mesa_hash_pointer,
+                                                    _mesa_key_pointer_equal);
+        }
+
+        _mesa_set_add(job->write_prscs, prsc);
+        _mesa_hash_table_insert(job->vc5->write_jobs, prsc, job);
+}
+
+/**
+ * Submits the job registered in vc5->write_jobs as writing prsc, if there
+ * is one.
+ */
+void
+vc5_flush_jobs_writing_resource(struct vc5_context *vc5,
+                                struct pipe_resource *prsc)
+{
+        struct hash_entry *writer =
+                _mesa_hash_table_search(vc5->write_jobs, prsc);
+
+        if (!writer)
+                return;
+
+        vc5_job_submit(vc5, writer->data);
+}
+
+/**
+ * Submits every job that references prsc's BO: first the job registered as
+ * writing the resource, then any remaining job whose BO set contains the BO.
+ */
+void
+vc5_flush_jobs_reading_resource(struct vc5_context *vc5,
+                                struct pipe_resource *prsc)
+{
+        struct vc5_resource *rsc = vc5_resource(prsc);
+        struct hash_entry *entry;
+
+        vc5_flush_jobs_writing_resource(vc5, prsc);
+
+        hash_table_foreach(vc5->jobs, entry) {
+                struct vc5_job *job = entry->data;
+
+                if (!_mesa_set_search(job->bos, rsc->bo))
+                        continue;
+
+                /* Reminder: vc5->jobs is safe to keep iterating even after
+                 * deletion of an entry.
+                 */
+                vc5_job_submit(vc5, job);
+        }
+}
+
+/**
+ * Picks the tile width/height for a job from its MSAA state, the number of
+ * bound color buffers and the largest internal bpp among them, and stores
+ * the results in job->tile_width, job->tile_height and job->internal_bpp.
+ *
+ * The table is walked one (width, height) pair at a time: MSAA advances two
+ * pairs, having cbufs[2] or cbufs[3] bound advances two pairs (cbufs[1]
+ * alone advances one), and the maximum internal bpp value is added on top.
+ */
+static void
+vc5_job_set_tile_buffer_size(struct vc5_job *job)
+{
+        /* Flattened list of (width, height) pairs, largest tile first. */
+        static const uint8_t tile_sizes[] = {
+                64, 64,
+                64, 32,
+                32, 32,
+                32, 16,
+                16, 16,
+        };
+        int tile_size_index = 0;
+        if (job->msaa)
+                tile_size_index += 2;
+
+        if (job->cbufs[3] || job->cbufs[2])
+                tile_size_index += 2;
+        else if (job->cbufs[1])
+                tile_size_index++;
+
+        int max_bpp = RENDER_TARGET_MAXIMUM_32BPP;
+        for (int i = 0; i < VC5_MAX_DRAW_BUFFERS; i++) {
+                if (job->cbufs[i]) {
+                        struct vc5_surface *surf = vc5_surface(job->cbufs[i]);
+                        max_bpp = MAX2(max_bpp, surf->internal_bpp);
+                }
+        }
+        job->internal_bpp = max_bpp;
+        /* Adding max_bpp directly to the index relies on 32bpp being 0. */
+        STATIC_ASSERT(RENDER_TARGET_MAXIMUM_32BPP == 0);
+        tile_size_index += max_bpp;
+
+        /* tile_size_index counts pairs, not array elements, so the valid
+         * range is half the array length.  Comparing against the full
+         * length would let an out-of-range index slip past the assert and
+         * read beyond the table below.
+         */
+        assert(tile_size_index < ARRAY_SIZE(tile_sizes) / 2);
+        job->tile_width = tile_sizes[tile_size_index * 2 + 0];
+        job->tile_height = tile_sizes[tile_size_index * 2 + 1];
+}
+
+/**
+ * Returns a vc5_job structure for tracking V3D rendering to a particular FBO.
+ *
+ * If we've already started rendering to this FBO, then return the existing
+ * job, otherwise make a new one.  If we're beginning rendering to an FBO,
+ * make sure that any previous reads of the FBO (or writes to its color/Z
+ * surfaces) have been flushed.
+ *
+ * The cbufs array is read at indices 0..3; unused entries and zsbuf may be
+ * NULL.
+ */
+struct vc5_job *
+vc5_get_job(struct vc5_context *vc5,
+            struct pipe_surface **cbufs, struct pipe_surface *zsbuf)
+{
+        /* Return the existing job for this FBO if we have one */
+        struct vc5_job_key local_key = {
+                .cbufs = {
+                        cbufs[0],
+                        cbufs[1],
+                        cbufs[2],
+                        cbufs[3],
+                },
+                .zsbuf = zsbuf,
+        };
+        struct hash_entry *entry = _mesa_hash_table_search(vc5->jobs,
+                                                           &local_key);
+        if (entry)
+                return entry->data;
+
+        /* Creating a new job.  Make sure that any previous jobs reading or
+         * writing these buffers are flushed.
+         */
+        struct vc5_job *job = vc5_job_create(vc5);
+
+        /* Flush first, then take a reference on each bound surface.  The
+         * job is marked MSAA if any bound surface's texture has more than
+         * one sample.
+         */
+        for (int i = 0; i < VC5_MAX_DRAW_BUFFERS; i++) {
+                if (cbufs[i]) {
+                        vc5_flush_jobs_reading_resource(vc5, cbufs[i]->texture);
+                        pipe_surface_reference(&job->cbufs[i], cbufs[i]);
+
+                        if (cbufs[i]->texture->nr_samples > 1)
+                                job->msaa = true;
+                }
+        }
+        if (zsbuf) {
+                vc5_flush_jobs_reading_resource(vc5, zsbuf->texture);
+                pipe_surface_reference(&job->zsbuf, zsbuf);
+                if (zsbuf->texture->nr_samples > 1)
+                        job->msaa = true;
+        }
+
+        /* Needs job->msaa and job->cbufs[], which were set up above. */
+        vc5_job_set_tile_buffer_size(job);
+
+        /* Only register the new job as the writer of its surfaces after all
+         * the flushes above are done, so those flushes cannot find (and
+         * submit) the job being created here.
+         */
+        for (int i = 0; i < VC5_MAX_DRAW_BUFFERS; i++) {
+                if (cbufs[i])
+                        _mesa_hash_table_insert(vc5->write_jobs,
+                                                cbufs[i]->texture, job);
+        }
+        if (zsbuf)
+                _mesa_hash_table_insert(vc5->write_jobs, zsbuf->texture, job);
+
+        /* The table key must live as long as the entry, so it is copied
+         * into the job rather than pointing at the stack-local key.
+         */
+        memcpy(&job->key, &local_key, sizeof(local_key));
+        _mesa_hash_table_insert(vc5->jobs, &job->key, job);
+
+        return job;
+}
+
+/**
+ * Returns the job for the context's currently bound framebuffer, binding it
+ * as vc5->job.
+ *
+ * If a job is already bound it is returned as-is.  Otherwise the job for the
+ * framebuffer's surfaces is looked up (or created), all dirty flags are set,
+ * never-written buffers are marked as cleared, and the tile counts are
+ * derived from the framebuffer size.
+ */
+struct vc5_job *
+vc5_get_job_for_fbo(struct vc5_context *vc5)
+{
+        if (vc5->job)
+                return vc5->job;
+
+        struct pipe_surface **cbufs = vc5->framebuffer.cbufs;
+        struct pipe_surface *zsbuf = vc5->framebuffer.zsbuf;
+        struct vc5_job *job = vc5_get_job(vc5, cbufs, zsbuf);
+
+        /* The dirty flags are tracking what's been updated while vc5->job has
+         * been bound, so set them all to ~0 when switching between jobs.  We
+         * also need to reset all state at the start of rendering.
+         */
+        vc5->dirty = ~0;
+
+        /* If we're binding to uninitialized buffers, no need to load their
+         * contents before drawing.
+         *
+         * The loop bound is the same VC5_MAX_DRAW_BUFFERS constant that the
+         * other loops over the job's color buffers use, rather than a bare 4.
+         */
+        for (int i = 0; i < VC5_MAX_DRAW_BUFFERS; i++) {
+                if (cbufs[i]) {
+                        struct vc5_resource *rsc = vc5_resource(cbufs[i]->texture);
+                        if (!rsc->writes)
+                                job->cleared |= PIPE_CLEAR_COLOR0 << i;
+                }
+        }
+
+        if (zsbuf) {
+                struct vc5_resource *rsc = vc5_resource(zsbuf->texture);
+                if (!rsc->writes)
+                        job->cleared |= PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL;
+        }
+
+        /* Number of tiles needed to cover the framebuffer, rounding up. */
+        job->draw_tiles_x = DIV_ROUND_UP(vc5->framebuffer.width,
+                                         job->tile_width);
+        job->draw_tiles_y = DIV_ROUND_UP(vc5->framebuffer.height,
+                                         job->tile_height);
+
+        vc5->job = job;
+
+        return job;
+}
+
+/**
+ * Address lookup callback passed to clif_dump_init().
+ *
+ * data is the vc5_job being dumped.  Walks the job's BO set for a BO whose
+ * [offset, offset + size) range contains addr; on a hit the BO is mapped,
+ * *vaddr receives the matching CPU pointer and true is returned.  Returns
+ * false (leaving *vaddr untouched) when no BO covers the address.
+ */
+static bool
+vc5_clif_dump_lookup(void *data, uint32_t addr, void **vaddr)
+{
+        struct vc5_job *job = data;
+        struct set_entry *entry;
+
+        set_foreach(job->bos, entry) {
+                struct vc5_bo *bo = (void *)entry->key;
+
+                /* Skip BOs whose range does not cover addr. */
+                if (addr < bo->offset ||
+                    addr >= bo->offset + bo->size)
+                        continue;
+
+                vc5_bo_map(bo);
+                *vaddr = bo->map + addr - bo->offset;
+                return true;
+        }
+
+        return false;
+}
+
+/**
+ * Dumps the job's binner and render command lists to stderr when
+ * V3D_DEBUG_CL is set in V3D_DEBUG; does nothing otherwise.
+ *
+ * The BCL/RCL address ranges are taken from job->submit, so this must run
+ * after those fields have been filled in.
+ */
+static void
+vc5_clif_dump(struct vc5_context *vc5, struct vc5_job *job)
+{
+        if (!(V3D_DEBUG & V3D_DEBUG_CL))
+                return;
+
+        struct clif_dump *clif = clif_dump_init(&vc5->screen->devinfo,
+                                                stderr, vc5_clif_dump_lookup,
+                                                job);
+
+        fprintf(stderr, "BCL: 0x%08x..0x%08x\n",
+                job->submit.bcl_start, job->submit.bcl_end);
+
+        clif_dump_add_cl(clif, job->submit.bcl_start, job->submit.bcl_end);
+
+        fprintf(stderr, "RCL: 0x%08x..0x%08x\n",
+                job->submit.rcl_start, job->submit.rcl_end);
+        clif_dump_add_cl(clif, job->submit.rcl_start, job->submit.rcl_end);
+
+        /* Release the dumper created above; without this, every submit with
+         * CL debugging enabled leaks one clif_dump.
+         */
+        clif_dump_destroy(clif);
+}
+
+/**
+ * Submits the job to the kernel (or to the simulator when USE_V3D_SIMULATOR
+ * is defined) and then frees it.
+ *
+ * Jobs without needs_flush set skip the submission entirely and are only
+ * freed.  In every case the job pointer is invalid once this returns.
+ */
+void
+vc5_job_submit(struct vc5_context *vc5, struct vc5_job *job)
+{
+        MAYBE_UNUSED struct vc5_screen *screen = vc5->screen;
+
+        /* Nothing to flush: go straight to freeing the job. */
+        if (!job->needs_flush)
+                goto done;
+
+        /* Emit the render command list with the version-specific helper. */
+        if (vc5->screen->devinfo.ver >= 41)
+                v3d41_emit_rcl(job);
+        else
+                v3d33_emit_rcl(job);
+
+        /* Only close out the binner CL if something was written to it. */
+        if (cl_offset(&job->bcl) > 0) {
+                if (screen->devinfo.ver >= 41)
+                        v3d41_bcl_epilogue(vc5, job);
+                else
+                        v3d33_bcl_epilogue(vc5, job);
+        }
+
+        /* The CL end addresses are the BO's offset plus the amount written. */
+        job->submit.out_sync = vc5->out_sync;
+        job->submit.bcl_end = job->bcl.bo->offset + cl_offset(&job->bcl);
+        job->submit.rcl_end = job->rcl.bo->offset + cl_offset(&job->rcl);
+
+        /* On V3D 4.1, the tile alloc/state setup moved to register writes
+         * instead of binner packets.
+         */
+        /* NOTE(review): tile_alloc and tile_state are dereferenced without a
+         * NULL check here -- presumably they are always allocated by this
+         * point on 4.1; confirm.
+         */
+        if (screen->devinfo.ver >= 41) {
+                vc5_job_add_bo(job, job->tile_alloc);
+                job->submit.qma = job->tile_alloc->offset;
+                job->submit.qms = job->tile_alloc->size;
+
+                vc5_job_add_bo(job, job->tile_state);
+                job->submit.qts = job->tile_state->offset;
+        }
+
+        /* Dump after job->submit is filled in, since the dump reads the
+         * CL ranges from it.
+         */
+        vc5_clif_dump(vc5, job);
+
+        /* V3D_DEBUG_NORAST skips the actual submission. */
+        if (!(V3D_DEBUG & V3D_DEBUG_NORAST)) {
+                int ret;
+
+#ifndef USE_V3D_SIMULATOR
+                ret = drmIoctl(vc5->fd, DRM_IOCTL_V3D_SUBMIT_CL, &job->submit);
+#else
+                ret = vc5_simulator_flush(vc5, &job->submit, job);
+#endif
+                /* Report only the first failure; the flag is static, so it
+                 * persists across calls.
+                 */
+                static bool warned = false;
+                if (ret && !warned) {
+                        fprintf(stderr, "Draw call returned %s.  "
+                                        "Expect corruption.\n", strerror(errno));
+                        warned = true;
+                }
+        }
+
+done:
+        vc5_job_free(vc5, job);
+}
+
+/* Key equality callback for vc5->jobs: two keys match when their
+ * struct vc5_job_key bytes are identical.
+ */
+static bool
+vc5_job_compare(const void *a, const void *b)
+{
+        return !memcmp(a, b, sizeof(struct vc5_job_key));
+}
+
+/* Hash callback for vc5->jobs: hashes the raw bytes of a
+ * struct vc5_job_key.
+ */
+static uint32_t
+vc5_job_hash(const void *key)
+{
+        const size_t key_bytes = sizeof(struct vc5_job_key);
+
+        return _mesa_hash_data(key, key_bytes);
+}
+
+/**
+ * Creates the context's two job-tracking hash tables, both as ralloc
+ * children of the context:
+ *
+ *  - vc5->jobs, keyed by the contents of a struct vc5_job_key
+ *  - vc5->write_jobs, keyed by pointer
+ */
+void
+vc5_job_init(struct vc5_context *vc5)
+{
+        vc5->jobs = _mesa_hash_table_create(vc5, vc5_job_hash,
+                                            vc5_job_compare);
+
+        vc5->write_jobs = _mesa_hash_table_create(vc5, _mesa_hash_pointer,
+                                                  _mesa_key_pointer_equal);
+}
+
diff --git a/src/gallium/drivers/v3d/v3d_program.c b/src/gallium/drivers/v3d/v3d_program.c
new file mode 100644
index 00000000000..ce2e0be8ed2
--- /dev/null
+++ b/src/gallium/drivers/v3d/v3d_program.c
@@ -0,0 +1,682 @@
+/*
+ * Copyright © 2014-2017 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <inttypes.h>
+#include "util/u_format.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/ralloc.h"
+#include "util/hash_table.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_parse.h"
+#include "compiler/nir/nir.h"
+#include "compiler/nir/nir_builder.h"
+#include "nir/tgsi_to_nir.h"
+#include "compiler/v3d_compiler.h"
+#include "v3d_context.h"
+#include "broadcom/cle/v3d_packet_v33_pack.h"
+#include "mesa/state_tracker/st_glsl_types.h"
+
+/**
+ * Finds the shader output variable with the given driver_location and
+ * returns its data.location.  Returns -1 if no output matches.
+ */
+static gl_varying_slot
+vc5_get_slot_for_driver_location(nir_shader *s, uint32_t driver_location)
+{
+        nir_foreach_variable(var, &s->outputs) {
+                if (var->data.driver_location != driver_location)
+                        continue;
+
+                return var->data.location;
+        }
+
+        return -1;
+}
+
+/**
+ * Precomputes the TRANSFORM_FEEDBACK_OUTPUT_DATA_SPEC array for the shader.
+ *
+ * A shader can have 16 of these specs, and each one of them can write up to
+ * 16 dwords.  Since we allow a total of 64 transform feedback output
+ * components (not 16 vectors), we have to group the writes of multiple
+ * varyings together in a single data spec.
+ *
+ * Fills in so->tf_specs, so->tf_specs_psiz, so->num_tf_specs,
+ * so->tf_outputs and so->num_tf_outputs.  Does nothing when stream_output
+ * has no outputs.
+ */
+static void
+vc5_set_transform_feedback_outputs(struct vc5_uncompiled_shader *so,
+                                   const struct pipe_stream_output_info *stream_output)
+{
+        if (!stream_output->num_outputs)
+                return;
+
+        /* Staging array for the output slots; copied into so->tf_outputs at
+         * the end.
+         */
+        /* NOTE(review): nothing here bounds slot_count against the size of
+         * slots[] -- presumably the padding loop below cannot push it past
+         * PIPE_MAX_SO_OUTPUTS * 4; confirm.
+         */
+        struct v3d_varying_slot slots[PIPE_MAX_SO_OUTPUTS * 4];
+        int slot_count = 0;
+
+        /* Outputs are grouped per buffer, so the list is scanned once for
+         * each possible buffer index.
+         */
+        for (int buffer = 0; buffer < PIPE_MAX_SO_BUFFERS; buffer++) {
+                /* buffer_offset counts the values emitted for this buffer
+                 * so far; vpm_start is the slot index at which this
+                 * buffer's values begin.
+                 */
+                uint32_t buffer_offset = 0;
+                uint32_t vpm_start = slot_count;
+
+                for (int i = 0; i < stream_output->num_outputs; i++) {
+                        const struct pipe_stream_output *output =
+                                &stream_output->output[i];
+
+                        if (output->output_buffer != buffer)
+                                continue;
+
+                        /* We assume that the SO outputs appear in increasing
+                         * order in the buffer.
+                         */
+                        assert(output->dst_offset >= buffer_offset);
+
+                        /* Pad any undefined slots in the output */
+                        for (int j = buffer_offset; j < output->dst_offset; j++) {
+                                slots[slot_count] =
+                                        v3d_slot_from_slot_and_component(VARYING_SLOT_POS, 0);
+                                slot_count++;
+                                buffer_offset++;
+                        }
+
+                        /* Set the coordinate shader up to output the
+                         * components of this varying.
+                         */
+                        for (int j = 0; j < output->num_components; j++) {
+                                gl_varying_slot slot =
+                                        vc5_get_slot_for_driver_location(so->base.ir.nir, output->register_index);
+
+                                slots[slot_count] =
+                                        v3d_slot_from_slot_and_component(slot,
+                                                                         output->start_component + j);
+                                slot_count++;
+                                buffer_offset++;
+                        }
+                }
+
+                /* Number of values this buffer contributed; buffers with no
+                 * outputs need no specs.
+                 */
+                uint32_t vpm_size = slot_count - vpm_start;
+                if (!vpm_size)
+                        continue;
+
+                /* The + 6 skips the fixed values at the start of the VPM
+                 * output block (see the comment in the spec initializer
+                 * below).
+                 */
+                uint32_t vpm_start_offset = vpm_start + 6;
+
+                /* Split this buffer's values into specs of at most 16
+                 * (1 << 4) values each.
+                 */
+                while (vpm_size) {
+                        uint32_t write_size = MIN2(vpm_size, 1 << 4);
+
+                        struct V3D33_TRANSFORM_FEEDBACK_OUTPUT_DATA_SPEC unpacked = {
+                                /* We need the offset from the coordinate shader's VPM
+                                 * output block, which has the [X, Y, Z, W, Xs, Ys]
+                                 * values at the start.
+                                 */
+                                .first_shaded_vertex_value_to_output = vpm_start_offset,
+                                .number_of_consecutive_vertex_values_to_output_as_32_bit_values_minus_1 = write_size - 1,
+                                .output_buffer_to_write_to = buffer,
+                        };
+
+                        /* GFXH-1559 */
+                        assert(unpacked.first_shaded_vertex_value_to_output != 8 ||
+                               so->num_tf_specs != 0);
+
+                        /* Make sure there is still room for one more spec. */
+                        assert(so->num_tf_specs != ARRAY_SIZE(so->tf_specs));
+                        V3D33_TRANSFORM_FEEDBACK_OUTPUT_DATA_SPEC_pack(NULL,
+                                                                       (void *)&so->tf_specs[so->num_tf_specs],
+                                                                       &unpacked);
+
+                        /* If point size is being written by the shader, then
+                         * all the VPM start offsets are shifted up by one.
+                         * We won't know that until the variant is compiled,
+                         * though.
+                         */
+                        unpacked.first_shaded_vertex_value_to_output++;
+
+                        /* GFXH-1559 */
+                        assert(unpacked.first_shaded_vertex_value_to_output != 8 ||
+                               so->num_tf_specs != 0);
+
+                        /* The shifted variant is packed at the same index in
+                         * the parallel tf_specs_psiz array.
+                         */
+                        V3D33_TRANSFORM_FEEDBACK_OUTPUT_DATA_SPEC_pack(NULL,
+                                                                       (void *)&so->tf_specs_psiz[so->num_tf_specs],
+                                                                       &unpacked);
+                        so->num_tf_specs++;
+                        vpm_start_offset += write_size;
+                        vpm_size -= write_size;
+                }
+        }
+
+        /* Publish the slot list; the copy is a ralloc child of the NIR
+         * shader.
+         */
+        so->num_tf_outputs = slot_count;
+        so->tf_outputs = ralloc_array(so->base.ir.nir, struct v3d_varying_slot,
+                                      slot_count);
+        memcpy(so->tf_outputs, slots, sizeof(*slots) * slot_count);
+}
+
+/* Type-size callback for nir_lower_io on non-uniform variables: the size of
+ * a type is its attribute slot count.
+ */
+static int
+type_size(const struct glsl_type *type)
+{
+        const int slots = glsl_count_attribute_slots(type, false);
+
+        return slots;
+}
+
+/* Type-size callback for nir_lower_io on uniforms: the size of a type is
+ * whatever st_glsl_storage_type_size() reports for it.
+ */
+static int
+uniforms_type_size(const struct glsl_type *type)
+{
+        const int size = st_glsl_storage_type_size(type, false);
+
+        return size;
+}
+
+/**
+ * Creates the driver's uncompiled shader state object from a gallium
+ * shader CSO.
+ *
+ * Accepts either NIR or TGSI input (TGSI is translated to NIR), runs the
+ * common NIR lowering/optimization passes, and precomputes the transform
+ * feedback output specs.  The resulting object always stores NIR.
+ *
+ * Returns the new vc5_uncompiled_shader, or NULL if it cannot be allocated.
+ */
+static void *
+vc5_shader_state_create(struct pipe_context *pctx,
+                        const struct pipe_shader_state *cso)
+{
+        struct vc5_context *vc5 = vc5_context(pctx);
+        struct vc5_uncompiled_shader *so = CALLOC_STRUCT(vc5_uncompiled_shader);
+        if (!so)
+                return NULL;
+
+        /* Each uncompiled shader gets the next id from a per-context
+         * counter; the id is used in the debug output below.
+         */
+        so->program_id = vc5->next_uncompiled_program_id++;
+
+        nir_shader *s;
+
+        if (cso->type == PIPE_SHADER_IR_NIR) {
+                /* The backend takes ownership of the NIR shader on state
+                 * creation.
+                 */
+                s = cso->ir.nir;
+
+                /* I/O lowering is only run on this path; uniforms use a
+                 * different type-size callback than everything else.
+                 */
+                NIR_PASS_V(s, nir_lower_io, nir_var_all & ~nir_var_uniform,
+                           type_size,
+                           (nir_lower_io_options)0);
+                NIR_PASS_V(s, nir_lower_io, nir_var_uniform,
+                           uniforms_type_size,
+                           (nir_lower_io_options)0);
+        } else {
+                assert(cso->type == PIPE_SHADER_IR_TGSI);
+
+                if (V3D_DEBUG & V3D_DEBUG_TGSI) {
+                        fprintf(stderr, "prog %d TGSI:\n",
+                                so->program_id);
+                        tgsi_dump(cso->tokens, 0);
+                        fprintf(stderr, "\n");
+                }
+                s = tgsi_to_nir(cso->tokens, &v3d_nir_options);
+
+                so->was_tgsi = true;
+        }
+
+        /* Passes common to both input paths. */
+        NIR_PASS_V(s, nir_opt_global_to_local);
+        NIR_PASS_V(s, nir_lower_regs_to_ssa);
+        NIR_PASS_V(s, nir_normalize_cubemap_coords);
+
+        NIR_PASS_V(s, nir_lower_load_const_to_scalar);
+
+        v3d_optimize_nir(s);
+
+        NIR_PASS_V(s, nir_remove_dead_variables, nir_var_local);
+
+        /* Garbage collect dead instructions */
+        nir_sweep(s);
+
+        /* Whatever the input IR was, the stored state is NIR. */
+        so->base.type = PIPE_SHADER_IR_NIR;
+        so->base.ir.nir = s;
+
+        /* Must come after so->base.ir.nir is set: the transform feedback
+         * setup reads the shader through it.
+         */
+        vc5_set_transform_feedback_outputs(so, &cso->stream_output);
+
+        /* Dump the NIR if either the general NIR flag or the flag for this
+         * shader's stage is set.
+         */
+        if (V3D_DEBUG & (V3D_DEBUG_NIR |
+                         v3d_debug_flag_for_shader_stage(s->info.stage))) {
+                fprintf(stderr, "%s prog %d NIR:\n",
+                        gl_shader_stage_name(s->info.stage),
+                        so->program_id);
+                nir_print_shader(s, stderr);
+                fprintf(stderr, "\n");
+        }
+
+        return so;
+}
+
+/**
+ * Returns the compiled variant of a shader for the given key, compiling and
+ * caching it on first use.
+ *
+ * key points at a v3d_fs_key for fragment shaders or a v3d_vs_key otherwise;
+ * the matching per-stage cache (vc5->fs_cache / vc5->vs_cache) is searched
+ * first.  On a miss the shader is compiled, uploaded to a new BO, inserted
+ * into the cache under a private copy of the key, and the context's spill
+ * BO is grown if this variant needs more spill space per thread than any
+ * seen before.
+ */
+static struct vc5_compiled_shader *
+vc5_get_compiled_shader(struct vc5_context *vc5, struct v3d_key *key)
+{
+        struct vc5_uncompiled_shader *shader_state = key->shader_state;
+        nir_shader *s = shader_state->base.ir.nir;
+
+        /* Pick the cache and the full key size for this shader stage. */
+        struct hash_table *ht;
+        uint32_t key_size;
+        if (s->info.stage == MESA_SHADER_FRAGMENT) {
+                ht = vc5->fs_cache;
+                key_size = sizeof(struct v3d_fs_key);
+        } else {
+                ht = vc5->vs_cache;
+                key_size = sizeof(struct v3d_vs_key);
+        }
+
+        struct hash_entry *entry = _mesa_hash_table_search(ht, key);
+        if (entry)
+                return entry->data;
+
+        /* Cache miss: compile a new variant. */
+        struct vc5_compiled_shader *shader =
+                rzalloc(NULL, struct vc5_compiled_shader);
+
+        int program_id = shader_state->program_id;
+        int variant_id =
+                p_atomic_inc_return(&shader_state->compiled_variant_count);
+        uint64_t *qpu_insts;
+        uint32_t shader_size;
+
+        switch (s->info.stage) {
+        case MESA_SHADER_VERTEX:
+                shader->prog_data.vs = rzalloc(shader, struct v3d_vs_prog_data);
+
+                qpu_insts = v3d_compile_vs(vc5->screen->compiler,
+                                           (struct v3d_vs_key *)key,
+                                           shader->prog_data.vs, s,
+                                           program_id, variant_id,
+                                           &shader_size);
+                break;
+        case MESA_SHADER_FRAGMENT:
+                shader->prog_data.fs = rzalloc(shader, struct v3d_fs_prog_data);
+
+                qpu_insts = v3d_compile_fs(vc5->screen->compiler,
+                                           (struct v3d_fs_key *)key,
+                                           shader->prog_data.fs, s,
+                                           program_id, variant_id,
+                                           &shader_size);
+                break;
+        default:
+                unreachable("bad stage");
+        }
+
+        vc5_set_shader_uniform_dirty_flags(shader);
+
+        /* Upload the instructions into a BO, then release the compiler's
+         * buffer.
+         */
+        shader->bo = vc5_bo_alloc(vc5->screen, shader_size, "shader");
+        vc5_bo_map(shader->bo);
+        memcpy(shader->bo->map, qpu_insts, shader_size);
+
+        free(qpu_insts);
+
+        /* The caller's key may not outlive this call, so the cache entry is
+         * keyed by a copy that is a ralloc child of the compiled shader.
+         * The copy is a v3d_key (the type this function receives), sized
+         * for the full per-stage key.
+         */
+        struct v3d_key *dup_key;
+        dup_key = ralloc_size(shader, key_size);
+        memcpy(dup_key, key, key_size);
+        _mesa_hash_table_insert(ht, dup_key, shader);
+
+        if (shader->prog_data.base->spill_size >
+            vc5->prog.spill_size_per_thread) {
+                /* Max 4 QPUs per slice, 3 slices per core. We only do single
+                 * core so far.  This overallocates memory on smaller cores.
+                 */
+                int total_spill_size =
+                        4 * 3 * shader->prog_data.base->spill_size;
+
+                /* Replace the old spill BO with one sized for this variant. */
+                vc5_bo_unreference(&vc5->prog.spill_bo);
+                vc5->prog.spill_bo = vc5_bo_alloc(vc5->screen,
+                                                  total_spill_size, "spill");
+                vc5->prog.spill_size_per_thread =
+                        shader->prog_data.base->spill_size;
+        }
+
+        return shader;
+}
+
+/**
+ * Fills in the parts of a shader key that are common to all stages: the
+ * per-texture return size/channels, swizzle and sampler settings for every
+ * bound sampler view in texstate, plus the user clip plane enables from the
+ * rasterizer state.
+ *
+ * Texture units with no sampler view bound are skipped, leaving their key
+ * entries as the caller initialized them.
+ */
+static void
+vc5_setup_shared_key(struct vc5_context *vc5, struct v3d_key *key,
+                     struct vc5_texture_stateobj *texstate)
+{
+        const struct v3d_device_info *devinfo = &vc5->screen->devinfo;
+
+        for (int i = 0; i < texstate->num_textures; i++) {
+                struct pipe_sampler_view *sampler = texstate->textures[i];
+                struct vc5_sampler_view *vc5_sampler = vc5_sampler_view(sampler);
+                struct pipe_sampler_state *sampler_state =
+                        texstate->samplers[i];
+
+                if (!sampler)
+                        continue;
+
+                /* NOTE(review): sampler_state is dereferenced below without
+                 * a NULL check -- presumably every bound view has a sampler
+                 * state bound as well; confirm.
+                 */
+                key->tex[i].return_size =
+                        vc5_get_tex_return_size(devinfo,
+                                                sampler->format,
+                                                sampler_state->compare_mode);
+
+                /* For 16-bit, we set up the sampler to always return 2
+                 * channels (meaning no recompiles for most statechanges),
+                 * while for 32 we actually scale the returns with channels.
+                 */
+                if (key->tex[i].return_size == 16) {
+                        key->tex[i].return_channels = 2;
+                } else if (devinfo->ver > 40) {
+                        key->tex[i].return_channels = 4;
+                } else {
+                        key->tex[i].return_channels =
+                                vc5_get_tex_return_channels(devinfo,
+                                                            sampler->format);
+                }
+
+                if (key->tex[i].return_size == 32 && devinfo->ver < 40) {
+                        memcpy(key->tex[i].swizzle,
+                               vc5_sampler->swizzle,
+                               sizeof(vc5_sampler->swizzle));
+                } else {
+                        /* In every other case (16-bit returns, or any
+                         * return size on ver >= 40) the key carries an
+                         * identity swizzle; for 16-bit returns, we let the
+                         * sampler state handle the swizzle.
+                         */
+                        key->tex[i].swizzle[0] = PIPE_SWIZZLE_X;
+                        key->tex[i].swizzle[1] = PIPE_SWIZZLE_Y;
+                        key->tex[i].swizzle[2] = PIPE_SWIZZLE_Z;
+                        key->tex[i].swizzle[3] = PIPE_SWIZZLE_W;
+                }
+
+                /* sampler is known to be non-NULL here (NULL views were
+                 * skipped above), so the sampler settings are copied
+                 * unconditionally.
+                 */
+                key->tex[i].compare_mode = sampler_state->compare_mode;
+                key->tex[i].compare_func = sampler_state->compare_func;
+                key->tex[i].clamp_s =
+                        sampler_state->wrap_s == PIPE_TEX_WRAP_CLAMP;
+                key->tex[i].clamp_t =
+                        sampler_state->wrap_t == PIPE_TEX_WRAP_CLAMP;
+                key->tex[i].clamp_r =
+                        sampler_state->wrap_r == PIPE_TEX_WRAP_CLAMP;
+        }
+
+        key->ucp_enables = vc5->rasterizer->base.clip_plane_enable;
+}
+
+/**
+ * Builds the fragment shader key from the current context state and selects
+ * the compiled FS variant matching it.
+ *
+ * Does nothing unless at least one of the dirty bits feeding the key is set.
+ * When the selected variant differs from the previous vc5->prog.fs,
+ * VC5_DIRTY_COMPILED_FS is raised, plus the flat-shade, centroid and
+ * FS-input dirty bits when the corresponding program data changed relative
+ * to the previous variant.
+ *
+ * \param vc5 context whose state is read and whose prog.fs and dirty flags
+ *            are updated.
+ * \param prim_mode PIPE_PRIM_* value of the primitive being drawn.
+ */
+static void
+vc5_update_compiled_fs(struct vc5_context *vc5, uint8_t prim_mode)
+{
+        struct vc5_job *job = vc5->job;
+        struct v3d_fs_key local_key;
+        struct v3d_fs_key *key = &local_key;
+
+        /* Skip the key rebuild when no state contributing to it changed. */
+        if (!(vc5->dirty & (VC5_DIRTY_PRIM_MODE |
+                            VC5_DIRTY_BLEND |
+                            VC5_DIRTY_FRAMEBUFFER |
+                            VC5_DIRTY_ZSA |
+                            VC5_DIRTY_RASTERIZER |
+                            VC5_DIRTY_SAMPLE_MASK |
+                            VC5_DIRTY_FRAGTEX |
+                            VC5_DIRTY_UNCOMPILED_FS))) {
+                return;
+        }
+
+        /* Start from an all-zero key so unset fields have a defined value.
+         * NOTE(review): the FS cache callbacks in this file hash and memcmp
+         * whole keys, so presumably this also keeps padding deterministic.
+         */
+        memset(key, 0, sizeof(*key));
+        vc5_setup_shared_key(vc5, &key->base, &vc5->fragtex);
+        key->base.shader_state = vc5->prog.bind_fs;
+        key->is_points = (prim_mode == PIPE_PRIM_POINTS);
+        /* Any primitive type in the LINES..LINE_STRIP enum range is a line. */
+        key->is_lines = (prim_mode >= PIPE_PRIM_LINES &&
+                         prim_mode <= PIPE_PRIM_LINE_STRIP);
+        key->clamp_color = vc5->rasterizer->base.clamp_fragment_color;
+        /* With logic ops disabled the key uses COPY as the logic op. */
+        if (vc5->blend->logicop_enable) {
+                key->logicop_func = vc5->blend->logicop_func;
+        } else {
+                key->logicop_func = PIPE_LOGICOP_COPY;
+        }
+        /* The MSAA-related key fields stay zero for non-MSAA jobs. */
+        if (job->msaa) {
+                key->msaa = vc5->rasterizer->base.multisample;
+                /* Coverage masking is only keyed when some sample of the
+                 * full VC5_MAX_SAMPLES-bit mask is disabled.
+                 */
+                key->sample_coverage = (vc5->rasterizer->base.multisample &&
+                                        vc5->sample_mask != (1 << VC5_MAX_SAMPLES) - 1);
+                key->sample_alpha_to_coverage = vc5->blend->alpha_to_coverage;
+                key->sample_alpha_to_one = vc5->blend->alpha_to_one;
+        }
+
+        /* Enabled front-face stencil also counts as "depth enabled". */
+        key->depth_enabled = (vc5->zsa->base.depth.enabled ||
+                              vc5->zsa->base.stencil[0].enabled);
+        if (vc5->zsa->base.alpha.enabled) {
+                key->alpha_test = true;
+                key->alpha_test_func = vc5->zsa->base.alpha.func;
+        }
+
+        /* gl_FragColor's propagation to however many bound color buffers
+         * there are means that the buffer count needs to be in the key.
+         */
+        key->nr_cbufs = vc5->framebuffer.nr_cbufs;
+        key->swap_color_rb = vc5->swap_color_rb;
+
+        /* Build per-color-buffer bitmasks (bit i describes cbuf i). */
+        for (int i = 0; i < key->nr_cbufs; i++) {
+                struct pipe_surface *cbuf = vc5->framebuffer.cbufs[i];
+                /* Unbound slots contribute no bits. */
+                if (!cbuf)
+                        continue;
+
+                const struct util_format_description *desc =
+                        util_format_description(cbuf->format);
+
+                /* Only the first channel is inspected to detect 32-bit
+                 * float render targets.
+                 */
+                if (desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
+                    desc->channel[0].size == 32) {
+                        key->f32_color_rb |= 1 << i;
+                }
+
+                /* The pure-integer masks are only filled in for shaders
+                 * flagged as having come from TGSI.
+                 */
+                if (vc5->prog.bind_fs->was_tgsi) {
+                        if (util_format_is_pure_uint(cbuf->format))
+                                key->uint_color_rb |= 1 << i;
+                        else if (util_format_is_pure_sint(cbuf->format))
+                                key->int_color_rb |= 1 << i;
+                }
+        }
+
+        /* Point sprite state only enters the key when drawing points. */
+        if (key->is_points) {
+                key->point_sprite_mask =
+                        vc5->rasterizer->base.sprite_coord_enable;
+                key->point_coord_upper_left =
+                        (vc5->rasterizer->base.sprite_coord_mode ==
+                         PIPE_SPRITE_COORD_UPPER_LEFT);
+        }
+
+        key->light_twoside = vc5->rasterizer->base.light_twoside;
+        key->shade_model_flat = vc5->rasterizer->base.flatshade;
+
+        /* Look up the variant for this key; nothing else to do if it is the
+         * one already in use.
+         */
+        struct vc5_compiled_shader *old_fs = vc5->prog.fs;
+        vc5->prog.fs = vc5_get_compiled_shader(vc5, &key->base);
+        if (vc5->prog.fs == old_fs)
+                return;
+
+        vc5->dirty |= VC5_DIRTY_COMPILED_FS;
+
+        /* With no previous variant there is nothing to compare against, so
+         * only VC5_DIRTY_COMPILED_FS is flagged in that case.
+         */
+        if (old_fs) {
+                if (vc5->prog.fs->prog_data.fs->flat_shade_flags !=
+                    old_fs->prog_data.fs->flat_shade_flags) {
+                        vc5->dirty |= VC5_DIRTY_FLAT_SHADE_FLAGS;
+                }
+
+                if (vc5->prog.fs->prog_data.fs->centroid_flags !=
+                    old_fs->prog_data.fs->centroid_flags) {
+                        vc5->dirty |= VC5_DIRTY_CENTROID_FLAGS;
+                }
+        }
+
+        /* A change in the FS input slots is flagged so the VS update, which
+         * checks VC5_DIRTY_FS_INPUTS, rebuilds its key.
+         */
+        if (old_fs && memcmp(vc5->prog.fs->prog_data.fs->input_slots,
+                             old_fs->prog_data.fs->input_slots,
+                             sizeof(vc5->prog.fs->prog_data.fs->input_slots))) {
+                vc5->dirty |= VC5_DIRTY_FS_INPUTS;
+        }
+}
+
+/**
+ * Builds the vertex shader key and selects both the vertex shader variant
+ * and the coordinate shader variant for the bound VS.
+ *
+ * The key embeds the current fragment shader's input slots, so vc5->prog.fs
+ * is dereferenced unconditionally and must already be valid.  The same key
+ * object is reused for the second lookup after being switched to coordinate
+ * shader mode, so the order of the statements below matters.
+ *
+ * \param vc5 context whose state is read and whose prog.vs/prog.cs and dirty
+ *            flags are updated.
+ * \param prim_mode PIPE_PRIM_* value of the primitive being drawn.
+ */
+static void
+vc5_update_compiled_vs(struct vc5_context *vc5, uint8_t prim_mode)
+{
+        struct v3d_vs_key local_key;
+        struct v3d_vs_key *key = &local_key;
+
+        /* Skip the key rebuild when no state contributing to it changed. */
+        if (!(vc5->dirty & (VC5_DIRTY_PRIM_MODE |
+                            VC5_DIRTY_RASTERIZER |
+                            VC5_DIRTY_VERTTEX |
+                            VC5_DIRTY_VTXSTATE |
+                            VC5_DIRTY_UNCOMPILED_VS |
+                            VC5_DIRTY_FS_INPUTS))) {
+                return;
+        }
+
+        /* Start from an all-zero key so unset fields have a defined value. */
+        memset(key, 0, sizeof(*key));
+        vc5_setup_shared_key(vc5, &key->base, &vc5->verttex);
+        key->base.shader_state = vc5->prog.bind_vs;
+        /* The VS variant is keyed on the inputs the current FS consumes. */
+        key->num_fs_inputs = vc5->prog.fs->prog_data.fs->base.num_inputs;
+        /* The memcpy below copies sizeof(key->fs_inputs) bytes, so both
+         * arrays must be the same size.
+         */
+        STATIC_ASSERT(sizeof(key->fs_inputs) ==
+                      sizeof(vc5->prog.fs->prog_data.fs->input_slots));
+        memcpy(key->fs_inputs, vc5->prog.fs->prog_data.fs->input_slots,
+               sizeof(key->fs_inputs));
+        key->clamp_color = vc5->rasterizer->base.clamp_vertex_color;
+
+        /* Per-vertex point size is only keyed when actually drawing points. */
+        key->per_vertex_point_size =
+                (prim_mode == PIPE_PRIM_POINTS &&
+                 vc5->rasterizer->base.point_size_per_vertex);
+
+        /* First lookup: the regular vertex shader variant. */
+        struct vc5_compiled_shader *vs =
+                vc5_get_compiled_shader(vc5, &key->base);
+        if (vs != vc5->prog.vs) {
+                vc5->prog.vs = vs;
+                vc5->dirty |= VC5_DIRTY_COMPILED_VS;
+        }
+
+        /* Second lookup: turn the same key into a coordinate shader key. */
+        key->is_coord = true;
+        /* Coord shaders only output varyings used by transform feedback. */
+        struct vc5_uncompiled_shader *shader_state = key->base.shader_state;
+        memcpy(key->fs_inputs, shader_state->tf_outputs,
+               sizeof(*key->fs_inputs) * shader_state->num_tf_outputs);
+        /* Clear the FS input slots left over past the TF outputs so stale
+         * entries from the VS key do not end up in the coord shader key.
+         */
+        if (shader_state->num_tf_outputs < key->num_fs_inputs) {
+                memset(&key->fs_inputs[shader_state->num_tf_outputs],
+                       0,
+                       sizeof(*key->fs_inputs) * (key->num_fs_inputs -
+                                                  shader_state->num_tf_outputs));
+        }
+        key->num_fs_inputs = shader_state->num_tf_outputs;
+
+        struct vc5_compiled_shader *cs =
+                vc5_get_compiled_shader(vc5, &key->base);
+        if (cs != vc5->prog.cs) {
+                vc5->prog.cs = cs;
+                vc5->dirty |= VC5_DIRTY_COMPILED_CS;
+        }
+}
+
+/**
+ * Refreshes the compiled fragment, vertex and coordinate shader variants
+ * for the current state and primitive type.
+ *
+ * The fragment shader is updated first: the vertex shader update reads
+ * vc5->prog.fs and reacts to the VC5_DIRTY_FS_INPUTS bit the FS update may
+ * set.
+ */
+void
+vc5_update_compiled_shaders(struct vc5_context *vc5, uint8_t prim_mode)
+{
+        vc5_update_compiled_fs(vc5, prim_mode);
+        vc5_update_compiled_vs(vc5, prim_mode);
+}
+
+/** Hash callback for the FS cache: hashes the raw bytes of a v3d_fs_key. */
+static uint32_t
+fs_cache_hash(const void *key)
+{
+        const size_t key_size = sizeof(struct v3d_fs_key);
+
+        return _mesa_hash_data(key, key_size);
+}
+
+/** Hash callback for the VS cache: hashes the raw bytes of a v3d_vs_key. */
+static uint32_t
+vs_cache_hash(const void *key)
+{
+        const size_t key_size = sizeof(struct v3d_vs_key);
+
+        return _mesa_hash_data(key, key_size);
+}
+
+/** Equality callback for the FS cache: bytewise v3d_fs_key comparison. */
+static bool
+fs_cache_compare(const void *key1, const void *key2)
+{
+        const int diff = memcmp(key1, key2, sizeof(struct v3d_fs_key));
+
+        return !diff;
+}
+
+/** Equality callback for the VS cache: bytewise v3d_vs_key comparison. */
+static bool
+vs_cache_compare(const void *key1, const void *key2)
+{
+        const int diff = memcmp(key1, key2, sizeof(struct v3d_vs_key));
+
+        return !diff;
+}
+
+/**
+ * Removes and frees one cache entry if it was compiled from the given
+ * uncompiled shader; entries belonging to other shaders are left alone.
+ *
+ * \param ht cache the entry lives in.
+ * \param last_compile pointer to the "currently selected" compiled shader;
+ *                     reset to NULL when it is the one being freed.
+ * \param entry cache entry to examine.
+ * \param so uncompiled shader being deleted.
+ */
+static void
+delete_from_cache_if_matches(struct hash_table *ht,
+                             struct vc5_compiled_shader **last_compile,
+                             struct hash_entry *entry,
+                             struct vc5_uncompiled_shader *so)
+{
+        const struct v3d_key *entry_key = entry->key;
+
+        /* Variant of some other shader: nothing to do. */
+        if (entry_key->shader_state != so)
+                return;
+
+        struct vc5_compiled_shader *variant = entry->data;
+
+        _mesa_hash_table_remove(ht, entry);
+        vc5_bo_unreference(&variant->bo);
+
+        /* Don't leave the caller pointing at the variant freed below. */
+        if (*last_compile == variant)
+                *last_compile = NULL;
+
+        ralloc_free(variant);
+}
+
+/**
+ * pipe_context delete_{vs,fs}_state hook: frees every compiled variant of
+ * the given uncompiled shader, then the uncompiled shader itself.
+ *
+ * Any of vc5->prog.fs, vc5->prog.vs or vc5->prog.cs that pointed at a freed
+ * variant is reset to NULL so that no dangling pointer is left behind.
+ *
+ * \param pctx context owning the shader caches.
+ * \param hwcso the struct vc5_uncompiled_shader to delete.
+ */
+static void
+vc5_shader_state_delete(struct pipe_context *pctx, void *hwcso)
+{
+        struct vc5_context *vc5 = vc5_context(pctx);
+        struct vc5_uncompiled_shader *so = hwcso;
+
+        struct hash_entry *entry;
+        hash_table_foreach(vc5->fs_cache, entry) {
+                delete_from_cache_if_matches(vc5->fs_cache, &vc5->prog.fs,
+                                             entry, so);
+        }
+        hash_table_foreach(vc5->vs_cache, entry) {
+                /* Coordinate shader variants are built from a v3d_vs_key as
+                 * well (presumably sharing vs_cache), so the entry being
+                 * removed may be the current coord shader rather than the
+                 * current vertex shader.  Clear whichever pointer refers to
+                 * it.
+                 */
+                struct vc5_compiled_shader **last_compile =
+                        (entry->data == vc5->prog.cs) ? &vc5->prog.cs
+                                                      : &vc5->prog.vs;
+
+                delete_from_cache_if_matches(vc5->vs_cache, last_compile,
+                                             entry, so);
+        }
+
+        ralloc_free(so->base.ir.nir);
+        free(so);
+}
+
+/**
+ * pipe_context bind_fs_state hook: records the new uncompiled fragment
+ * shader and flags it so the compiled variant is re-selected later.
+ */
+static void
+vc5_fp_state_bind(struct pipe_context *pctx, void *hwcso)
+{
+        struct vc5_context *ctx = vc5_context(pctx);
+
+        ctx->dirty |= VC5_DIRTY_UNCOMPILED_FS;
+        ctx->prog.bind_fs = hwcso;
+}
+
+/**
+ * pipe_context bind_vs_state hook: records the new uncompiled vertex shader
+ * and flags it so the compiled variants are re-selected later.
+ */
+static void
+vc5_vp_state_bind(struct pipe_context *pctx, void *hwcso)
+{
+        struct vc5_context *ctx = vc5_context(pctx);
+
+        ctx->dirty |= VC5_DIRTY_UNCOMPILED_VS;
+        ctx->prog.bind_vs = hwcso;
+}
+
+/**
+ * Installs the shader state hooks on the context and creates the FS and VS
+ * compiled-shader caches, both allocated with pctx as their parent.
+ */
+void
+vc5_program_init(struct pipe_context *pctx)
+{
+        struct vc5_context *ctx = vc5_context(pctx);
+
+        /* Vertex shader CSO hooks. */
+        pctx->create_vs_state = vc5_shader_state_create;
+        pctx->bind_vs_state = vc5_vp_state_bind;
+        pctx->delete_vs_state = vc5_shader_state_delete;
+
+        /* Fragment shader CSO hooks. */
+        pctx->create_fs_state = vc5_shader_state_create;
+        pctx->bind_fs_state = vc5_fp_state_bind;
+        pctx->delete_fs_state = vc5_shader_state_delete;
+
+        /* Caches mapping shader keys to compiled variants. */
+        ctx->fs_cache = _mesa_hash_table_create(pctx, fs_cache_hash,
+                                                fs_cache_compare);
+        ctx->vs_cache = _mesa_hash_table_create(pctx, vs_cache_hash,
+                                                vs_cache_compare);
+}
+
+/**
+ * Frees every compiled shader held in a cache (its BO reference and the
+ * shader itself) and removes the corresponding entries from the table.
+ */
+static void
+vc5_free_cached_shaders(struct hash_table *cache)
+{
+        struct hash_entry *entry;
+
+        hash_table_foreach(cache, entry) {
+                struct vc5_compiled_shader *variant = entry->data;
+
+                vc5_bo_unreference(&variant->bo);
+                ralloc_free(variant);
+                _mesa_hash_table_remove(cache, entry);
+        }
+}
+
+/**
+ * Releases all compiled shader variants in the context's FS and VS caches.
+ */
+void
+vc5_program_fini(struct pipe_context *pctx)
+{
+        struct vc5_context *ctx = vc5_context(pctx);
+
+        vc5_free_cached_shaders(ctx->fs_cache);
+        vc5_free_cached_shaders(ctx->vs_cache);
+}
diff --git a/src/gallium/drivers/v3d/v3d_query.c b/src/gallium/drivers/v3d/v3d_query.c
new file mode 100644
index 00000000000..f645544bedf
--- /dev/null
+++ b/src/gallium/drivers/v3d/v3d_query.c
@@ -0,0 +1,180 @@
+/*
+ * Copyright © 2014 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * Gallium query object support.
+ *
+ * The HW has native support for occlusion queries, with the query result
+ * being loaded and stored by the TLB unit. From a SW perspective, we have to
+ * be careful to make sure that the jobs that need to be tracking queries are
+ * bracketed by the start and end of counting, even across FBO transitions.
+ *
+ * For the transform feedback PRIMITIVES_GENERATED/WRITTEN queries, we have to
+ * do the calculations in software at draw time.
+ */
+
+#include "v3d_context.h"
+#include "broadcom/cle/v3d_packet_v33_pack.h"
+
+/** Driver-private query object handed out to Gallium as a pipe_query. */
+struct vc5_query
+{
+        /* Query type, as passed to the create_query hook. */
+        enum pipe_query_type type;
+        /* Counter BO allocated at begin time for queries that are not
+         * software primitive counters; NULL otherwise and after the result
+         * has been read back.
+         */
+        struct vc5_bo *bo;
+
+        /* Snapshots of the context's primitive counter taken at begin and
+         * end time for the PRIMITIVES_GENERATED/EMITTED queries.
+         */
+        uint32_t start, end;
+};
+
+/**
+ * pipe_context create_query hook: allocates a zero-initialized query object
+ * of the requested type.
+ *
+ * \param pctx unused.
+ * \param query_type PIPE_QUERY_* type stored in the query.
+ * \param index unused.
+ * \return the new query, or NULL if the allocation failed.
+ */
+static struct pipe_query *
+vc5_create_query(struct pipe_context *pctx, unsigned query_type, unsigned index)
+{
+        struct vc5_query *q = calloc(1, sizeof(*q));
+
+        /* Report out-of-memory to the caller instead of dereferencing NULL. */
+        if (!q)
+                return NULL;
+
+        q->type = query_type;
+
+        /* Note that struct pipe_query isn't actually defined anywhere. */
+        return (struct pipe_query *)q;
+}
+
+/**
+ * pipe_context destroy_query hook: drops the query's reference on its
+ * counter BO and frees the query object.
+ */
+static void
+vc5_destroy_query(struct pipe_context *pctx, struct pipe_query *query)
+{
+        struct vc5_query *vq = (struct vc5_query *)query;
+
+        vc5_bo_unreference(&vq->bo);
+
+        free(vq);
+}
+
+/**
+ * pipe_context begin_query hook.
+ *
+ * The primitive-count queries snapshot the context's software counters.
+ * Every other query type gets a fresh zeroed counter BO, which becomes the
+ * context's current occlusion query BO.
+ *
+ * \return true on success, false if the counter BO could not be allocated
+ *         or mapped.
+ */
+static boolean
+vc5_begin_query(struct pipe_context *pctx, struct pipe_query *query)
+{
+        struct vc5_context *vc5 = vc5_context(pctx);
+        struct vc5_query *q = (struct vc5_query *)query;
+
+        switch (q->type) {
+        case PIPE_QUERY_PRIMITIVES_GENERATED:
+                q->start = vc5->prims_generated;
+                break;
+        case PIPE_QUERY_PRIMITIVES_EMITTED:
+                q->start = vc5->tf_prims_generated;
+                break;
+        default: {
+                struct vc5_bo *bo = vc5_bo_alloc(vc5->screen, 4096, "query");
+                if (!bo)
+                        return false;
+
+                uint32_t *map = vc5_bo_map(bo);
+                if (!map) {
+                        vc5_bo_unreference(&bo);
+                        return false;
+                }
+                *map = 0;
+
+                /* Release a BO left over from an earlier begin whose result
+                 * was never read back, so restarting a query does not leak
+                 * it.  Done only once the replacement is known to be good.
+                 */
+                vc5_bo_unreference(&q->bo);
+                q->bo = bo;
+
+                vc5->current_oq = q->bo;
+                vc5->dirty |= VC5_DIRTY_OQ;
+                break;
+        }
+        }
+
+        return true;
+}
+
+/**
+ * pipe_context end_query hook.
+ *
+ * The primitive-count queries snapshot the context's software counters;
+ * every other query type stops being the context's current occlusion query.
+ * Always returns true.
+ */
+static bool
+vc5_end_query(struct pipe_context *pctx, struct pipe_query *query)
+{
+        struct vc5_context *ctx = vc5_context(pctx);
+        struct vc5_query *vq = (struct vc5_query *)query;
+
+        if (vq->type == PIPE_QUERY_PRIMITIVES_GENERATED) {
+                vq->end = ctx->prims_generated;
+        } else if (vq->type == PIPE_QUERY_PRIMITIVES_EMITTED) {
+                vq->end = ctx->tf_prims_generated;
+        } else {
+                ctx->current_oq = NULL;
+                ctx->dirty |= VC5_DIRTY_OQ;
+        }
+
+        return true;
+}
+
+/**
+ * pipe_context get_query_result hook.
+ *
+ * For queries backed by a counter BO, flushes the context, waits for the BO
+ * and reads the 32-bit counter from it; the BO reference is then dropped.
+ * The primitive-count queries report the difference between the end and
+ * begin snapshots.
+ *
+ * \param wait when true, block until the result is available; when false,
+ *             only poll and return false if the BO is still busy.
+ * \param vresult receives the result in the member matching the query type.
+ * \return true if vresult was written, false if the result is not ready.
+ */
+static boolean
+vc5_get_query_result(struct pipe_context *pctx, struct pipe_query *query,
+                     boolean wait, union pipe_query_result *vresult)
+{
+        struct vc5_query *q = (struct vc5_query *)query;
+        uint32_t result = 0;
+
+        if (q->bo) {
+                /* XXX: Only flush the jobs using this BO. */
+                vc5_flush(pctx);
+
+                /* A blocking request waits with an infinite timeout, a
+                 * non-blocking one polls with a zero timeout.
+                 */
+                if (!vc5_bo_wait(q->bo, wait ? ~0ull : 0, "query"))
+                        return false;
+
+                /* XXX: Sum up per-core values. */
+                uint32_t *map = vc5_bo_map(q->bo);
+                result = *map;
+
+                vc5_bo_unreference(&q->bo);
+        }
+
+        switch (q->type) {
+        case PIPE_QUERY_OCCLUSION_COUNTER:
+                vresult->u64 = result;
+                break;
+        case PIPE_QUERY_OCCLUSION_PREDICATE:
+        case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+                vresult->b = result != 0;
+                break;
+        case PIPE_QUERY_PRIMITIVES_GENERATED:
+        case PIPE_QUERY_PRIMITIVES_EMITTED:
+                vresult->u64 = q->end - q->start;
+                break;
+        default:
+                unreachable("unsupported query type");
+        }
+
+        return true;
+}
+
+/**
+ * pipe_context set_active_query_state hook: records whether queries are
+ * active and marks the occlusion query and streamout state dirty.
+ */
+static void
+vc5_set_active_query_state(struct pipe_context *pctx, boolean enable)
+{
+        struct vc5_context *ctx = vc5_context(pctx);
+
+        ctx->active_queries = enable;
+        ctx->dirty |= VC5_DIRTY_OQ | VC5_DIRTY_STREAMOUT;
+}
+
+/** Installs the query hooks implemented in this file on the context. */
+void
+vc5_query_init(struct pipe_context *pctx)
+{
+        /* Object lifetime. */
+        pctx->create_query = vc5_create_query;
+        pctx->destroy_query = vc5_destroy_query;
+
+        /* Counting and result readback. */
+        pctx->begin_query = vc5_begin_query;
+        pctx->end_query = vc5_end_query;
+        pctx->get_query_result = vc5_get_query_result;
+
+        pctx->set_active_query_state = vc5_set_active_query_state;
+}
+
diff --git a/src/gallium/drivers/v3d/v3d_resource.c b/src/gallium/drivers/v3d/v3d_resource.c
new file mode 100644
index 00000000000..1cd3f1949a2
--- /dev/null
+++ b/src/gallium/drivers/v3d/v3d_resource.c
@@ -0,0 +1,914 @@
+/*
+ * Copyright © 2014-2017 Broadcom
+ * Copyright (C) 2012 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "pipe/p_defines.h"
+#include "util/u_blit.h"
+#include "util/u_memory.h"
+#include "util/u_format.h"
+#include "util/u_inlines.h"
+#include "util/u_surface.h"
+#include "util/u_transfer_helper.h"
+#include "util/u_upload_mgr.h"
+#include "util/u_format_zs.h"
+
+#include "drm_fourcc.h"
+#include "v3d_screen.h"
+#include "v3d_context.h"
+#include "v3d_resource.h"
+#include "v3d_tiling.h"
+#include "broadcom/cle/v3d_packet_v33_pack.h"
+
+/**
+ * Prints a resource's memory layout to stderr when the V3D_DEBUG_SURFACE
+ * debug flag is set; does nothing otherwise.
+ *
+ * Buffers get a single line with their BO address range.  Other resources
+ * get one line per miplevel with the tiling mode, the minified and the
+ * padded dimensions, the stride and the level's address.
+ *
+ * \param rsc resource to describe; its BO is dereferenced.
+ * \param caller short tag printed in each line to identify the call site.
+ */
+static void
+vc5_debug_resource_layout(struct vc5_resource *rsc, const char *caller)
+{
+        if (!(V3D_DEBUG & V3D_DEBUG_SURFACE))
+                return;
+
+        struct pipe_resource *prsc = &rsc->base;
+
+        /* Buffers have no slices: print the BO's inclusive address range. */
+        if (prsc->target == PIPE_BUFFER) {
+                fprintf(stderr,
+                        "rsc %s %p (format %s), %dx%d buffer @0x%08x-0x%08x\n",
+                        caller, rsc,
+                        util_format_short_name(prsc->format),
+                        prsc->width0, prsc->height0,
+                        rsc->bo->offset,
+                        rsc->bo->offset + rsc->bo->size - 1);
+                return;
+        }
+
+        /* Short tags for each tiling mode, indexed by slice->tiling. */
+        static const char *const tiling_descriptions[] = {
+                [VC5_TILING_RASTER] = "R",
+                [VC5_TILING_LINEARTILE] = "LT",
+                [VC5_TILING_UBLINEAR_1_COLUMN] = "UB1",
+                [VC5_TILING_UBLINEAR_2_COLUMN] = "UB2",
+                [VC5_TILING_UIF_NO_XOR] = "UIF",
+                [VC5_TILING_UIF_XOR] = "UIF^",
+        };
+
+        for (int i = 0; i <= prsc->last_level; i++) {
+                struct vc5_resource_slice *slice = &rsc->slices[i];
+
+                /* Padded dimensions: the width is derived from the stride in
+                 * bytes divided by the bytes-per-pixel value in rsc->cpp.
+                 */
+                int level_width = slice->stride / rsc->cpp;
+                int level_height = slice->padded_height;
+                int level_depth =
+                        u_minify(util_next_power_of_two(prsc->depth0), i);
+
+                /* Printed as "minified size -> padded size". */
+                fprintf(stderr,
+                        "rsc %s %p (format %s), %dx%d: "
+                        "level %d (%s) %dx%dx%d -> %dx%dx%d, stride %d@0x%08x\n",
+                        caller, rsc,
+                        util_format_short_name(prsc->format),
+                        prsc->width0, prsc->height0,
+                        i, tiling_descriptions[slice->tiling],
+                        u_minify(prsc->width0, i),
+                        u_minify(prsc->height0, i),
+                        u_minify(prsc->depth0, i),
+                        level_width,
+                        level_height,
+                        level_depth,
+                        slice->stride,
+                        rsc->bo->offset + slice->offset);
+        }
+}
+
+/**
+ * Allocates a new BO of rsc->size bytes for the resource and swaps it in,
+ * dropping the reference on the previous BO.
+ *
+ * \return true on success; false if allocation failed, in which case the
+ *         resource keeps its current BO.
+ */
+static bool
+vc5_resource_bo_alloc(struct vc5_resource *rsc)
+{
+        struct pipe_screen *pscreen = rsc->base.screen;
+        struct vc5_bo *new_bo =
+                vc5_bo_alloc(vc5_screen(pscreen), rsc->size, "resource");
+
+        if (!new_bo)
+                return false;
+
+        vc5_bo_unreference(&rsc->bo);
+        rsc->bo = new_bo;
+        vc5_debug_resource_layout(rsc, "alloc");
+
+        return true;
+}
+
+/**
+ * pipe_context transfer_unmap hook.
+ *
+ * When the transfer used a staging buffer (trans->map), write transfers
+ * store each layer of the staging data back into the BO in tiled form, and
+ * the staging buffer is freed.  The transfer's resource reference and the
+ * transfer object itself are then released.
+ */
+static void
+vc5_resource_transfer_unmap(struct pipe_context *pctx,
+                            struct pipe_transfer *ptrans)
+{
+        struct vc5_context *vc5 = vc5_context(pctx);
+        struct vc5_transfer *trans = vc5_transfer(ptrans);
+
+        if (trans->map) {
+                struct vc5_resource *rsc = vc5_resource(ptrans->resource);
+                struct vc5_resource_slice *slice = &rsc->slices[ptrans->level];
+                const bool write_back =
+                        (ptrans->usage & PIPE_TRANSFER_WRITE) != 0;
+
+                /* One iteration per layer of the mapped box. */
+                for (int z = 0; write_back && z < ptrans->box.depth; z++) {
+                        void *staging = trans->map +
+                                ptrans->stride * ptrans->box.height * z;
+                        void *dst = rsc->bo->map +
+                                vc5_layer_offset(&rsc->base,
+                                                 ptrans->level,
+                                                 ptrans->box.z + z);
+
+                        vc5_store_tiled_image(dst, slice->stride,
+                                              staging, ptrans->stride,
+                                              slice->tiling, rsc->cpp,
+                                              slice->padded_height,
+                                              &ptrans->box);
+                }
+
+                free(trans->map);
+        }
+
+        pipe_resource_reference(&ptrans->resource, NULL);
+        slab_free(&vc5->transfer_pool, ptrans);
+}
+
+/**
+ * pipe_context transfer_map hook.
+ *
+ * Synchronizes with (or replaces the storage of) the resource according to
+ * the usage flags, then returns a CPU pointer for the requested box.
+ * Tiled resources are mapped through a malloc'ed linear staging buffer
+ * that is filled here for reads and written back at unmap time; linear
+ * resources are mapped directly into the BO.
+ *
+ * \param pctx context performing the map.
+ * \param prsc resource to map; must not be multisampled.
+ * \param level miplevel to map.
+ * \param usage PIPE_TRANSFER_* flags.
+ * \param box region to map, in pixels.
+ * \param pptrans receives the transfer object on success and is set to
+ *                NULL when mapping fails after the transfer was created.
+ * \return pointer to the mapped data, or NULL on failure.
+ */
+static void *
+vc5_resource_transfer_map(struct pipe_context *pctx,
+                          struct pipe_resource *prsc,
+                          unsigned level, unsigned usage,
+                          const struct pipe_box *box,
+                          struct pipe_transfer **pptrans)
+{
+        struct vc5_context *vc5 = vc5_context(pctx);
+        struct vc5_resource *rsc = vc5_resource(prsc);
+        struct vc5_transfer *trans;
+        struct pipe_transfer *ptrans;
+        enum pipe_format format = prsc->format;
+        char *buf;
+
+        /* MSAA maps should have been handled by u_transfer_helper. */
+        assert(prsc->nr_samples <= 1);
+
+        /* Upgrade DISCARD_RANGE to WHOLE_RESOURCE if the whole resource is
+         * being mapped.
+         */
+        if ((usage & PIPE_TRANSFER_DISCARD_RANGE) &&
+            !(usage & PIPE_TRANSFER_UNSYNCHRONIZED) &&
+            !(prsc->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT) &&
+            prsc->last_level == 0 &&
+            prsc->width0 == box->width &&
+            prsc->height0 == box->height &&
+            prsc->depth0 == box->depth &&
+            prsc->array_size == 1 &&
+            rsc->bo->private) {
+                usage |= PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE;
+        }
+
+        if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) {
+                if (vc5_resource_bo_alloc(rsc)) {
+                        /* If it might be bound as one of our vertex buffers
+                         * or UBOs, make sure we re-emit vertex buffer state
+                         * or uniforms.
+                         */
+                        if (prsc->bind & PIPE_BIND_VERTEX_BUFFER)
+                                vc5->dirty |= VC5_DIRTY_VTXBUF;
+                        if (prsc->bind & PIPE_BIND_CONSTANT_BUFFER)
+                                vc5->dirty |= VC5_DIRTY_CONSTBUF;
+                } else {
+                        /* If we failed to reallocate, flush users so that we
+                         * don't violate any syncing requirements.
+                         */
+                        vc5_flush_jobs_reading_resource(vc5, prsc);
+                }
+        } else if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
+                /* If we're writing and the buffer is being used by the CL, we
+                 * have to flush the CL first.  If we're only reading, we need
+                 * to flush if the CL has written our buffer.
+                 */
+                if (usage & PIPE_TRANSFER_WRITE)
+                        vc5_flush_jobs_reading_resource(vc5, prsc);
+                else
+                        vc5_flush_jobs_writing_resource(vc5, prsc);
+        }
+
+        if (usage & PIPE_TRANSFER_WRITE) {
+                rsc->writes++;
+                rsc->initialized_buffers = ~0;
+        }
+
+        trans = slab_alloc(&vc5->transfer_pool);
+        if (!trans)
+                return NULL;
+
+        /* XXX: Handle DONTBLOCK, DISCARD_RANGE, PERSISTENT, COHERENT. */
+
+        /* slab_alloc() doesn't zero: */
+        memset(trans, 0, sizeof(*trans));
+        ptrans = &trans->base;
+
+        pipe_resource_reference(&ptrans->resource, prsc);
+        ptrans->level = level;
+        ptrans->usage = usage;
+        ptrans->box = *box;
+
+        /* Note that the current kernel implementation is synchronous, so no
+         * need to do syncing stuff here yet.
+         */
+
+        if (usage & PIPE_TRANSFER_UNSYNCHRONIZED)
+                buf = vc5_bo_map_unsynchronized(rsc->bo);
+        else
+                buf = vc5_bo_map(rsc->bo);
+        if (!buf) {
+                fprintf(stderr, "Failed to map bo\n");
+                goto fail;
+        }
+
+        *pptrans = ptrans;
+
+        /* Our load/store routines work on entire compressed blocks. */
+        ptrans->box.x /= util_format_get_blockwidth(format);
+        ptrans->box.y /= util_format_get_blockheight(format);
+        ptrans->box.width = DIV_ROUND_UP(ptrans->box.width,
+                                         util_format_get_blockwidth(format));
+        ptrans->box.height = DIV_ROUND_UP(ptrans->box.height,
+                                          util_format_get_blockheight(format));
+
+        struct vc5_resource_slice *slice = &rsc->slices[level];
+        if (rsc->tiled) {
+                /* No direct mappings of tiled, since we need to manually
+                 * tile/untile.  Go through the failure path so the transfer
+                 * and its resource reference are released.
+                 */
+                if (usage & PIPE_TRANSFER_MAP_DIRECTLY)
+                        goto fail;
+
+                ptrans->stride = ptrans->box.width * rsc->cpp;
+                ptrans->layer_stride = ptrans->stride * ptrans->box.height;
+
+                /* Linear staging copy of the box, one layer after another. */
+                trans->map = malloc(ptrans->layer_stride * ptrans->box.depth);
+                if (!trans->map)
+                        goto fail;
+
+                if (usage & PIPE_TRANSFER_READ) {
+                        for (int z = 0; z < ptrans->box.depth; z++) {
+                                void *src = rsc->bo->map +
+                                        vc5_layer_offset(&rsc->base,
+                                                         ptrans->level,
+                                                         ptrans->box.z + z);
+                                vc5_load_tiled_image((trans->map +
+                                                      ptrans->stride *
+                                                      ptrans->box.height * z),
+                                                     ptrans->stride,
+                                                     src,
+                                                     slice->stride,
+                                                     slice->tiling, rsc->cpp,
+                                                     slice->padded_height,
+                                                     &ptrans->box);
+                        }
+                }
+                return trans->map;
+        } else {
+                ptrans->stride = slice->stride;
+                ptrans->layer_stride = ptrans->stride;
+
+                return buf + slice->offset +
+                        ptrans->box.y * ptrans->stride +
+                        ptrans->box.x * rsc->cpp +
+                        ptrans->box.z * rsc->cube_map_stride;
+        }
+
+
+fail:
+        /* trans->map is NULL on every path reaching here, so unmap only
+         * drops the resource reference and returns the transfer to the slab.
+         */
+        vc5_resource_transfer_unmap(pctx, ptrans);
+        /* Don't hand the caller a pointer to the transfer just freed. */
+        *pptrans = NULL;
+        return NULL;
+}
+
+/**
+ * pipe_screen resource_destroy hook: drops the resource's BO reference and
+ * frees the resource structure.
+ */
+static void
+vc5_resource_destroy(struct pipe_screen *pscreen,
+                     struct pipe_resource *prsc)
+{
+        struct vc5_resource *res = vc5_resource(prsc);
+
+        vc5_bo_unreference(&res->bo);
+
+        free(res);
+}
+
+/**
+ * pipe_screen resource_get_handle hook: exports the resource's BO as a
+ * flink name, KMS handle or dmabuf fd, depending on whandle->type.
+ *
+ * The level 0 stride is reported in whandle->stride, and the BO is marked
+ * as no longer private regardless of whether the export succeeds.
+ *
+ * \return TRUE if whandle->handle was filled in, FALSE on export failure or
+ *         for an unhandled handle type.
+ */
+static boolean
+vc5_resource_get_handle(struct pipe_screen *pscreen,
+                        struct pipe_context *pctx,
+                        struct pipe_resource *prsc,
+                        struct winsys_handle *whandle,
+                        unsigned usage)
+{
+        struct vc5_resource *rsc = vc5_resource(prsc);
+        struct vc5_bo *bo = rsc->bo;
+
+        /* If we're passing some reference to our BO out to some other part of
+         * the system, then we can't do any optimizations about only us being
+         * the ones seeing it (like BO caching).
+         */
+        bo->private = false;
+
+        whandle->stride = rsc->slices[0].stride;
+
+        if (whandle->type == DRM_API_HANDLE_TYPE_SHARED)
+                return vc5_bo_flink(bo, &whandle->handle);
+
+        if (whandle->type == DRM_API_HANDLE_TYPE_KMS) {
+                whandle->handle = bo->handle;
+                return TRUE;
+        }
+
+        if (whandle->type == DRM_API_HANDLE_TYPE_FD) {
+                whandle->handle = vc5_bo_get_dmabuf(bo);
+                return whandle->handle != -1;
+        }
+
+        return FALSE;
+}
+
+/* Number of UIF-block rows that make up one page. */
+#define PAGE_UB_ROWS (VC5_UIFCFG_PAGE_SIZE / VC5_UIFBLOCK_ROW_SIZE)
+/* One and a half pages, in UIF-block rows (integer division by two). */
+#define PAGE_UB_ROWS_TIMES_1_5 ((PAGE_UB_ROWS * 3) >> 1)
+/* Number of UIF-block rows that make up the page cache. */
+#define PAGE_CACHE_UB_ROWS (VC5_PAGE_CACHE_SIZE / VC5_UIFBLOCK_ROW_SIZE)
+/* Page cache size minus one and a half pages, in UIF-block rows. */
+#define PAGE_CACHE_MINUS_1_5_UB_ROWS (PAGE_CACHE_UB_ROWS - PAGE_UB_ROWS_TIMES_1_5)
+
+/**
+ * Computes the HW's UIFblock padding for a given height/cpp.
+ *
+ * The goal of the padding is to keep pages of the same color (bank number)
+ * at least half a page away from each other vertically when crossing
+ * between columns of UIF blocks.
+ *
+ * \param rsc resource whose cpp determines the UIF block height.
+ * \param height height to pad; it is converted to whole UIF-block rows.
+ * \return number of UIF-block rows of padding to add (possibly 0).
+ */
+static uint32_t
+vc5_get_ub_pad(struct vc5_resource *rsc, uint32_t height)
+{
+        /* A UIF block is two utiles tall. */
+        const uint32_t rows_ub = height / (vc5_utile_height(rsc->cpp) * 2);
+        /* Position of the last row within the page cache. */
+        const uint32_t pc_row = rows_ub % PAGE_CACHE_UB_ROWS;
+        uint32_t pad = 0;
+
+        if (pc_row == 0) {
+                /* Perfectly aligned for UIF-XOR: no padding needed. */
+                pad = 0;
+        } else if (pc_row < PAGE_UB_ROWS_TIMES_1_5) {
+                /* Pad up to an offset of one and a half pages, unless the
+                 * whole thing fits in the page cache anyway.
+                 */
+                if (rows_ub >= PAGE_CACHE_UB_ROWS)
+                        pad = PAGE_UB_ROWS_TIMES_1_5 - pc_row;
+        } else if (pc_row > PAGE_CACHE_MINUS_1_5_UB_ROWS) {
+                /* Nearly aligned to the page cache size: round up to it and
+                 * rely on XOR.
+                 */
+                pad = PAGE_CACHE_UB_ROWS - pc_row;
+        }
+        /* Otherwise we are far enough from both ends to need no padding. */
+
+        return pad;
+}
+
+/**
+ * Computes the memory layout of every miplevel of the resource.
+ *
+ * For each level this fills in the slice's tiling mode, offset, stride,
+ * padded height and per-pane size (and ub_pad for UIF levels), then sets
+ * rsc->size and rsc->cube_map_stride.  Levels are walked from last_level
+ * down to 0, so the smallest level is assigned the lowest offset.
+ */
+static void
+vc5_setup_slices(struct vc5_resource *rsc)
+{
+        struct pipe_resource *prsc = &rsc->base;
+        uint32_t width = prsc->width0;
+        uint32_t height = prsc->height0;
+        uint32_t depth = prsc->depth0;
+        /* Note that power-of-two padding is based on level 1.  These are not
+         * equivalent to just util_next_power_of_two(dimension), because at a
+         * level 0 dimension of 9, the level 1 power-of-two padded value is 4,
+         * not 8.
+         */
+        uint32_t pot_width = 2 * util_next_power_of_two(u_minify(width, 1));
+        uint32_t pot_height = 2 * util_next_power_of_two(u_minify(height, 1));
+        uint32_t pot_depth = 2 * util_next_power_of_two(u_minify(depth, 1));
+        uint32_t offset = 0;
+        uint32_t utile_w = vc5_utile_width(rsc->cpp);
+        uint32_t utile_h = vc5_utile_height(rsc->cpp);
+        /* A UIF block is 2x2 utiles. */
+        uint32_t uif_block_w = utile_w * 2;
+        uint32_t uif_block_h = utile_h * 2;
+        uint32_t block_width = util_format_get_blockwidth(prsc->format);
+        uint32_t block_height = util_format_get_blockheight(prsc->format);
+        bool msaa = prsc->nr_samples > 1;
+        /* MSAA textures/renderbuffers are always laid out as single-level
+         * UIF.
+         */
+        bool uif_top = msaa;
+
+        for (int i = prsc->last_level; i >= 0; i--) {
+                struct vc5_resource_slice *slice = &rsc->slices[i];
+
+                /* Width/height use the real minified size for levels 0 and
+                 * 1 and the power-of-two padded size below that; depth uses
+                 * the padded size from level 1 on.
+                 */
+                uint32_t level_width, level_height, level_depth;
+                if (i < 2) {
+                        level_width = u_minify(width, i);
+                        level_height = u_minify(height, i);
+                } else {
+                        level_width = u_minify(pot_width, i);
+                        level_height = u_minify(pot_height, i);
+                }
+                if (i < 1)
+                        level_depth = u_minify(depth, i);
+                else
+                        level_depth = u_minify(pot_depth, i);
+
+                /* MSAA levels are laid out at twice the width and height. */
+                if (msaa) {
+                        level_width *= 2;
+                        level_height *= 2;
+                }
+
+                /* Convert from pixels to format blocks. */
+                level_width = DIV_ROUND_UP(level_width, block_width);
+                level_height = DIV_ROUND_UP(level_height, block_height);
+
+                if (!rsc->tiled) {
+                        slice->tiling = VC5_TILING_RASTER;
+                        /* 1D raster rows are padded to a 64-byte multiple. */
+                        if (prsc->target == PIPE_TEXTURE_1D)
+                                level_width = align(level_width, 64 / rsc->cpp);
+                } else {
+                        /* The small-level layouts (LT, UBLINEAR) are never
+                         * used for level 0 of a uif_top resource.
+                         */
+                        if ((i != 0 || !uif_top) &&
+                            (level_width <= utile_w ||
+                             level_height <= utile_h)) {
+                                slice->tiling = VC5_TILING_LINEARTILE;
+                                level_width = align(level_width, utile_w);
+                                level_height = align(level_height, utile_h);
+                        } else if ((i != 0 || !uif_top) &&
+                                   level_width <= uif_block_w) {
+                                slice->tiling = VC5_TILING_UBLINEAR_1_COLUMN;
+                                level_width = align(level_width, uif_block_w);
+                                level_height = align(level_height, uif_block_h);
+                        } else if ((i != 0 || !uif_top) &&
+                                   level_width <= 2 * uif_block_w) {
+                                slice->tiling = VC5_TILING_UBLINEAR_2_COLUMN;
+                                level_width = align(level_width, 2 * uif_block_w);
+                                level_height = align(level_height, uif_block_h);
+                        } else {
+                                /* We align the width to a 4-block column of
+                                 * UIF blocks, but we only align height to UIF
+                                 * blocks.
+                                 */
+                                level_width = align(level_width,
+                                                    4 * uif_block_w);
+                                level_height = align(level_height,
+                                                     uif_block_h);
+
+                                slice->ub_pad = vc5_get_ub_pad(rsc,
+                                                               level_height);
+                                level_height += slice->ub_pad * uif_block_h;
+
+                                /* If the padding set us to be aligned to
+                                 * the page cache size, then the HW will use
+                                 * the XOR bit on odd columns to get us
+                                 * perfectly misaligned.
+                                 */
+                                if ((level_height / uif_block_h) %
+                                    (VC5_PAGE_CACHE_SIZE /
+                                     VC5_UIFBLOCK_ROW_SIZE) == 0) {
+                                        slice->tiling = VC5_TILING_UIF_XOR;
+                                } else {
+                                        slice->tiling = VC5_TILING_UIF_NO_XOR;
+                                }
+                        }
+                }
+
+                slice->offset = offset;
+                slice->stride = level_width * rsc->cpp;
+                slice->padded_height = level_height;
+                slice->size = level_height * slice->stride;
+
+                /* One pane of slice->size bytes per depth layer. */
+                uint32_t slice_total_size = slice->size * level_depth;
+
+                /* The HW aligns level 1's base to a page if any of level 1 or
+                 * below could be UIF XOR.  The lower levels then inherit the
+                 * alignment for as long as necessary, thanks to being power of
+                 * two aligned.
+                 */
+                if (i == 1 &&
+                    level_width > 4 * uif_block_w &&
+                    level_height > PAGE_CACHE_MINUS_1_5_UB_ROWS * uif_block_h) {
+                        slice_total_size = align(slice_total_size,
+                                                 VC5_UIFCFG_PAGE_SIZE);
+                }
+
+                offset += slice_total_size;
+
+        }
+        rsc->size = offset;
+
+        /* UIF/UBLINEAR levels need to be aligned to UIF-blocks, and LT only
+         * needs to be aligned to utile boundaries.  Since tiles are laid out
+         * from small to big in memory, we need to align the later UIF slices
+         * to UIF blocks, if they were preceded by non-UIF-block-aligned LT
+         * slices.
+         *
+         * We additionally align to 4k, which improves UIF XOR performance.
+         */
+        uint32_t page_align_offset = (align(rsc->slices[0].offset, 4096) -
+                                      rsc->slices[0].offset);
+        if (page_align_offset) {
+                /* Shift every level by the same amount so that level 0 lands
+                 * on the 4k boundary.
+                 */
+                rsc->size += page_align_offset;
+                for (int i = 0; i <= prsc->last_level; i++)
+                        rsc->slices[i].offset += page_align_offset;
+        }
+
+        /* Arrays and cube textures have a stride which is the distance from
+         * one full mipmap tree to the next (64b aligned).  For 3D textures,
+         * we need to program the stride between slices of miplevel 0.
+         */
+        if (prsc->target != PIPE_TEXTURE_3D) {
+                rsc->cube_map_stride = align(rsc->slices[0].offset +
+                                             rsc->slices[0].size, 64);
+                rsc->size += rsc->cube_map_stride * (prsc->array_size - 1);
+        } else {
+                rsc->cube_map_stride = rsc->slices[0].size;
+        }
+}
+
+/**
+ * Returns the offset of the given layer of the given miplevel: the slice's
+ * offset plus the layer index times the per-layer stride.
+ */
+uint32_t
+vc5_layer_offset(struct pipe_resource *prsc, uint32_t level, uint32_t layer)
+{
+        struct vc5_resource *rsc = vc5_resource(prsc);
+        const struct vc5_resource_slice *lvl = &rsc->slices[level];
+        /* 3D textures step by one pane of the level; everything else steps
+         * by a whole mipmap tree.
+         */
+        const uint32_t layer_stride = (prsc->target == PIPE_TEXTURE_3D) ?
+                                      lvl->size : rsc->cube_map_stride;
+
+        return lvl->offset + layer * layer_stride;
+}
+
+/**
+ * Allocates a zeroed vc5_resource, copies the template into its base with a
+ * reference count of 1, and computes rsc->cpp.
+ *
+ * Returns NULL if the allocation fails.
+ */
+static struct vc5_resource *
+vc5_resource_setup(struct pipe_screen *pscreen,
+                   const struct pipe_resource *tmpl)
+{
+        struct vc5_screen *screen = vc5_screen(pscreen);
+        struct vc5_resource *rsc = CALLOC_STRUCT(vc5_resource);
+        if (!rsc)
+                return NULL;
+        struct pipe_resource *prsc = &rsc->base;
+
+        *prsc = *tmpl;
+
+        pipe_reference_init(&prsc->reference, 1);
+        prsc->screen = pscreen;
+
+        const bool multisampled = prsc->nr_samples > 1;
+        const bool pre_v40 = screen->devinfo.ver < 40;
+
+        if (multisampled && pre_v40 &&
+            !util_format_is_depth_or_stencil(prsc->format)) {
+                /* MSAA color on pre-4.0 HW: cpp follows the internal bpp of
+                 * the render target format.
+                 */
+                assert(vc5_rt_format_supported(&screen->devinfo, prsc->format));
+                uint32_t output_image_format =
+                        vc5_get_rt_format(&screen->devinfo, prsc->format);
+                uint32_t internal_type;
+                uint32_t internal_bpp;
+                vc5_get_internal_type_bpp_for_output_format(&screen->devinfo,
+                                                            output_image_format,
+                                                            &internal_type,
+                                                            &internal_bpp);
+                if (internal_bpp == V3D_INTERNAL_BPP_32)
+                        rsc->cpp = 4;
+                else if (internal_bpp == V3D_INTERNAL_BPP_64)
+                        rsc->cpp = 8;
+                else if (internal_bpp == V3D_INTERNAL_BPP_128)
+                        rsc->cpp = 16;
+        } else {
+                /* Everything else uses the format's block size, scaled by
+                 * the sample count for MSAA on pre-4.0 HW.
+                 */
+                rsc->cpp = util_format_get_blocksize(prsc->format);
+                if (pre_v40 && multisampled)
+                        rsc->cpp *= prsc->nr_samples;
+        }
+
+        assert(rsc->cpp);
+
+        return rsc;
+}
+
+/**
+ * Returns true if needle appears among the first count entries of haystack.
+ * A count of zero or less never matches.
+ */
+static bool
+find_modifier(uint64_t needle, const uint64_t *haystack, int count)
+{
+        bool found = false;
+
+        for (int idx = 0; idx < count && !found; idx++)
+                found = (needle == haystack[idx]);
+
+        return found;
+}
+
+/**
+ * Creates a resource, picking between a tiled and a raster layout based on
+ * the template and the list of acceptable modifiers.
+ *
+ * A single DRM_FORMAT_MOD_INVALID entry means "no preference": the driver
+ * then chooses on its own.  Returns NULL if the resource struct or its BO
+ * can't be allocated, or if none of the requested modifiers is usable.
+ */
+static struct pipe_resource *
+vc5_resource_create_with_modifiers(struct pipe_screen *pscreen,
+                                   const struct pipe_resource *tmpl,
+                                   const uint64_t *modifiers,
+                                   int count)
+{
+        bool linear_ok = find_modifier(DRM_FORMAT_MOD_LINEAR, modifiers, count);
+        struct vc5_resource *rsc = vc5_resource_setup(pscreen, tmpl);
+
+        /* vc5_resource_setup() returns NULL on allocation failure. */
+        if (!rsc)
+                return NULL;
+
+        struct pipe_resource *prsc = &rsc->base;
+        /* Use a tiled layout if we can, for better 3D performance. */
+        bool should_tile = true;
+
+        /* VBOs/PBOs are untiled (and 1 height). */
+        if (tmpl->target == PIPE_BUFFER)
+                should_tile = false;
+
+        /* Cursors are always linear, and the user can request linear as well.
+         */
+        if (tmpl->bind & (PIPE_BIND_LINEAR | PIPE_BIND_CURSOR))
+                should_tile = false;
+
+        /* 1D and 1D_ARRAY textures are always raster-order. */
+        if (tmpl->target == PIPE_TEXTURE_1D ||
+            tmpl->target == PIPE_TEXTURE_1D_ARRAY)
+                should_tile = false;
+
+        /* Scanout BOs for simulator need to be linear for interaction with
+         * i965.
+         */
+        if (using_vc5_simulator &&
+            tmpl->bind & (PIPE_BIND_SHARED | PIPE_BIND_SCANOUT))
+                should_tile = false;
+
+        /* No user-specified modifier; determine our own. */
+        if (count == 1 && modifiers[0] == DRM_FORMAT_MOD_INVALID) {
+                linear_ok = true;
+                rsc->tiled = should_tile;
+        } else if (should_tile &&
+                   find_modifier(DRM_FORMAT_MOD_BROADCOM_VC4_T_TILED,
+                                 modifiers, count)) {
+                rsc->tiled = true;
+        } else if (linear_ok) {
+                rsc->tiled = false;
+        } else {
+                fprintf(stderr, "Unsupported modifier requested\n");
+                /* Go through the common cleanup so rsc isn't leaked. */
+                goto fail;
+        }
+
+        rsc->internal_format = prsc->format;
+
+        vc5_setup_slices(rsc);
+        if (!vc5_resource_bo_alloc(rsc))
+                goto fail;
+
+        return prsc;
+fail:
+        vc5_resource_destroy(pscreen, prsc);
+        return NULL;
+}
+
+/**
+ * Creates a resource with no modifier preference, by handing
+ * vc5_resource_create_with_modifiers() a one-entry list holding
+ * DRM_FORMAT_MOD_INVALID.
+ */
+struct pipe_resource *
+vc5_resource_create(struct pipe_screen *pscreen,
+                    const struct pipe_resource *tmpl)
+{
+        const uint64_t no_modifiers[] = { DRM_FORMAT_MOD_INVALID };
+
+        return vc5_resource_create_with_modifiers(pscreen, tmpl,
+                                                  no_modifiers, 1);
+}
+
+/**
+ * Imports a BO described by a winsys handle as a resource.
+ *
+ * Only linear (or unspecified-modifier) buffers at offset 0 are accepted,
+ * through either a shared (flink) name or a dmabuf fd.  The stride in the
+ * handle must match the stride vc5_setup_slices() computes for level 0.
+ * Returns NULL on any failure.
+ */
+static struct pipe_resource *
+vc5_resource_from_handle(struct pipe_screen *pscreen,
+                         const struct pipe_resource *tmpl,
+                         struct winsys_handle *whandle,
+                         unsigned usage)
+{
+        struct vc5_screen *screen = vc5_screen(pscreen);
+        struct vc5_resource *rsc = vc5_resource_setup(pscreen, tmpl);
+
+        if (!rsc)
+                return NULL;
+
+        /* Only derive member pointers once rsc is known to be valid. */
+        struct pipe_resource *prsc = &rsc->base;
+        struct vc5_resource_slice *slice = &rsc->slices[0];
+
+        switch (whandle->modifier) {
+        case DRM_FORMAT_MOD_LINEAR:
+        case DRM_FORMAT_MOD_INVALID:
+                rsc->tiled = false;
+                break;
+        /* XXX: UIF */
+        default:
+                fprintf(stderr,
+                        "Attempt to import unsupported modifier 0x%llx\n",
+                        (long long)whandle->modifier);
+                goto fail;
+        }
+
+        if (whandle->offset != 0) {
+                fprintf(stderr,
+                        "Attempt to import unsupported winsys offset %u\n",
+                        whandle->offset);
+                goto fail;
+        }
+
+        switch (whandle->type) {
+        case DRM_API_HANDLE_TYPE_SHARED:
+                rsc->bo = vc5_bo_open_name(screen,
+                                           whandle->handle, whandle->stride);
+                break;
+        case DRM_API_HANDLE_TYPE_FD:
+                rsc->bo = vc5_bo_open_dmabuf(screen,
+                                             whandle->handle, whandle->stride);
+                break;
+        default:
+                fprintf(stderr,
+                        "Attempt to import unsupported handle type %d\n",
+                        whandle->type);
+                goto fail;
+        }
+
+        if (!rsc->bo)
+                goto fail;
+
+        rsc->internal_format = prsc->format;
+
+        vc5_setup_slices(rsc);
+        vc5_debug_resource_layout(rsc, "import");
+
+        if (whandle->stride != slice->stride) {
+                /* Only complain once per process about stride mismatches. */
+                static bool warned = false;
+                if (!warned) {
+                        warned = true;
+                        fprintf(stderr,
+                                "Attempting to import %dx%d %s with "
+                                "unsupported stride %d instead of %d\n",
+                                prsc->width0, prsc->height0,
+                                util_format_short_name(prsc->format),
+                                whandle->stride,
+                                slice->stride);
+                }
+                goto fail;
+        }
+
+        return prsc;
+
+fail:
+        vc5_resource_destroy(pscreen, prsc);
+        return NULL;
+}
+
+/**
+ * Creates a surface viewing a single layer of one miplevel of ptex.
+ *
+ * Besides the generic pipe_surface fields, this records the layer offset,
+ * the slice's tiling mode, the render target format and internal type/bpp,
+ * and (for UIF tilings) the padded height in UIF blocks.  If the resource
+ * has a separate stencil resource, a matching surface is created for it.
+ * Returns NULL if the surface can't be allocated.
+ */
+static struct pipe_surface *
+vc5_create_surface(struct pipe_context *pctx,
+                   struct pipe_resource *ptex,
+                   const struct pipe_surface *surf_tmpl)
+{
+        struct vc5_surface *surface = CALLOC_STRUCT(vc5_surface);
+        if (!surface)
+                return NULL;
+
+        struct vc5_screen *screen = vc5_context(pctx)->screen;
+        struct vc5_resource *rsc = vc5_resource(ptex);
+        struct pipe_surface *psurf = &surface->base;
+        const unsigned level = surf_tmpl->u.tex.level;
+        struct vc5_resource_slice *slice = &rsc->slices[level];
+
+        /* Only single-layer surfaces are handled. */
+        assert(surf_tmpl->u.tex.first_layer == surf_tmpl->u.tex.last_layer);
+
+        pipe_reference_init(&psurf->reference, 1);
+        pipe_resource_reference(&psurf->texture, ptex);
+
+        psurf->context = pctx;
+        psurf->format = surf_tmpl->format;
+        psurf->width = u_minify(ptex->width0, level);
+        psurf->height = u_minify(ptex->height0, level);
+        psurf->u.tex.level = level;
+        psurf->u.tex.first_layer = surf_tmpl->u.tex.first_layer;
+        psurf->u.tex.last_layer = surf_tmpl->u.tex.last_layer;
+
+        surface->offset = vc5_layer_offset(ptex, level,
+                                           psurf->u.tex.first_layer);
+        surface->tiling = slice->tiling;
+
+        surface->format = vc5_get_rt_format(&screen->devinfo, psurf->format);
+
+        if (!util_format_is_depth_or_stencil(psurf->format)) {
+                /* Color: ask the format tables for the tile buffer layout. */
+                uint32_t bpp, type;
+                vc5_get_internal_type_bpp_for_output_format(&screen->devinfo,
+                                                            surface->format,
+                                                            &type, &bpp);
+                surface->internal_type = type;
+                surface->internal_bpp = bpp;
+        } else if (psurf->format == PIPE_FORMAT_Z16_UNORM) {
+                surface->internal_type = V3D_INTERNAL_TYPE_DEPTH_16;
+        } else if (psurf->format == PIPE_FORMAT_Z32_FLOAT ||
+                   psurf->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) {
+                surface->internal_type = V3D_INTERNAL_TYPE_DEPTH_32F;
+        } else {
+                /* Every other depth/stencil format uses 24-bit depth. */
+                surface->internal_type = V3D_INTERNAL_TYPE_DEPTH_24;
+        }
+
+        switch (surface->tiling) {
+        case VC5_TILING_UIF_NO_XOR:
+        case VC5_TILING_UIF_XOR:
+                /* A UIF block is two utiles tall. */
+                surface->padded_height_of_output_image_in_uif_blocks =
+                        (slice->padded_height /
+                         (2 * vc5_utile_height(rsc->cpp)));
+                break;
+        default:
+                break;
+        }
+
+        if (rsc->separate_stencil) {
+                surface->separate_stencil =
+                        vc5_create_surface(pctx, &rsc->separate_stencil->base,
+                                           surf_tmpl);
+        }
+
+        return psurf;
+}
+
+/**
+ * Destroys a surface: drops the reference on its separate stencil surface
+ * (if any) and on the texture, then frees the surface itself.
+ */
+static void
+vc5_surface_destroy(struct pipe_context *pctx, struct pipe_surface *psurf)
+{
+        struct vc5_surface *surface = vc5_surface(psurf);
+
+        if (surface->separate_stencil != NULL)
+                pipe_surface_reference(&surface->separate_stencil, NULL);
+
+        pipe_resource_reference(&psurf->texture, NULL);
+
+        FREE(psurf);
+}
+
+/**
+ * pipe_context::flush_resource hook.  Intentionally a no-op.
+ */
+static void
+vc5_flush_resource(struct pipe_context *pctx, struct pipe_resource *resource)
+{
+        /* All calls to flush_resource are followed by a flush of the context,
+         * so there's nothing to do.
+         */
+}
+
+/**
+ * u_transfer_vtbl hook: returns the internal_format recorded on the
+ * resource.
+ */
+static enum pipe_format
+vc5_resource_get_internal_format(struct pipe_resource *prsc)
+{
+        struct vc5_resource *rsc = vc5_resource(prsc);
+
+        return rsc->internal_format;
+}
+
+/**
+ * u_transfer_vtbl hook: records stencil as the separate stencil resource
+ * of prsc.
+ */
+static void
+vc5_resource_set_stencil(struct pipe_resource *prsc,
+                         struct pipe_resource *stencil)
+{
+        struct vc5_resource *rsc = vc5_resource(prsc);
+
+        rsc->separate_stencil = vc5_resource(stencil);
+}
+
+/**
+ * u_transfer_vtbl hook: returns the separate stencil resource of prsc, or
+ * NULL if it has none.
+ */
+static struct pipe_resource *
+vc5_resource_get_stencil(struct pipe_resource *prsc)
+{
+        struct vc5_resource *stencil = vc5_resource(prsc)->separate_stencil;
+
+        /* Don't form a member address through a NULL pointer; report the
+         * "no separate stencil" case explicitly.
+         */
+        if (!stencil)
+                return NULL;
+
+        return &stencil->base;
+}
+
+/* Driver callbacks handed to u_transfer_helper_create() in
+ * vc5_resource_screen_init().
+ */
+static const struct u_transfer_vtbl transfer_vtbl = {
+        .resource_create          = vc5_resource_create,
+        .resource_destroy         = vc5_resource_destroy,
+        .transfer_map             = vc5_resource_transfer_map,
+        .transfer_unmap           = vc5_resource_transfer_unmap,
+        .transfer_flush_region    = u_default_transfer_flush_region,
+        .get_internal_format      = vc5_resource_get_internal_format,
+        .set_stencil              = vc5_resource_set_stencil,
+        .get_stencil              = vc5_resource_get_stencil,
+};
+
+/**
+ * Installs the resource-related hooks on the screen and creates the
+ * transfer helper that wraps this driver's transfer_vtbl.
+ */
+void
+vc5_resource_screen_init(struct pipe_screen *pscreen)
+{
+        /* Creation and destruction go through the transfer helper. */
+        pscreen->resource_create = u_transfer_helper_resource_create;
+        pscreen->resource_destroy = u_transfer_helper_resource_destroy;
+
+        /* Modifier-aware creation and handle import/export are ours. */
+        pscreen->resource_create_with_modifiers =
+                vc5_resource_create_with_modifiers;
+        pscreen->resource_from_handle = vc5_resource_from_handle;
+        pscreen->resource_get_handle = vc5_resource_get_handle;
+
+        pscreen->transfer_helper =
+                u_transfer_helper_create(&transfer_vtbl, true, true, true);
+}
+
+/**
+ * Installs the transfer, surface, copy/blit and flush hooks on the context.
+ */
+void
+vc5_resource_context_init(struct pipe_context *pctx)
+{
+        /* Transfers are routed through the transfer helper. */
+        pctx->transfer_map = u_transfer_helper_transfer_map;
+        pctx->transfer_unmap = u_transfer_helper_transfer_unmap;
+        pctx->transfer_flush_region = u_transfer_helper_transfer_flush_region;
+
+        /* Subdata uploads use the generic implementations. */
+        pctx->buffer_subdata = u_default_buffer_subdata;
+        pctx->texture_subdata = u_default_texture_subdata;
+
+        /* Surfaces. */
+        pctx->create_surface = vc5_create_surface;
+        pctx->surface_destroy = vc5_surface_destroy;
+
+        /* Copies, blits and flushes. */
+        pctx->resource_copy_region = util_resource_copy_region;
+        pctx->blit = vc5_blit;
+        pctx->flush_resource = vc5_flush_resource;
+}
diff --git a/src/gallium/drivers/v3d/v3d_resource.h b/src/gallium/drivers/v3d/v3d_resource.h
new file mode 100644
index 00000000000..dc68f803e90
--- /dev/null
+++ b/src/gallium/drivers/v3d/v3d_resource.h
@@ -0,0 +1,175 @@
+/*
+ * Copyright © 2014-2017 Broadcom
+ * Copyright (C) 2012 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef VC5_RESOURCE_H
+#define VC5_RESOURCE_H
+
+#include "v3d_screen.h"
+#include "util/u_transfer.h"
+
+/* A UIFblock is a 256-byte region of memory that's 256-byte aligned.  These
+ * will be grouped in 4x4 blocks (left-to-right, then top-to-bottom) in a 4KB
+ * page.  Those pages are then arranged left-to-right, top-to-bottom, to cover
+ * an image.
+ *
+ * The inside of a UIFblock, for packed pixels, will be split into 4 64-byte
+ * utiles.  Utiles may be 8x8 (8bpp), 8x4 (16bpp) or 4x4 (32bpp).
+ */
+
+/**
+ * Tiling mode enum used for v3d_resource.c, which maps directly to the Memory
+ * Format field of render target and Z/Stencil config.
+ */
+enum vc5_tiling_mode {
+        /* Untiled resources.  Not valid as texture inputs. */
+        VC5_TILING_RASTER,
+
+        /* Single line of u-tiles. */
+        VC5_TILING_LINEARTILE,
+
+        /* Departure from standard 4-UIF block column format: used for
+         * levels that are at most one UIF block wide.
+         */
+        VC5_TILING_UBLINEAR_1_COLUMN,
+
+        /* Departure from standard 4-UIF block column format: used for
+         * levels that are at most two UIF blocks wide.
+         */
+        VC5_TILING_UBLINEAR_2_COLUMN,
+
+        /* Normal tiling format: grouped in 4x4 UIFblocks, each of which is
+         * split 2x2 into utiles.  Used when the padded height in UIF blocks
+         * is not a multiple of the page cache's UIF-block rows.
+         */
+        VC5_TILING_UIF_NO_XOR,
+
+        /* Normal tiling format: grouped in 4x4 UIFblocks, each of which is
+         * split 2x2 into utiles.  Used when the padded height in UIF blocks
+         * is a multiple of the page cache's UIF-block rows, in which case
+         * the HW uses the XOR bit on odd columns.
+         */
+        VC5_TILING_UIF_XOR,
+};
+
+/* Driver wrapper around pipe_transfer; see vc5_transfer() for the cast. */
+struct vc5_transfer {
+        /* Must stay first: vc5_transfer() casts a pipe_transfer pointer. */
+        struct pipe_transfer base;
+        /* NOTE(review): presumably the CPU pointer for this transfer --
+         * confirm against the transfer_map implementation.
+         */
+        void *map;
+};
+
+/* Layout of a single miplevel, as computed by vc5_setup_slices(). */
+struct vc5_resource_slice {
+        /* Offset of the level within the resource's layout. */
+        uint32_t offset;
+        /* Aligned level width multiplied by the resource's cpp. */
+        uint32_t stride;
+        /* Level height after tiling alignment and any UIF padding. */
+        uint32_t padded_height;
+        /* Size of a single pane of the slice.  For 3D textures, there will be
+         * a number of panes equal to the minified, power-of-two-aligned
+         * depth.
+         */
+        uint32_t size;
+        /* Rows of UIF blocks of padding added to a UIF-tiled level. */
+        uint8_t ub_pad;
+        enum vc5_tiling_mode tiling;
+};
+
+/* Driver wrapper around pipe_surface, filled in by vc5_create_surface(). */
+struct vc5_surface {
+        /* Must stay first: vc5_surface() casts a pipe_surface pointer. */
+        struct pipe_surface base;
+        /* Result of vc5_layer_offset() for the surface's level and layer. */
+        uint32_t offset;
+        /* Tiling mode of the miplevel slice this surface views. */
+        enum vc5_tiling_mode tiling;
+        /**
+         * Output image format for TILE_RENDERING_MODE_CONFIGURATION
+         */
+        uint8_t format;
+
+        /**
+         * Internal format of the tile buffer for
+         * TILE_RENDERING_MODE_CONFIGURATION.
+         */
+        uint8_t internal_type;
+
+        /**
+         * internal bpp value (0=32bpp, 2=128bpp) for color buffers in
+         * TILE_RENDERING_MODE_CONFIGURATION.
+         */
+        uint8_t internal_bpp;
+
+        /* Slice padded height divided by the UIF block height.  Only set
+         * for the UIF tiling modes.
+         */
+        uint32_t padded_height_of_output_image_in_uif_blocks;
+
+        /* If the resource being referenced is separate stencil, then this is
+         * the surface to use when reading/writing stencil.
+         */
+        struct pipe_surface *separate_stencil;
+};
+
+/* Driver wrapper around pipe_resource. */
+struct vc5_resource {
+        /* Must stay first: vc5_resource() casts a pipe_resource pointer. */
+        struct pipe_resource base;
+        /* Buffer object backing the resource. */
+        struct vc5_bo *bo;
+        /* Per-miplevel layout, indexed by level. */
+        struct vc5_resource_slice slices[VC5_MAX_MIP_LEVELS];
+        /* Per-layer stride: the distance between mipmap trees for arrays
+         * and cubes, or the size of one level 0 pane for 3D textures.
+         */
+        uint32_t cube_map_stride;
+        /* Total size of the layout computed by vc5_setup_slices(). */
+        uint32_t size;
+        /* Size of one pixel/block as laid out in memory; set at resource
+         * setup.  NOTE(review): presumably in bytes -- confirm.
+         */
+        int cpp;
+        /* false selects VC5_TILING_RASTER for every level. */
+        bool tiled;
+
+        /**
+         * Number of times the resource has been written to.
+         *
+         * This is used to track whether we need to load the surface on first
+         * rendering.
+         */
+        uint64_t writes;
+
+        /**
+         * Bitmask of PIPE_CLEAR_COLOR0, PIPE_CLEAR_DEPTH, PIPE_CLEAR_STENCIL
+         * for which parts of the resource are defined.
+         *
+         * Used for avoiding fallback to quad clears for clearing just depth,
+         * when the stencil contents have never been initialized.  Note that
+         * we're lazy and fields not present in the buffer (DEPTH in a color
+         * buffer) may get marked.
+         */
+        uint32_t initialized_buffers;
+
+        /* Format reported through the transfer helper's get_internal_format
+         * hook; initialized from the resource's format at creation/import.
+         */
+        enum pipe_format internal_format;
+
+        /* Resource storing the S8 part of a Z32F_S8 resource, or NULL. */
+        struct vc5_resource *separate_stencil;
+};
+
+/* Downcasts a pipe_resource to the vc5_resource that embeds it. */
+static inline struct vc5_resource *
+vc5_resource(struct pipe_resource *prsc)
+{
+        struct vc5_resource *rsc = (struct vc5_resource *)prsc;
+
+        return rsc;
+}
+
+/* Downcasts a pipe_surface to the vc5_surface that embeds it. */
+static inline struct vc5_surface *
+vc5_surface(struct pipe_surface *psurf)
+{
+        struct vc5_surface *surface = (struct vc5_surface *)psurf;
+
+        return surface;
+}
+
+/* Downcasts a pipe_transfer to the vc5_transfer that embeds it. */
+static inline struct vc5_transfer *
+vc5_transfer(struct pipe_transfer *ptrans)
+{
+        struct vc5_transfer *trans = (struct vc5_transfer *)ptrans;
+
+        return trans;
+}
+
+/* Installs the resource hooks on the screen. */
+void vc5_resource_screen_init(struct pipe_screen *pscreen);
+/* Installs the transfer/surface/blit hooks on the context. */
+void vc5_resource_context_init(struct pipe_context *pctx);
+/* Creates a resource with no modifier preference. */
+struct pipe_resource *vc5_resource_create(struct pipe_screen *pscreen,
+                                          const struct pipe_resource *tmpl);
+/* Returns the offset of a given layer of a given miplevel. */
+uint32_t vc5_layer_offset(struct pipe_resource *prsc, uint32_t level,
+                          uint32_t layer);
+
+
+#endif /* VC5_RESOURCE_H */
diff --git a/src/gallium/drivers/v3d/v3d_screen.c b/src/gallium/drivers/v3d/v3d_screen.c
new file mode 100644
index 00000000000..95e6a6907f4
--- /dev/null
+++ b/src/gallium/drivers/v3d/v3d_screen.c
@@ -0,0 +1,648 @@
+/*
+ * Copyright © 2014-2017 Broadcom
+ * Copyright (C) 2012 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "os/os_misc.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_screen.h"
+#include "pipe/p_state.h"
+
+#include "util/u_debug.h"
+#include "util/u_memory.h"
+#include "util/u_format.h"
+#include "util/u_hash_table.h"
+#include "util/ralloc.h"
+
+#include <xf86drm.h>
+#include "v3d_screen.h"
+#include "v3d_context.h"
+#include "v3d_resource.h"
+#include "compiler/v3d_compiler.h"
+
+static const char *
+vc5_screen_get_name(struct pipe_screen *pscreen)
+{
+        struct vc5_screen *vc5 = vc5_screen(pscreen);
+
+        /* The name is formatted on first use and then cached on the screen,
+         * which is also the ralloc context handed to ralloc_asprintf().
+         */
+        if (!vc5->name)
+                vc5->name = ralloc_asprintf(vc5, "VC5 V3D %d.%d",
+                                            vc5->devinfo.ver / 10,
+                                            vc5->devinfo.ver % 10);
+        return vc5->name;
+}
+
+static const char *
+vc5_screen_get_vendor(struct pipe_screen *pscreen)
+{
+        return "Broadcom"; /* constant; pscreen is not consulted */
+}
+
+static void
+vc5_screen_destroy(struct pipe_screen *pscreen)
+{
+        struct vc5_screen *screen = vc5_screen(pscreen);
+        /* Release what the screen owns first, then the screen itself. */
+        util_hash_table_destroy(screen->bo_handles);
+        vc5_bufmgr_destroy(pscreen);
+        slab_destroy_parent(&screen->transfer_pool);
+        /* Simulator teardown is skipped unless using_vc5_simulator is set. */
+        if (using_vc5_simulator)
+                vc5_simulator_destroy(screen);
+
+        v3d_compiler_free(screen->compiler);
+        /* Close the fd kept in the screen, then free the screen allocation. */
+        close(screen->fd);
+        ralloc_free(pscreen);
+}
+
+static int
+vc5_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
+{
+        struct vc5_screen *screen = vc5_screen(pscreen);
+        /* Caps missing from this switch are logged and reported as 0. */
+        switch (param) {
+                /* Supported features (boolean caps). */
+        case PIPE_CAP_VERTEX_COLOR_CLAMPED:
+        case PIPE_CAP_VERTEX_COLOR_UNCLAMPED:
+        case PIPE_CAP_FRAGMENT_COLOR_CLAMPED:
+        case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT:
+        case PIPE_CAP_NPOT_TEXTURES:
+        case PIPE_CAP_SHAREABLE_SHADERS:
+        case PIPE_CAP_BLEND_EQUATION_SEPARATE:
+        case PIPE_CAP_TEXTURE_MULTISAMPLE:
+        case PIPE_CAP_TEXTURE_SWIZZLE:
+        case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR:
+        case PIPE_CAP_START_INSTANCE:
+        case PIPE_CAP_TGSI_INSTANCEID:
+        case PIPE_CAP_SM3:
+        case PIPE_CAP_TEXTURE_QUERY_LOD:
+        case PIPE_CAP_PRIMITIVE_RESTART:
+        case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY:
+        case PIPE_CAP_OCCLUSION_QUERY:
+        case PIPE_CAP_POINT_SPRITE:
+        case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
+        case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION:
+        case PIPE_CAP_COMPUTE:
+        case PIPE_CAP_DRAW_INDIRECT:
+        case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION:
+        case PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET:
+                return 1;
+
+        case PIPE_CAP_INDEP_BLEND_ENABLE:
+                return screen->devinfo.ver >= 40; /* ver is major * 10 + minor */
+
+        case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT:
+                return 256;
+
+        case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
+                return 4;
+
+        case PIPE_CAP_GLSL_FEATURE_LEVEL:
+                return 400;
+
+        case PIPE_CAP_MAX_VIEWPORTS:
+                return 1;
+
+        case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
+                return 1;
+        case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
+                return 0;
+        case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
+                if (screen->devinfo.ver >= 40)
+                        return 0;
+                else
+                        return 1;
+        case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
+                if (screen->devinfo.ver >= 40)
+                        return 1;
+                else
+                        return 0;
+
+        case PIPE_CAP_MIXED_FRAMEBUFFER_SIZES:
+        case PIPE_CAP_MIXED_COLORBUFFER_FORMATS:
+        case PIPE_CAP_MIXED_COLOR_DEPTH_BITS:
+                return 1;
+
+
+                /* Stream output. */
+        case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
+                return 4;
+        case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
+        case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
+                return 64;
+
+        case PIPE_CAP_MIN_TEXEL_OFFSET:
+        case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET:
+                return -8;
+        case PIPE_CAP_MAX_TEXEL_OFFSET:
+        case PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET:
+                return 7;
+
+                /* Unsupported features. */
+        case PIPE_CAP_ANISOTROPIC_FILTER:
+        case PIPE_CAP_TEXTURE_BUFFER_OBJECTS:
+        case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY:
+        case PIPE_CAP_CUBE_MAP_ARRAY:
+        case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
+        case PIPE_CAP_SEAMLESS_CUBE_MAP:
+        case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY:
+        case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY:
+        case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY:
+        case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
+        case PIPE_CAP_SHADER_STENCIL_EXPORT:
+        case PIPE_CAP_TGSI_TEXCOORD:
+        case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
+        case PIPE_CAP_CONDITIONAL_RENDER:
+        case PIPE_CAP_TEXTURE_BARRIER:
+        case PIPE_CAP_INDEP_BLEND_FUNC:
+        case PIPE_CAP_DEPTH_CLIP_DISABLE:
+        case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
+        case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS:
+        case PIPE_CAP_USER_VERTEX_BUFFERS:
+        case PIPE_CAP_QUERY_PIPELINE_STATISTICS:
+        case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK:
+        case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT:
+        case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT:
+        case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS:
+        case PIPE_CAP_TEXTURE_GATHER_SM5:
+        case PIPE_CAP_FAKE_SW_MSAA:
+        case PIPE_CAP_SAMPLE_SHADING:
+        case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
+        case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
+        case PIPE_CAP_MAX_VERTEX_STREAMS:
+        case PIPE_CAP_MULTI_DRAW_INDIRECT:
+        case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
+        case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
+        case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
+        case PIPE_CAP_SAMPLER_VIEW_TARGET:
+        case PIPE_CAP_CLIP_HALFZ:
+        case PIPE_CAP_VERTEXID_NOBASE:
+        case PIPE_CAP_POLYGON_OFFSET_CLAMP:
+        case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
+        case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
+        case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+        case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+        case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+        case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
+        case PIPE_CAP_DEPTH_BOUNDS_TEST:
+        case PIPE_CAP_TGSI_TXQS:
+        case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+        case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+        case PIPE_CAP_CLEAR_TEXTURE:
+        case PIPE_CAP_DRAW_PARAMETERS:
+        case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+        case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+        case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+        case PIPE_CAP_INVALIDATE_BUFFER:
+        case PIPE_CAP_GENERATE_MIPMAP:
+        case PIPE_CAP_STRING_MARKER:
+        case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
+        case PIPE_CAP_QUERY_BUFFER_OBJECT:
+        case PIPE_CAP_QUERY_MEMORY_INFO:
+        case PIPE_CAP_PCI_GROUP:
+        case PIPE_CAP_PCI_BUS:
+        case PIPE_CAP_PCI_DEVICE:
+        case PIPE_CAP_PCI_FUNCTION:
+        case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
+        case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR:
+        case PIPE_CAP_CULL_DISTANCE:
+        case PIPE_CAP_PRIMITIVE_RESTART_FOR_PATCHES:
+        case PIPE_CAP_TGSI_VOTE:
+        case PIPE_CAP_MAX_WINDOW_RECTANGLES:
+        case PIPE_CAP_POLYGON_OFFSET_UNITS_UNSCALED:
+        case PIPE_CAP_VIEWPORT_SUBPIXEL_BITS:
+        case PIPE_CAP_TGSI_ARRAY_COMPONENTS:
+        case PIPE_CAP_TGSI_FS_FBFETCH:
+        case PIPE_CAP_INT64:
+        case PIPE_CAP_INT64_DIVMOD:
+        case PIPE_CAP_DOUBLES:
+        case PIPE_CAP_BINDLESS_TEXTURE:
+        case PIPE_CAP_POST_DEPTH_COVERAGE:
+        case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX:
+        case PIPE_CAP_TGSI_BALLOT:
+        case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE:
+        case PIPE_CAP_POLYGON_MODE_FILL_RECTANGLE:
+        case PIPE_CAP_TGSI_CLOCK:
+        case PIPE_CAP_TGSI_TEX_TXF_LZ:
+        case PIPE_CAP_NATIVE_FENCE_FD:
+        case PIPE_CAP_FENCE_SIGNAL:
+        case PIPE_CAP_TGSI_MUL_ZERO_WINS:
+        case PIPE_CAP_NIR_SAMPLERS_AS_DEREF:
+        case PIPE_CAP_QUERY_SO_OVERFLOW:
+        case PIPE_CAP_MEMOBJ:
+        case PIPE_CAP_LOAD_CONSTBUF:
+        case PIPE_CAP_TILE_RASTER_ORDER:
+        case PIPE_CAP_STREAM_OUTPUT_INTERLEAVE_BUFFERS:
+        case PIPE_CAP_MAX_COMBINED_SHADER_OUTPUT_RESOURCES:
+        case PIPE_CAP_CONTEXT_PRIORITY_MASK:
+        case PIPE_CAP_CONSTBUF0_FLAGS:
+        case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_TRIANGLES:
+        case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_POINTS_LINES:
+        case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_TRIANGLES:
+        case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_POINTS_LINES:
+        case PIPE_CAP_CONSERVATIVE_RASTER_POST_DEPTH_COVERAGE:
+        case PIPE_CAP_MAX_CONSERVATIVE_RASTER_SUBPIXEL_PRECISION_BIAS:
+        case PIPE_CAP_PACKED_UNIFORMS:
+                return 0;
+
+                /* Geometry shader output, unsupported. */
+        case PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES:
+        case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS:
+                return 0;
+
+                /* Texturing. */
+        case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
+        case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+        case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+                return VC5_MAX_MIP_LEVELS;
+        case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
+                return 2048;
+
+                /* Render targets. */
+        case PIPE_CAP_MAX_RENDER_TARGETS:
+                return 4;
+
+                /* Queries. */
+        case PIPE_CAP_QUERY_TIME_ELAPSED:
+        case PIPE_CAP_QUERY_TIMESTAMP:
+                return 0;
+
+        case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE:
+                return 2048;
+
+        case PIPE_CAP_ENDIANNESS:
+                return PIPE_ENDIAN_LITTLE;
+
+        case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
+                return 64;
+
+        case PIPE_CAP_VENDOR_ID:
+                return 0x14E4; /* Broadcom's PCI vendor ID */
+        case PIPE_CAP_DEVICE_ID:
+                return 0xFFFFFFFF;
+        case PIPE_CAP_ACCELERATED:
+                return 1;
+        case PIPE_CAP_VIDEO_MEMORY: {
+                uint64_t system_memory;
+
+                if (!os_get_total_physical_memory(&system_memory))
+                        return 0;
+
+                return (int)(system_memory >> 20); /* MiB, assuming bytes in */
+        }
+        case PIPE_CAP_UMA:
+                return 1;
+
+        default:
+                fprintf(stderr, "unknown param %d\n", param);
+                return 0;
+        }
+}
+
+static float
+vc5_screen_get_paramf(struct pipe_screen *pscreen, enum pipe_capf param)
+{
+        float limit = 0.0f; /* caps that are unsupported or unknown stay 0 */
+        switch (param) {
+        case PIPE_CAPF_MAX_LINE_WIDTH:
+        case PIPE_CAPF_MAX_LINE_WIDTH_AA:
+                limit = 32.0f;
+                break;
+        case PIPE_CAPF_MAX_POINT_WIDTH:
+        case PIPE_CAPF_MAX_POINT_WIDTH_AA:
+                limit = 512.0f;
+                break;
+        case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
+                limit = 16.0f;
+                break;
+        case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY:
+        case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE:
+        case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE:
+        case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY:
+                break;
+        default:
+                fprintf(stderr, "unknown paramf %d\n", param);
+        }
+        return limit;
+}
+
+static int
+vc5_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
+                           enum pipe_shader_cap param)
+{
+        if (shader != PIPE_SHADER_VERTEX &&
+            shader != PIPE_SHADER_FRAGMENT) {
+                return 0;
+        }
+        /* Only the vertex and fragment stages get limits; other stages read
+         * 0 for everything.  Probably not totally correct, but it's a start: */
+        switch (param) {
+        case PIPE_SHADER_CAP_MAX_INSTRUCTIONS:
+        case PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS:
+        case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS:
+        case PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS:
+                return 16384;
+
+        case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH:
+                return INT_MAX; /* UINT_MAX does not fit the int return type */
+
+        case PIPE_SHADER_CAP_MAX_INPUTS:
+                if (shader == PIPE_SHADER_FRAGMENT)
+                        return VC5_MAX_FS_INPUTS / 4;
+                else
+                        return 16;
+        case PIPE_SHADER_CAP_MAX_OUTPUTS:
+                if (shader == PIPE_SHADER_FRAGMENT)
+                        return 4;
+                else
+                        return VC5_MAX_FS_INPUTS / 4;
+        case PIPE_SHADER_CAP_MAX_TEMPS:
+                return 256; /* GL_MAX_PROGRAM_TEMPORARIES_ARB */
+        case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE:
+                return 16 * 1024 * sizeof(float);
+        case PIPE_SHADER_CAP_MAX_CONST_BUFFERS:
+                return 16;
+        case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED:
+                return 0;
+        case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR:
+        case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
+        case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR:
+                return 0;
+        case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR:
+                return 1;
+        case PIPE_SHADER_CAP_SUBROUTINES:
+                return 0;
+        case PIPE_SHADER_CAP_INTEGERS:
+                return 1;
+        case PIPE_SHADER_CAP_FP16:
+        case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
+        case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
+        case PIPE_SHADER_CAP_TGSI_LDEXP_SUPPORTED:
+        case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+        case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
+        case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED:
+        case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS:
+        case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS:
+                return 0;
+        case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
+        case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
+        case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
+        case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
+                return VC5_MAX_TEXTURE_SAMPLERS;
+        case PIPE_SHADER_CAP_PREFERRED_IR:
+                return PIPE_SHADER_IR_NIR;
+        case PIPE_SHADER_CAP_SUPPORTED_IRS:
+                return 0;
+        case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+                return 32;
+        case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
+        case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
+                return 0;
+        default:
+                fprintf(stderr, "unknown shader param %d\n", param);
+                return 0;
+        }
+        return 0; /* not reached: every case above returns */
+}
+
+static boolean
+vc5_screen_is_format_supported(struct pipe_screen *pscreen,
+                               enum pipe_format format,
+                               enum pipe_texture_target target,
+                               unsigned sample_count,
+                               unsigned usage)
+{
+        struct vc5_screen *screen = vc5_screen(pscreen);
+        /* Multisampled surfaces must use exactly VC5_MAX_SAMPLES samples. */
+        if (sample_count > 1 && sample_count != VC5_MAX_SAMPLES)
+                return FALSE;
+
+        if ((target >= PIPE_MAX_TEXTURE_TYPES) ||
+            !util_format_is_supported(format, usage)) {
+                return FALSE;
+        }
+        /* Vertex buffer bindings accept only the formats listed below. */
+        if (usage & PIPE_BIND_VERTEX_BUFFER) {
+                switch (format) {
+                case PIPE_FORMAT_R32G32B32A32_FLOAT:
+                case PIPE_FORMAT_R32G32B32_FLOAT:
+                case PIPE_FORMAT_R32G32_FLOAT:
+                case PIPE_FORMAT_R32_FLOAT:
+                case PIPE_FORMAT_R32G32B32A32_SNORM:
+                case PIPE_FORMAT_R32G32B32_SNORM:
+                case PIPE_FORMAT_R32G32_SNORM:
+                case PIPE_FORMAT_R32_SNORM:
+                case PIPE_FORMAT_R32G32B32A32_SSCALED:
+                case PIPE_FORMAT_R32G32B32_SSCALED:
+                case PIPE_FORMAT_R32G32_SSCALED:
+                case PIPE_FORMAT_R32_SSCALED:
+                case PIPE_FORMAT_R16G16B16A16_UNORM:
+                case PIPE_FORMAT_R16G16B16_UNORM:
+                case PIPE_FORMAT_R16G16_UNORM:
+                case PIPE_FORMAT_R16_UNORM:
+                case PIPE_FORMAT_R16G16B16A16_SNORM:
+                case PIPE_FORMAT_R16G16B16_SNORM:
+                case PIPE_FORMAT_R16G16_SNORM:
+                case PIPE_FORMAT_R16_SNORM:
+                case PIPE_FORMAT_R16G16B16A16_USCALED:
+                case PIPE_FORMAT_R16G16B16_USCALED:
+                case PIPE_FORMAT_R16G16_USCALED:
+                case PIPE_FORMAT_R16_USCALED:
+                case PIPE_FORMAT_R16G16B16A16_SSCALED:
+                case PIPE_FORMAT_R16G16B16_SSCALED:
+                case PIPE_FORMAT_R16G16_SSCALED:
+                case PIPE_FORMAT_R16_SSCALED:
+                case PIPE_FORMAT_R8G8B8A8_UNORM:
+                case PIPE_FORMAT_R8G8B8_UNORM:
+                case PIPE_FORMAT_R8G8_UNORM:
+                case PIPE_FORMAT_R8_UNORM:
+                case PIPE_FORMAT_R8G8B8A8_SNORM:
+                case PIPE_FORMAT_R8G8B8_SNORM:
+                case PIPE_FORMAT_R8G8_SNORM:
+                case PIPE_FORMAT_R8_SNORM:
+                case PIPE_FORMAT_R8G8B8A8_USCALED:
+                case PIPE_FORMAT_R8G8B8_USCALED:
+                case PIPE_FORMAT_R8G8_USCALED:
+                case PIPE_FORMAT_R8_USCALED:
+                case PIPE_FORMAT_R8G8B8A8_SSCALED:
+                case PIPE_FORMAT_R8G8B8_SSCALED:
+                case PIPE_FORMAT_R8G8_SSCALED:
+                case PIPE_FORMAT_R8_SSCALED:
+                case PIPE_FORMAT_R10G10B10A2_UNORM:
+                case PIPE_FORMAT_B10G10R10A2_UNORM:
+                case PIPE_FORMAT_R10G10B10A2_SNORM:
+                case PIPE_FORMAT_B10G10R10A2_SNORM:
+                case PIPE_FORMAT_R10G10B10A2_USCALED:
+                case PIPE_FORMAT_B10G10R10A2_USCALED:
+                case PIPE_FORMAT_R10G10B10A2_SSCALED:
+                case PIPE_FORMAT_B10G10R10A2_SSCALED:
+                        break;
+                default:
+                        return FALSE;
+                }
+        }
+        /* Render-target and sampler support is decided from the device info. */
+        if ((usage & PIPE_BIND_RENDER_TARGET) &&
+            !vc5_rt_format_supported(&screen->devinfo, format)) {
+                return FALSE;
+        }
+
+        if ((usage & PIPE_BIND_SAMPLER_VIEW) &&
+            !vc5_tex_format_supported(&screen->devinfo, format)) {
+                return FALSE;
+        }
+        /* Depth/stencil and index bindings each take a fixed format list. */
+        if ((usage & PIPE_BIND_DEPTH_STENCIL) &&
+            !(format == PIPE_FORMAT_S8_UINT_Z24_UNORM ||
+              format == PIPE_FORMAT_X8Z24_UNORM ||
+              format == PIPE_FORMAT_Z16_UNORM ||
+              format == PIPE_FORMAT_Z32_FLOAT ||
+              format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)) {
+                return FALSE;
+        }
+
+        if ((usage & PIPE_BIND_INDEX_BUFFER) &&
+            !(format == PIPE_FORMAT_I8_UINT ||
+              format == PIPE_FORMAT_I16_UINT ||
+              format == PIPE_FORMAT_I32_UINT)) {
+                return FALSE;
+        }
+
+        return TRUE;
+}
+
+#define PTR_TO_UINT(x) ((unsigned)((intptr_t)(x)))
+/* Hash callback given to util_hash_table_create(): the key's own value. */
+static unsigned handle_hash(void *key)
+{
+    return PTR_TO_UINT(key);
+}
+
+static int handle_compare(void *key1, void *key2)
+{
+    return PTR_TO_UINT(key1) != PTR_TO_UINT(key2); /* 0 when the keys match */
+}
+
+static bool
+vc5_get_device_info(struct vc5_screen *screen)
+{
+        struct drm_v3d_get_param id0 = {
+                .param = DRM_V3D_PARAM_V3D_CORE0_IDENT0,
+        };
+        struct drm_v3d_get_param id1 = {
+                .param = DRM_V3D_PARAM_V3D_CORE0_IDENT1,
+        };
+
+        /* Fetch both core 0 identification values through the get-param
+         * ioctl; either failure is reported on stderr and ends the probe.
+         */
+        if (vc5_ioctl(screen->fd, DRM_IOCTL_V3D_GET_PARAM, &id0) != 0) {
+                fprintf(stderr, "Couldn't get V3D core IDENT0: %s\n",
+                        strerror(errno));
+                return false;
+        }
+        if (vc5_ioctl(screen->fd, DRM_IOCTL_V3D_GET_PARAM, &id1) != 0) {
+                fprintf(stderr, "Couldn't get V3D core IDENT1: %s\n",
+                        strerror(errno));
+                return false;
+        }
+
+        /* devinfo.ver is major * 10 + minor: major comes from bits 31:24 of
+         * the first value, minor from bits 3:0 of the second.
+         */
+        uint32_t ver_major = (id0.value >> 24) & 0xff;
+        uint32_t ver_minor = id1.value & 0xf;
+        screen->devinfo.ver = ver_major * 10 + ver_minor;
+
+        /* Versions other than 33, 41 and 42 are rejected with a message. */
+        if (screen->devinfo.ver != 33 &&
+            screen->devinfo.ver != 41 &&
+            screen->devinfo.ver != 42) {
+                fprintf(stderr,
+                        "V3D %d.%d not supported by this version of Mesa.\n",
+                        screen->devinfo.ver / 10,
+                        screen->devinfo.ver % 10);
+                return false;
+        }
+        return true;
+}
+
+static const void *
+vc5_screen_get_compiler_options(struct pipe_screen *pscreen,
+                                enum pipe_shader_ir ir, unsigned shader)
+{
+        return &v3d_nir_options; /* same table whatever ir/shader is asked */
+}
+
+struct pipe_screen *
+v3d_screen_create(int fd)
+{
+        struct vc5_screen *screen = rzalloc(NULL, struct vc5_screen);
+        struct pipe_screen *pscreen;
+
+        /* Returns NULL with fd closed on failure; else the screen owns fd. */
+        if (!screen) {
+                close(fd);
+                return NULL;
+        }
+        pscreen = &screen->base;
+        pscreen->destroy = vc5_screen_destroy;
+        pscreen->get_param = vc5_screen_get_param;
+        pscreen->get_paramf = vc5_screen_get_paramf;
+        pscreen->get_shader_param = vc5_screen_get_shader_param;
+        pscreen->context_create = vc5_context_create;
+        pscreen->is_format_supported = vc5_screen_is_format_supported;
+
+        screen->fd = fd;
+        list_inithead(&screen->bo_cache.time_list);
+        (void)mtx_init(&screen->bo_handles_mutex, mtx_plain);
+        screen->bo_handles = util_hash_table_create(handle_hash, handle_compare);
+#if defined(USE_V3D_SIMULATOR)
+        vc5_simulator_init(screen);
+#endif
+        if (!vc5_get_device_info(screen))
+                goto fail;
+
+        slab_create_parent(&screen->transfer_pool, sizeof(struct vc5_transfer), 16);
+        vc5_fence_init(screen);
+        v3d_process_debug_variable();
+        vc5_resource_screen_init(pscreen);
+        screen->compiler = v3d_compiler_init(&screen->devinfo);
+
+        pscreen->get_name = vc5_screen_get_name;
+        pscreen->get_vendor = vc5_screen_get_vendor;
+        pscreen->get_device_vendor = vc5_screen_get_vendor;
+        pscreen->get_compiler_options = vc5_screen_get_compiler_options;
+        return pscreen;
+
+fail:
+        /* bo_handles is created without a ralloc context, so free it here. */
+        if (screen->bo_handles)
+                util_hash_table_destroy(screen->bo_handles);
+        close(fd);
+        ralloc_free(pscreen);
+        return NULL;
+}
diff --git a/src/gallium/drivers/v3d/v3d_screen.h b/src/gallium/drivers/v3d/v3d_screen.h
new file mode 100644
index 00000000000..975bfe01a75
--- /dev/null
+++ b/src/gallium/drivers/v3d/v3d_screen.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright © 2014-2017 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef VC5_SCREEN_H
+#define VC5_SCREEN_H
+
+#include "pipe/p_screen.h"
+#include "os/os_thread.h"
+#include "state_tracker/drm_driver.h"
+#include "util/list.h"
+#include "util/slab.h"
+#include "broadcom/common/v3d_debug.h"
+#include "broadcom/common/v3d_device_info.h"
+
+struct vc5_bo;
+
+#define VC5_MAX_MIP_LEVELS 12 /* reported for 2D, cube and 3D textures */
+#define VC5_MAX_TEXTURE_SAMPLERS 32 /* also the view/image/buffer limit */
+#define VC5_MAX_SAMPLES 4 /* the only multisample count the screen accepts */
+#define VC5_MAX_DRAW_BUFFERS 4
+#define VC5_MAX_ATTRIBUTES 16
+
+/* These are tunable parameters in the HW design, but all the V3D
+ * implementations agree.
+ */
+#define VC5_UIFCFG_BANKS 8
+#define VC5_UIFCFG_PAGE_SIZE 4096
+#define VC5_UIFCFG_XOR_VALUE (1 << 4)
+#define VC5_PAGE_CACHE_SIZE (VC5_UIFCFG_PAGE_SIZE * VC5_UIFCFG_BANKS) /* 32768 */
+#define VC5_UBLOCK_SIZE 64
+#define VC5_UIFBLOCK_SIZE (4 * VC5_UBLOCK_SIZE) /* 256 */
+#define VC5_UIFBLOCK_ROW_SIZE (4 * VC5_UIFBLOCK_SIZE) /* 1024 */
+
+struct vc5_simulator_file;
+
+struct vc5_screen {
+        struct pipe_screen base; /* first member: vc5_screen() casts to it */
+        int fd; /* closed by vc5_screen_destroy() */
+
+        struct v3d_device_info devinfo; /* ver is major * 10 + minor */
+
+        const char *name; /* formatted lazily by the get_name hook */
+
+        struct slab_parent_pool transfer_pool; /* for struct vc5_transfer */
+
+        struct vc5_bo_cache {
+                /** List of struct vc5_bo freed, by age. */
+                struct list_head time_list;
+                /** List of struct vc5_bo freed, per size, by age. */
+                struct list_head *size_list;
+                uint32_t size_list_size;
+
+                mtx_t lock;
+
+                uint32_t bo_size;
+                uint32_t bo_count;
+        } bo_cache;
+
+        const struct v3d_compiler *compiler; /* from v3d_compiler_init() */
+
+        struct util_hash_table *bo_handles; /* freed at screen destroy */
+        mtx_t bo_handles_mutex; /* NOTE(review): presumably guards bo_handles */
+
+        uint32_t bo_size;
+        uint32_t bo_count;
+
+        struct vc5_simulator_file *sim_file;
+};
+
+static inline struct vc5_screen *
+vc5_screen(struct pipe_screen *screen)
+{
+        return (struct vc5_screen *)screen; /* base is the first member */
+}
+
+struct pipe_screen *v3d_screen_create(int fd);
+
+void
+vc5_fence_init(struct vc5_screen *screen);
+
+#endif /* VC5_SCREEN_H */
diff --git a/src/gallium/drivers/v3d/v3d_simulator.c b/src/gallium/drivers/v3d/v3d_simulator.c
new file mode 100644
index 00000000000..86e4ed3be3d
--- /dev/null
+++ b/src/gallium/drivers/v3d/v3d_simulator.c
@@ -0,0 +1,660 @@
+/*
+ * Copyright © 2014-2017 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * @file v3d_simulator.c
+ *
+ * Implements VC5 simulation on top of a non-VC5 GEM fd.
+ *
+ * This file's goal is to emulate the VC5 ioctls' behavior in the kernel on
+ * top of the simpenrose software simulator.  Generally, VC5 driver BOs have a
+ * GEM-side copy of their contents and a simulator-side memory area that the
+ * GEM contents get copied into during simulation.  Once simulation is done,
+ * the simulator's data is copied back out to the GEM BOs, so that rendering
+ * appears on the screen as if actual hardware rendering had been done.
+ *
+ * One of the limitations of this code is that we shouldn't really need a
+ * GEM-side BO for non-window-system BOs.  However, we do need unique BO
+ * handles for each of our GEM bos so that this file can look up its state
+ * from the handle passed in at submit ioctl time (also, a couple of places
+ * outside of this file still call ioctls directly on the fd).
+ *
+ * Another limitation is that BO import doesn't work unless the underlying
+ * window system's BO size matches what VC5 is going to use, which of course
+ * doesn't work out in practice.  This means that for now, only DRI3 (VC5
+ * makes the winsys BOs) is supported, not DRI2 (window system makes the
+ * winsys BOs).
+ */
+
+#ifdef USE_V3D_SIMULATOR
+
+#include <sys/mman.h>
+#include "util/hash_table.h"
+#include "util/ralloc.h"
+#include "util/set.h"
+#include "util/u_memory.h"
+#include "util/u_mm.h"
+#include "v3d_simulator_wrapper.h"
+
+#include "v3d_screen.h"
+#include "v3d_context.h"
+
+/** Global (across GEM fds) state for the simulator */
+static struct vc5_simulator_state {
+        /* Taken around most accesses to the shared heap and the hash tables
+         * used by this file.
+         */
+        mtx_t mutex;
+
+        /* Simulator instance created by v3d_hw_auto_new(). */
+        struct v3d_hw *v3d;
+        /* Result of v3d_hw_get_version(); compared against 41 to choose
+         * between the v3d41_* and v3d33_* entry points.
+         */
+        int ver;
+
+        /* Base virtual address of the heap. */
+        void *mem;
+        /* Base hardware address of the heap. */
+        uint32_t mem_base;
+        /* Size of the heap. */
+        size_t mem_size;
+
+        /* Allocator that hands out offsets within the simulator memory. */
+        struct mem_block *heap;
+        /* Only referenced by the disabled vc5_dump_to_file() in the code
+         * visible here.
+         */
+        struct mem_block *overflow;
+
+        /** Mapping from (GEM fd + 1) to struct vc5_simulator_file * */
+        struct hash_table *fd_map;
+
+        /* Incremented by vc5_simulator_init_global() and decremented by
+         * vc5_simulator_destroy(); the global state is created on the first
+         * reference and torn down on the last.
+         */
+        int refcount;
+} sim_state = {
+        .mutex = _MTX_INITIALIZER_NP,
+};
+
+/** Per-GEM-fd state for the simulator. */
+struct vc5_simulator_file {
+        /* NOTE(review): never assigned in the code visible here. */
+        int fd;
+
+        /** Mapping from GEM handle to struct vc5_simulator_bo * */
+        struct hash_table *bo_map;
+
+        /* Heap block holding this file's GMP permission table; its offset is
+         * handed to the v3d*_simulator_flush() calls.
+         */
+        struct mem_block *gmp;
+        /* CPU pointer to the same table; updated by set_gmp_flags(). */
+        void *gmp_vaddr;
+};
+
+/** Simulator-side tracking state for one BO. */
+struct vc5_simulator_bo {
+        /* Per-fd state this BO was created under. */
+        struct vc5_simulator_file *file;
+
+        /** Area for this BO within sim_state.mem */
+        struct mem_block *block;
+        /* Size in bytes, rounded up to a multiple of 4096 at creation. */
+        uint32_t size;
+        /* CPU pointer to the BO's contents inside the simulator memory; a
+         * BO_SENTINEL word is stored at vaddr + size.
+         */
+        void *vaddr;
+
+        /* mmap of the host BO and its row stride; only set up by
+         * vc5_simulator_open_from_handle(), NULL/0 otherwise.
+         */
+        void *winsys_map;
+        uint32_t winsys_stride;
+
+        /* GEM handle, or 0 for BOs that are not entered in bo_map. */
+        int handle;
+};
+
+/** Widens an integer into a pointer-sized value usable as a hash key. */
+static void *
+int_to_key(int key)
+{
+        uintptr_t widened = (uintptr_t)key;
+
+        return (void *)widened;
+}
+
+/**
+ * Returns the per-fd simulator state registered for the given GEM fd, or
+ * NULL if the fd has no entry in sim_state.fd_map.
+ */
+static struct vc5_simulator_file *
+vc5_get_simulator_file_for_fd(int fd)
+{
+        /* Entries are stored under fd + 1 (presumably so that fd 0 does not
+         * turn into a NULL key).
+         */
+        void *key = int_to_key(fd + 1);
+        struct hash_entry *found =
+                _mesa_hash_table_search(sim_state.fd_map, key);
+
+        if (!found)
+                return NULL;
+
+        return found->data;
+}
+
+/* A marker placed just after each BO, then checked after rendering to make
+ * sure it's still there.
+ */
+#define BO_SENTINEL		0xfedcba98
+
+/* 128kb */
+#define GMP_ALIGN2		17
+
+/**
+ * Sets the range of GPU virtual address space to have the given GMP
+ * permissions (bit 0 = read, bit 1 = write, write-only forbidden).
+ *
+ * The table at file->gmp_vaddr holds one 2-bit entry per
+ * (1 << GMP_ALIGN2) bytes of address space, 16 entries per 32-bit word.
+ *
+ * @param file   per-fd state whose GMP table is updated
+ * @param offset start of the range; must be (1 << GMP_ALIGN2)-aligned
+ * @param size   length of the range in bytes; rounded up to whole GMP pages
+ * @param flag   2-bit permission value (0..3) to store for every page
+ */
+static void
+set_gmp_flags(struct vc5_simulator_file *file,
+              uint32_t offset, uint32_t size, uint32_t flag)
+{
+        assert((offset & ((1 << GMP_ALIGN2) - 1)) == 0);
+        int gmp_offset = offset >> GMP_ALIGN2;
+        int gmp_count = align(size, 1 << GMP_ALIGN2) >> GMP_ALIGN2;
+        uint32_t *gmp = file->gmp_vaddr;
+
+        assert(flag <= 0x3);
+
+        for (int i = gmp_offset; i < gmp_offset + gmp_count; i++) {
+                /* The shift reaches 30, so the mask has to be unsigned:
+                 * shifting a signed 0x3 that far overflows int.
+                 */
+                uint32_t bitshift = (i % 16) * 2;
+                gmp[i / 16] &= ~(0x3u << bitshift);
+                gmp[i / 16] |= flag << bitshift;
+        }
+}
+
+/**
+ * Allocates space in simulator memory and returns a tracking struct for it.
+ *
+ * The size is rounded up to a multiple of 4096, and 4 extra bytes are
+ * allocated past the end to hold BO_SENTINEL.  The BO's range is marked
+ * read/write in the file's GMP table and its contents are filled with 0xd0.
+ *
+ * @param fd     GEM fd whose simulator file will own the BO
+ * @param handle GEM handle to register in the file's bo_map, or 0 to leave
+ *               the BO out of the lookup table
+ * @param size   requested size in bytes
+ */
+static struct vc5_simulator_bo *
+vc5_create_simulator_bo(int fd, int handle, unsigned size)
+{
+        struct vc5_simulator_file *file = vc5_get_simulator_file_for_fd(fd);
+        /* Allocated as a ralloc child of the per-fd state. */
+        struct vc5_simulator_bo *sim_bo = rzalloc(file,
+                                                  struct vc5_simulator_bo);
+        size = align(size, 4096);
+
+        sim_bo->file = file;
+        sim_bo->handle = handle;
+
+        /* The heap is shared by every fd, so allocate under the mutex.  The
+         * extra 4 bytes hold the sentinel written below.
+         */
+        mtx_lock(&sim_state.mutex);
+        sim_bo->block = u_mmAllocMem(sim_state.heap, size + 4, GMP_ALIGN2, 0);
+        mtx_unlock(&sim_state.mutex);
+        assert(sim_bo->block);
+
+        /* 0x3 = readable and writable. */
+        set_gmp_flags(file, sim_bo->block->ofs, size, 0x3);
+
+        sim_bo->size = size;
+        sim_bo->vaddr = sim_state.mem + sim_bo->block->ofs - sim_state.mem_base;
+        memset(sim_bo->vaddr, 0xd0, size);
+
+        *(uint32_t *)(sim_bo->vaddr + sim_bo->size) = BO_SENTINEL;
+
+        /* A handle of 0 is used for vc5_gem.c internal allocations that
+         * don't need to go in the lookup table.
+         */
+        if (handle != 0) {
+                mtx_lock(&sim_state.mutex);
+                _mesa_hash_table_insert(file->bo_map, int_to_key(handle),
+                                        sim_bo);
+                mtx_unlock(&sim_state.mutex);
+        }
+
+        return sim_bo;
+}
+
+/**
+ * Releases everything vc5_create_simulator_bo() set up for a BO: the winsys
+ * mapping (if any), its GMP permissions, its heap block, its bo_map entry
+ * and finally the tracking struct itself.
+ */
+static void
+vc5_free_simulator_bo(struct vc5_simulator_bo *sim_bo)
+{
+        struct vc5_simulator_file *sim_file = sim_bo->file;
+
+        if (sim_bo->winsys_map)
+                munmap(sim_bo->winsys_map, sim_bo->size);
+
+        /* 0x0 = neither readable nor writable. */
+        set_gmp_flags(sim_file, sim_bo->block->ofs, sim_bo->size, 0x0);
+
+        mtx_lock(&sim_state.mutex);
+        u_mmFreeMem(sim_bo->block);
+        /* Only BOs with a nonzero handle were entered in bo_map. */
+        if (sim_bo->handle) {
+                struct hash_entry *entry =
+                        _mesa_hash_table_search(sim_file->bo_map,
+                                                int_to_key(sim_bo->handle));
+                _mesa_hash_table_remove(sim_file->bo_map, entry);
+        }
+        mtx_unlock(&sim_state.mutex);
+        ralloc_free(sim_bo);
+}
+
+/**
+ * Looks up the simulator BO registered under a GEM handle in the given
+ * file's bo_map.  Returns NULL when the handle is not in the table.
+ */
+static struct vc5_simulator_bo *
+vc5_get_simulator_bo(struct vc5_simulator_file *file, int gem_handle)
+{
+        struct hash_entry *found;
+
+        mtx_lock(&sim_state.mutex);
+        found = _mesa_hash_table_search(file->bo_map, int_to_key(gem_handle));
+        mtx_unlock(&sim_state.mutex);
+
+        if (found == NULL)
+                return NULL;
+
+        return found->data;
+}
+
+/**
+ * Copies the contents of every BO referenced by the job from its GEM-side
+ * mapping into the simulator's memory.  Always returns 0.
+ */
+static int
+vc5_simulator_pin_bos(int fd, struct vc5_job *job)
+{
+        struct vc5_simulator_file *sim_file =
+                vc5_get_simulator_file_for_fd(fd);
+        struct set_entry *bo_entry;
+
+        set_foreach(job->bos, bo_entry) {
+                struct vc5_bo *bo = (struct vc5_bo *)bo_entry->key;
+                struct vc5_simulator_bo *sim_bo;
+
+                sim_bo = vc5_get_simulator_bo(sim_file, bo->handle);
+
+                /* Make sure bo->map is valid before reading from it. */
+                vc5_bo_map(bo);
+                memcpy(sim_bo->vaddr, bo->map, bo->size);
+        }
+
+        return 0;
+}
+
+/**
+ * Copies every BO referenced by the job back from the simulator's memory to
+ * its GEM-side mapping, reporting on stderr any BO whose trailing sentinel
+ * word was overwritten.  Always returns 0.
+ */
+static int
+vc5_simulator_unpin_bos(int fd, struct vc5_job *job)
+{
+        struct vc5_simulator_file *sim_file =
+                vc5_get_simulator_file_for_fd(fd);
+        struct set_entry *bo_entry;
+
+        set_foreach(job->bos, bo_entry) {
+                struct vc5_bo *bo = (struct vc5_bo *)bo_entry->key;
+                struct vc5_simulator_bo *sim_bo;
+                uint32_t *sentinel;
+
+                sim_bo = vc5_get_simulator_bo(sim_file, bo->handle);
+
+                /* The sentinel sits directly after the BO's contents. */
+                sentinel = (uint32_t *)(sim_bo->vaddr + sim_bo->size);
+                if (*sentinel != BO_SENTINEL)
+                        fprintf(stderr, "Buffer overflow in %s\n", bo->name);
+
+                vc5_bo_map(bo);
+                memcpy(bo->map, sim_bo->vaddr, bo->size);
+        }
+
+        return 0;
+}
+
+/* Compiled out: the function below references types and fields (struct
+ * vc5_exec_info, drm_vc5_get_hang_state, drm_gem_cma_object, ...) that are
+ * not declared in the code visible here.  It writes a numbered
+ * "vc5-dri-N.dump" file containing a version word, the hang state, a table
+ * of BO descriptors and then the BO contents.
+ *
+ * NOTE(review): if this is ever re-enabled, 'filename' is never freed, and
+ * the fopen() failure path returns without freeing 'state' and 'bo_state'.
+ */
+#if 0
+static void
+vc5_dump_to_file(struct vc5_exec_info *exec)
+{
+        static int dumpno = 0;
+        struct drm_vc5_get_hang_state *state;
+        struct drm_vc5_get_hang_state_bo *bo_state;
+        unsigned int dump_version = 0;
+
+        if (!(vc5_debug & VC5_DEBUG_DUMP))
+                return;
+
+        state = calloc(1, sizeof(*state));
+
+        /* Count the entries on the unref list. */
+        int unref_count = 0;
+        list_for_each_entry_safe(struct drm_vc5_bo, bo, &exec->unref_list,
+                                 unref_head) {
+                unref_count++;
+        }
+
+        /* Add one more for the overflow area that isn't wrapped in a BO. */
+        state->bo_count = exec->bo_count + unref_count + 1;
+        bo_state = calloc(state->bo_count, sizeof(*bo_state));
+
+        char *filename = NULL;
+        asprintf(&filename, "vc5-dri-%d.dump", dumpno++);
+        FILE *f = fopen(filename, "w+");
+        if (!f) {
+                fprintf(stderr, "Couldn't open %s: %s", filename,
+                        strerror(errno));
+                return;
+        }
+
+        fwrite(&dump_version, sizeof(dump_version), 1, f);
+
+        state->ct0ca = exec->ct0ca;
+        state->ct0ea = exec->ct0ea;
+        state->ct1ca = exec->ct1ca;
+        state->ct1ea = exec->ct1ea;
+        state->start_bin = exec->ct0ca;
+        state->start_render = exec->ct1ca;
+        fwrite(state, sizeof(*state), 1, f);
+
+        /* Descriptor table: exec BOs first, then unref-list BOs, then the
+         * overflow area.
+         */
+        int i;
+        for (i = 0; i < exec->bo_count; i++) {
+                struct drm_gem_cma_object *cma_bo = exec->bo[i];
+                bo_state[i].handle = i; /* Not used by the parser. */
+                bo_state[i].paddr = cma_bo->paddr;
+                bo_state[i].size = cma_bo->base.size;
+        }
+
+        list_for_each_entry_safe(struct drm_vc5_bo, bo, &exec->unref_list,
+                                 unref_head) {
+                struct drm_gem_cma_object *cma_bo = &bo->base;
+                bo_state[i].handle = 0;
+                bo_state[i].paddr = cma_bo->paddr;
+                bo_state[i].size = cma_bo->base.size;
+                i++;
+        }
+
+        /* Add the static overflow memory area. */
+        bo_state[i].handle = exec->bo_count;
+        bo_state[i].paddr = sim_state.overflow->ofs;
+        bo_state[i].size = sim_state.overflow->size;
+        i++;
+
+        fwrite(bo_state, sizeof(*bo_state), state->bo_count, f);
+
+        /* BO contents, in the same order as the descriptor table. */
+        for (int i = 0; i < exec->bo_count; i++) {
+                struct drm_gem_cma_object *cma_bo = exec->bo[i];
+                fwrite(cma_bo->vaddr, cma_bo->base.size, 1, f);
+        }
+
+        list_for_each_entry_safe(struct drm_vc5_bo, bo, &exec->unref_list,
+                                 unref_head) {
+                struct drm_gem_cma_object *cma_bo = &bo->base;
+                fwrite(cma_bo->vaddr, cma_bo->base.size, 1, f);
+        }
+
+        /* The overflow area is written out as zeroes. */
+        void *overflow = calloc(1, sim_state.overflow->size);
+        fwrite(overflow, 1, sim_state.overflow->size, f);
+        free(overflow);
+
+        free(state);
+        free(bo_state);
+        fclose(f);
+}
+#endif
+
+/**
+ * Runs one submitted job on the simulator.
+ *
+ * If the BO behind the first color buffer has a winsys mapping, its rows are
+ * first copied from that mapping into the BO's own map.  Every BO in the job
+ * is then copied into simulator memory, the version-specific flush entry
+ * point is called, the BOs are copied back, and finally the color buffer
+ * rows are copied back out to the winsys mapping.
+ *
+ * Returns 0, or the nonzero result of the pin/unpin step if one fails.
+ */
+int
+vc5_simulator_flush(struct vc5_context *vc5,
+                    struct drm_v3d_submit_cl *submit, struct vc5_job *job)
+{
+        struct vc5_screen *screen = vc5->screen;
+        int fd = screen->fd;
+        struct vc5_simulator_file *file = vc5_get_simulator_file_for_fd(fd);
+        struct vc5_surface *csurf = vc5_surface(vc5->framebuffer.cbufs[0]);
+        struct vc5_resource *ctex = csurf ? vc5_resource(csurf->base.texture) : NULL;
+        /* NOTE(review): csim_bo is dereferenced below whenever ctex is set,
+         * so this relies on the lookup succeeding.
+         */
+        struct vc5_simulator_bo *csim_bo = ctex ? vc5_get_simulator_bo(file, ctex->bo->handle) : NULL;
+        uint32_t winsys_stride = ctex ? csim_bo->winsys_stride : 0;
+        uint32_t sim_stride = ctex ? ctex->slices[0].stride : 0;
+        /* Copy only as many bytes per row as both layouts have. */
+        uint32_t row_len = MIN2(sim_stride, winsys_stride);
+        int ret;
+
+        /* Winsys -> BO, row by row, since the two strides may differ. */
+        if (ctex && csim_bo->winsys_map) {
+#if 0
+                fprintf(stderr, "%dx%d %d %d %d\n",
+                        ctex->base.b.width0, ctex->base.b.height0,
+                        winsys_stride,
+                        sim_stride,
+                        ctex->bo->size);
+#endif
+
+                for (int y = 0; y < ctex->base.height0; y++) {
+                        memcpy(ctex->bo->map + y * sim_stride,
+                               csim_bo->winsys_map + y * winsys_stride,
+                               row_len);
+                }
+        }
+
+        ret = vc5_simulator_pin_bos(fd, job);
+        if (ret)
+                return ret;
+
+        //vc5_dump_to_file(&exec);
+
+        /* The GMP table's offset is passed along with the submit. */
+        if (sim_state.ver >= 41)
+                v3d41_simulator_flush(sim_state.v3d, submit, file->gmp->ofs);
+        else
+                v3d33_simulator_flush(sim_state.v3d, submit, file->gmp->ofs);
+
+        ret = vc5_simulator_unpin_bos(fd, job);
+        if (ret)
+                return ret;
+
+        /* BO -> winsys, so the result becomes visible in the host BO. */
+        if (ctex && csim_bo->winsys_map) {
+                for (int y = 0; y < ctex->base.height0; y++) {
+                        memcpy(csim_bo->winsys_map + y * winsys_stride,
+                               ctex->bo->map + y * sim_stride,
+                               row_len);
+                }
+        }
+
+        return 0;
+}
+
+/**
+ * Map the underlying GEM object from the real hardware GEM handle.
+ *
+ * Asks the host for the dumb-buffer mmap offset of sim_bo->handle and maps
+ * sim_bo->size bytes of it read/write and shared.  Aborts the process if
+ * either step fails, so the returned pointer is always valid.
+ */
+static void *
+vc5_simulator_map_winsys_bo(int fd, struct vc5_simulator_bo *sim_bo)
+{
+        struct drm_mode_map_dumb map_dumb = {
+                .handle = sim_bo->handle,
+        };
+
+        if (drmIoctl(fd, DRM_IOCTL_MODE_MAP_DUMB, &map_dumb) != 0) {
+                fprintf(stderr, "map ioctl failure\n");
+                abort();
+        }
+
+        void *mapping = mmap(NULL, sim_bo->size, PROT_READ | PROT_WRITE,
+                             MAP_SHARED, fd, map_dumb.offset);
+        if (mapping == MAP_FAILED) {
+                fprintf(stderr,
+                        "mmap of bo %d (offset 0x%016llx, size %d) failed\n",
+                        sim_bo->handle, (long long)map_dumb.offset,
+                        (int)sim_bo->size);
+                abort();
+        }
+
+        return mapping;
+}
+
+/**
+ * Do fixups after a BO has been opened from a handle.
+ *
+ * Creates the simulator-side BO for the handle, then records the window
+ * system's stride and maps the host BO so that vc5_simulator_flush() can
+ * copy rows between the two.
+ *
+ * This could be done at DRM_IOCTL_GEM_OPEN/DRM_IOCTL_GEM_PRIME_FD_TO_HANDLE
+ * time, but we're still using drmPrimeFDToHandle() so we have this helper to
+ * be called afterward instead.
+ */
+void vc5_simulator_open_from_handle(int fd, uint32_t winsys_stride,
+                                    int handle, uint32_t size)
+{
+        struct vc5_simulator_bo *imported;
+
+        imported = vc5_create_simulator_bo(fd, handle, size);
+        imported->winsys_stride = winsys_stride;
+        imported->winsys_map = vc5_simulator_map_winsys_bo(fd, imported);
+}
+
+/**
+ * Simulated ioctl(fd, DRM_VC5_CREATE_BO) implementation.
+ *
+ * Making a VC5 BO is just a matter of making a corresponding BO on the host
+ * (a dumb buffer 128 bytes wide at 8 bpp, tall enough to cover args->size)
+ * plus a matching block in simulator memory.
+ *
+ * On success, fills in args->handle and args->offset and returns 0.  If the
+ * host allocation fails, returns drmIoctl()'s result without touching args
+ * or allocating any simulator memory.
+ */
+static int
+vc5_simulator_create_bo_ioctl(int fd, struct drm_v3d_create_bo *args)
+{
+        int ret;
+        struct drm_mode_create_dumb create = {
+                .width = 128,
+                .bpp = 8,
+                /* Round up to whole 128-byte rows. */
+                .height = (args->size + 127) / 128,
+        };
+
+        ret = drmIoctl(fd, DRM_IOCTL_MODE_CREATE_DUMB, &create);
+        /* On failure create.handle/create.size are not meaningful; bail out
+         * instead of registering (and leaking) a simulator BO for them.
+         */
+        if (ret != 0)
+                return ret;
+
+        assert(create.size >= args->size);
+
+        args->handle = create.handle;
+
+        struct vc5_simulator_bo *sim_bo =
+                vc5_create_simulator_bo(fd, create.handle, args->size);
+
+        args->offset = sim_bo->block->ofs;
+
+        return ret;
+}
+
+/**
+ * Simulated ioctl(fd, DRM_VC5_MMAP_BO) implementation.
+ *
+ * We just pass this straight through to dumb mmap: args->offset receives
+ * whatever offset the host reports and the host ioctl's result is returned.
+ */
+static int
+vc5_simulator_mmap_bo_ioctl(int fd, struct drm_v3d_mmap_bo *args)
+{
+        struct drm_mode_map_dumb dumb = {
+                .handle = args->handle,
+        };
+        int err = drmIoctl(fd, DRM_IOCTL_MODE_MAP_DUMB, &dumb);
+
+        args->offset = dumb.offset;
+
+        return err;
+}
+
+/**
+ * Simulated GET_BO_OFFSET ioctl: reports the offset of the BO's block in
+ * simulator memory.  Always returns 0.
+ */
+static int
+vc5_simulator_get_bo_offset_ioctl(int fd, struct drm_v3d_get_bo_offset *args)
+{
+        struct vc5_simulator_file *sim_file =
+                vc5_get_simulator_file_for_fd(fd);
+        struct vc5_simulator_bo *bo =
+                vc5_get_simulator_bo(sim_file, args->handle);
+
+        args->offset = bo->block->ofs;
+        return 0;
+}
+
+/**
+ * Simulated GEM_CLOSE ioctl: drops the simulator's tracking for the handle
+ * and then closes the handle on the host, returning the host's result.
+ */
+static int
+vc5_simulator_gem_close_ioctl(int fd, struct drm_gem_close *args)
+{
+        struct vc5_simulator_file *sim_file =
+                vc5_get_simulator_file_for_fd(fd);
+
+        /* Free the simulator's internal tracking first... */
+        vc5_free_simulator_bo(vc5_get_simulator_bo(sim_file, args->handle));
+
+        /* ...then pass the call on down. */
+        return drmIoctl(fd, DRM_IOCTL_GEM_CLOSE, args);
+}
+
+/**
+ * Simulated GET_PARAM ioctl: forwards to the implementation matching the
+ * simulated hardware version.
+ */
+static int
+vc5_simulator_get_param_ioctl(int fd, struct drm_v3d_get_param *args)
+{
+        if (sim_state.ver < 41)
+                return v3d33_simulator_get_param_ioctl(sim_state.v3d, args);
+
+        return v3d41_simulator_get_param_ioctl(sim_state.v3d, args);
+}
+
+/**
+ * Entry point replacing ioctl() on the GEM fd when running on the simulator.
+ *
+ * Dispatches each supported request to its simulated implementation (or
+ * straight to the host for GEM_OPEN/GEM_FLINK) and returns that result.
+ * Any other request prints a message and aborts.
+ */
+int
+vc5_simulator_ioctl(int fd, unsigned long request, void *args)
+{
+        int result;
+
+        switch (request) {
+        case DRM_IOCTL_V3D_CREATE_BO:
+                result = vc5_simulator_create_bo_ioctl(fd, args);
+                break;
+        case DRM_IOCTL_V3D_MMAP_BO:
+                result = vc5_simulator_mmap_bo_ioctl(fd, args);
+                break;
+        case DRM_IOCTL_V3D_GET_BO_OFFSET:
+                result = vc5_simulator_get_bo_offset_ioctl(fd, args);
+                break;
+        case DRM_IOCTL_V3D_GET_PARAM:
+                result = vc5_simulator_get_param_ioctl(fd, args);
+                break;
+        case DRM_IOCTL_GEM_CLOSE:
+                result = vc5_simulator_gem_close_ioctl(fd, args);
+                break;
+
+        case DRM_IOCTL_V3D_WAIT_BO:
+                /* We do all of the vc5 rendering synchronously, so we just
+                 * return immediately on the wait ioctls.  This ignores any
+                 * native rendering to the host BO, so it does mean we race on
+                 * front buffer rendering.
+                 */
+                result = 0;
+                break;
+
+        case DRM_IOCTL_GEM_OPEN:
+        case DRM_IOCTL_GEM_FLINK:
+                /* Handled entirely by the host. */
+                result = drmIoctl(fd, request, args);
+                break;
+
+        default:
+                fprintf(stderr, "Unknown ioctl 0x%08x\n", (int)request);
+                abort();
+        }
+
+        return result;
+}
+
+/**
+ * Takes a reference on the process-wide simulator state, creating it on the
+ * first call: the simulator instance and its 1GB memory, the offset
+ * allocator over that memory, the fd lookup table and the initial register
+ * state.
+ *
+ * The whole setup runs under sim_state.mutex so that a concurrent caller
+ * that merely bumps the refcount cannot return before fd_map and the
+ * registers are ready.
+ *
+ * @param devinfo currently unused
+ */
+static void
+vc5_simulator_init_global(const struct v3d_device_info *devinfo)
+{
+        mtx_lock(&sim_state.mutex);
+        if (sim_state.refcount++) {
+                mtx_unlock(&sim_state.mutex);
+                return;
+        }
+
+        sim_state.v3d = v3d_hw_auto_new(NULL);
+        v3d_hw_alloc_mem(sim_state.v3d, 1024 * 1024 * 1024);
+        sim_state.mem_base =
+                v3d_hw_get_mem(sim_state.v3d, &sim_state.mem_size,
+                               &sim_state.mem);
+
+        /* Allocate from anywhere from 4096 up.  We don't allocate at 0,
+         * because for OQs and some other addresses in the HW, 0 means
+         * disabled.
+         */
+        sim_state.heap = u_mmInit(4096, sim_state.mem_size - 4096);
+
+        /* Make a block of 0xd0 as the very first allocation, to make sure
+         * we don't screw up and land there.
+         */
+        struct mem_block *b = u_mmAllocMem(sim_state.heap, 4096, GMP_ALIGN2, 0);
+        assert(b);
+        memset(sim_state.mem + b->ofs - sim_state.mem_base, 0xd0, 4096);
+
+        sim_state.ver = v3d_hw_get_version(sim_state.v3d);
+
+        sim_state.fd_map =
+                _mesa_hash_table_create(NULL,
+                                        _mesa_hash_pointer,
+                                        _mesa_key_pointer_equal);
+
+        if (sim_state.ver >= 41)
+                v3d41_simulator_init_regs(sim_state.v3d);
+        else
+                v3d33_simulator_init_regs(sim_state.v3d);
+
+        /* Only now is the global state safe for other threads to use. */
+        mtx_unlock(&sim_state.mutex);
+}
+
+/**
+ * Sets up the simulator for one screen: takes a reference on the global
+ * state, allocates the per-fd state (owned by the screen), gives it a BO
+ * lookup table and a GMP permission table, and registers it under the
+ * screen's fd.
+ */
+void
+vc5_simulator_init(struct vc5_screen *screen)
+{
+        vc5_simulator_init_global(&screen->devinfo);
+
+        screen->sim_file = rzalloc(screen, struct vc5_simulator_file);
+        struct vc5_simulator_file *sim_file = screen->sim_file;
+
+        sim_file->bo_map =
+                _mesa_hash_table_create(sim_file,
+                                        _mesa_hash_pointer,
+                                        _mesa_key_pointer_equal);
+
+        /* The heap is shared between all fds, so allocate from it under the
+         * mutex like the other heap users in this file, and only publish
+         * the file in fd_map once its GMP table exists.
+         */
+        mtx_lock(&sim_state.mutex);
+
+        /* set_gmp_flags() stores 2 bits per (1 << GMP_ALIGN2)-byte page, so
+         * covering every 32-bit offset takes
+         * (2^32 >> GMP_ALIGN2) * 2 / 8 = 8192 bytes.
+         */
+        sim_file->gmp = u_mmAllocMem(sim_state.heap, 8192, GMP_ALIGN2, 0);
+        assert(sim_file->gmp);
+        sim_file->gmp_vaddr = (sim_state.mem + sim_file->gmp->ofs -
+                               sim_state.mem_base);
+
+        _mesa_hash_table_insert(sim_state.fd_map, int_to_key(screen->fd + 1),
+                                sim_file);
+
+        mtx_unlock(&sim_state.mutex);
+}
+
+/**
+ * Drops one reference on the global simulator state and tears down the fd
+ * table and the heap when the last reference goes away.
+ *
+ * NOTE(review): nothing here removes this screen's entry from fd_map or
+ * frees its GMP block while other screens still hold references.
+ */
+void
+vc5_simulator_destroy(struct vc5_screen *screen)
+{
+        mtx_lock(&sim_state.mutex);
+
+        sim_state.refcount--;
+        if (sim_state.refcount == 0) {
+                _mesa_hash_table_destroy(sim_state.fd_map, NULL);
+                u_mmDestroy(sim_state.heap);
+                /* No memsetting the struct, because it contains the mutex. */
+                sim_state.mem = NULL;
+        }
+
+        mtx_unlock(&sim_state.mutex);
+}
+
+#endif /* USE_V3D_SIMULATOR */
diff --git a/src/gallium/drivers/v3d/v3d_simulator_wrapper.cpp b/src/gallium/drivers/v3d/v3d_simulator_wrapper.cpp
new file mode 100644
index 00000000000..7b04ded2b53
--- /dev/null
+++ b/src/gallium/drivers/v3d/v3d_simulator_wrapper.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright © 2017 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file
+ *
+ * Wraps bits of the V3D simulator interface in a C interface for the
+ * v3d_simulator.c code to use.
+ */
+
+#ifdef USE_V3D_SIMULATOR
+
+#include "v3d_simulator_wrapper.h"
+
+#define V3D_TECH_VERSION 3
+#define V3D_REVISION 3
+#define V3D_SUB_REV 0
+#define V3D_HIDDEN_REV 0
+#define V3D_COMPAT_REV 0
+#include "v3d_hw_auto.h"
+
+extern "C" {
+
+/** Creates a simulator instance; in_params is not used. */
+struct v3d_hw *v3d_hw_auto_new(void *in_params)
+{
+        /* release() hands ownership of the new object to the C caller. */
+        return v3d_hw_auto_make_unique().release();
+}
+
+/** Forwards to v3d_hw::get_mem(), returning its result and out-params. */
+uint32_t v3d_hw_get_mem(const struct v3d_hw *hw, size_t *size, void **p)
+{
+        return hw->get_mem(size, p);
+}
+
+/** Returns true iff v3d_hw::alloc_mem() reports V3D_HW_ALLOC_SUCCESS. */
+bool v3d_hw_alloc_mem(struct v3d_hw *hw, size_t min_size)
+{
+        const bool succeeded =
+                (hw->alloc_mem(min_size) == V3D_HW_ALLOC_SUCCESS);
+
+        return succeeded;
+}
+
+/** Forwards to v3d_hw::has_gca(). */
+bool v3d_hw_has_gca(struct v3d_hw *hw)
+{
+        return hw->has_gca();
+}
+
+/** Reads a simulator register. */
+uint32_t v3d_hw_read_reg(struct v3d_hw *hw, uint32_t reg)
+{
+        return hw->read_reg(reg);
+}
+
+/** Writes a simulator register. */
+void v3d_hw_write_reg(struct v3d_hw *hw, uint32_t reg, uint32_t val)
+{
+        hw->write_reg(reg, val);
+}
+
+/** Forwards to v3d_hw::tick(). */
+void v3d_hw_tick(struct v3d_hw *hw)
+{
+        hw->tick();
+}
+
+/**
+ * Returns the simulated hardware version as tech_version * 10 + revision
+ * (the value callers compare against 41).
+ */
+int v3d_hw_get_version(struct v3d_hw *hw)
+{
+        const V3D_HUB_IDENT_T *hub_ident = hw->get_hub_ident();
+
+        return hub_ident->tech_version * 10 + hub_ident->revision;
+}
+
+}
+
+#endif /* USE_V3D_SIMULATOR */
diff --git a/src/gallium/drivers/v3d/v3d_simulator_wrapper.h b/src/gallium/drivers/v3d/v3d_simulator_wrapper.h
new file mode 100644
index 00000000000..8b5dca15ed9
--- /dev/null
+++ b/src/gallium/drivers/v3d/v3d_simulator_wrapper.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright © 2017 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/* <stdbool.h> is needed because the prototypes below use 'bool' and this
+ * header is included from C code as well as C++.
+ */
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+/* Opaque to C callers; only the C++ wrapper sees the definition. */
+struct v3d_hw;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* C entry points implemented in v3d_simulator_wrapper.cpp. */
+struct v3d_hw *v3d_hw_auto_new(void *params);
+uint32_t v3d_hw_get_mem(const struct v3d_hw *hw, size_t *size, void **p);
+bool v3d_hw_alloc_mem(struct v3d_hw *hw, size_t min_size);
+bool v3d_hw_has_gca(struct v3d_hw *hw);
+uint32_t v3d_hw_read_reg(struct v3d_hw *hw, uint32_t reg);
+void v3d_hw_write_reg(struct v3d_hw *hw, uint32_t reg, uint32_t val);
+void v3d_hw_tick(struct v3d_hw *hw);
+int v3d_hw_get_version(struct v3d_hw *hw);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/gallium/drivers/v3d/v3d_tiling.c b/src/gallium/drivers/v3d/v3d_tiling.c
new file mode 100644
index 00000000000..f9c4a342184
--- /dev/null
+++ b/src/gallium/drivers/v3d/v3d_tiling.c
@@ -0,0 +1,389 @@
+/*
+ * Copyright © 2014-2017 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file v3d_tiling.c
+ *
+ * Handles information about the VC5 tiling formats, and loading and storing
+ * from them.
+ */
+
+#include <stdint.h>
+#include "v3d_screen.h"
+#include "v3d_context.h"
+#include "v3d_tiling.h"
+
+/**
+ * Width, in pixels, of one 64-byte microtile (utile) at \p cpp bytes per
+ * pixel.  Only cpp values of 1, 2, 4, 8 and 16 are handled; anything else
+ * hits unreachable().
+ */
+uint32_t
+vc5_utile_width(int cpp)
+{
+        if (cpp == 1 || cpp == 2)
+                return 8;
+        if (cpp == 4 || cpp == 8)
+                return 4;
+        if (cpp == 16)
+                return 2;
+
+        unreachable("unknown cpp");
+}
+
+/**
+ * Height, in pixels, of one 64-byte microtile (utile) at \p cpp bytes per
+ * pixel.  Only cpp values of 1, 2, 4, 8 and 16 are handled; anything else
+ * hits unreachable().
+ */
+uint32_t
+vc5_utile_height(int cpp)
+{
+        if (cpp == 1)
+                return 8;
+        if (cpp == 2 || cpp == 4)
+                return 4;
+        if (cpp == 8 || cpp == 16)
+                return 2;
+
+        unreachable("unknown cpp");
+}
+
+/**
+ * Byte offset of pixel (\p x, \p y) from the start of its utile.
+ *
+ * A utile holds 64 bytes of pixels in raster order (e.g. 4x4 pixels at 4
+ * bytes per pixel); x and y must already be utile-relative.
+ */
+static inline uint32_t
+vc5_get_utile_pixel_offset(uint32_t cpp, uint32_t x, uint32_t y)
+{
+        const uint32_t w = vc5_utile_width(cpp);
+        const uint32_t row_bytes = w * cpp;
+
+        assert(x < w);
+        assert(y < vc5_utile_height(cpp));
+        return y * row_bytes + x * cpp;
+}
+
+/**
+ * Byte offset of pixel (\p x, \p y) in a LINEARTILE image.
+ *
+ * LINEARTILE is one row or one column of utiles, so at most one of the two
+ * utile indices may be non-zero.  \p image_h is unused.
+ */
+static inline uint32_t
+vc5_get_lt_pixel_offset(uint32_t cpp, uint32_t image_h, uint32_t x, uint32_t y)
+{
+        const uint32_t w = vc5_utile_width(cpp);
+        const uint32_t h = vc5_utile_height(cpp);
+        const uint32_t col = x / w;
+        const uint32_t row = y / h;
+
+        assert(col == 0 || row == 0);
+
+        /* Each utile is 64 bytes; w and h are powers of two. */
+        return 64 * (col + row) +
+               vc5_get_utile_pixel_offset(cpp, x & (w - 1), y & (h - 1));
+}
+
+/**
+ * Byte offset of pixel (\p x, \p y) in a UBLINEAR image.
+ *
+ * UBLINEAR stores pixels in 256-byte UIF blocks (2x2 utiles) and lays the
+ * blocks out in raster order, \p ublinear_number (1 or 2) blocks per row.
+ */
+static inline uint32_t
+vc5_get_ublinear_pixel_offset(uint32_t cpp, uint32_t x, uint32_t y,
+                              int ublinear_number)
+{
+        const uint32_t w = vc5_utile_width(cpp);
+        const uint32_t h = vc5_utile_height(cpp);
+
+        /* Which UIF block the pixel lands in, counted in raster order. */
+        const uint32_t block_col = x / (w * 2);
+        const uint32_t block_row = y / (h * 2);
+        uint32_t offset = 256 * (block_row * ublinear_number + block_col);
+
+        /* Utile inside the block: right column +64, bottom row +128. */
+        offset += (x & w) ? 64 : 0;
+        offset += (y & h) ? 128 : 0;
+
+        return offset +
+               vc5_get_utile_pixel_offset(cpp, x & (w - 1), y & (h - 1));
+}
+
+static inline uint32_t /* get_pixel_offset callback; image_h is unused */
+vc5_get_ublinear_2_column_pixel_offset(uint32_t cpp, uint32_t image_h,
+                                       uint32_t x, uint32_t y)
+{
+        return vc5_get_ublinear_pixel_offset(cpp, x, y, 2); /* 2 columns */
+}
+
+static inline uint32_t /* get_pixel_offset callback; image_h is unused */
+vc5_get_ublinear_1_column_pixel_offset(uint32_t cpp, uint32_t image_h,
+                                       uint32_t x, uint32_t y)
+{
+        return vc5_get_ublinear_pixel_offset(cpp, x, y, 1); /* 1 column */
+}
+
+/**
+ * Returns the byte offset of pixel (\p x, \p y) in a UIF image that is
+ * \p image_h pixels tall; \p do_xor selects the XOR addressing variant.
+ * UIF is the general VC5 tiling layout shared across 3D, media, and scanout.
+ * It stores pixels in UIF blocks (2x2 utiles), and UIF blocks are stored in
+ * 4x4 groups, and those 4x4 groups are then stored in raster order.
+ */
+static inline uint32_t
+vc5_get_uif_pixel_offset(uint32_t cpp, uint32_t image_h, uint32_t x, uint32_t y,
+                         bool do_xor)
+{
+        uint32_t utile_w = vc5_utile_width(cpp);
+        uint32_t utile_h = vc5_utile_height(cpp);
+        uint32_t mb_width = utile_w * 2;
+        uint32_t mb_height = utile_h * 2;
+        uint32_t log2_mb_width = ffs(mb_width) - 1;
+        uint32_t log2_mb_height = ffs(mb_height) - 1;
+
+        /* Macroblock (UIF block, 2x2 utiles) coordinates of the pixel */
+        uint32_t mb_x = x >> log2_mb_width;
+        uint32_t mb_y = y >> log2_mb_height;
+        /* Pixel coordinates relative to the macroblock's origin */
+        uint32_t mb_pixel_x = x - (mb_x << log2_mb_width);
+        uint32_t mb_pixel_y = y - (mb_y << log2_mb_height);
+
+        if (do_xor && (mb_x / 4) & 1) /* odd 4-macroblock-wide column */
+                mb_y ^= 0x10;
+
+        uint32_t mb_h = align(image_h, 1 << log2_mb_height) >> log2_mb_height;
+        uint32_t mb_id = ((mb_x / 4) * ((mb_h - 1) * 4)) + mb_x + mb_y * 4;
+
+        uint32_t mb_base_addr = mb_id * 256; /* 4 utiles of 64 bytes each */
+
+        bool top = mb_pixel_y < utile_h;
+        bool left = mb_pixel_x < utile_w;
+
+        /* Docs have this in pixels, we do bytes: +128 bottom, +64 right. */
+        uint32_t mb_tile_offset = (!top * 128 + !left * 64);
+
+        uint32_t utile_x = mb_pixel_x & (utile_w - 1);
+        uint32_t utile_y = mb_pixel_y & (utile_h - 1);
+
+        uint32_t mb_pixel_address = (mb_base_addr +
+                                     mb_tile_offset +
+                                     vc5_get_utile_pixel_offset(cpp,
+                                                                utile_x,
+                                                                utile_y));
+
+        return mb_pixel_address;
+}
+
+static inline uint32_t /* get_pixel_offset callback for UIF with XOR */
+vc5_get_uif_xor_pixel_offset(uint32_t cpp, uint32_t image_h,
+                             uint32_t x, uint32_t y)
+{
+        return vc5_get_uif_pixel_offset(cpp, image_h, x, y, true); /* do_xor */
+}
+
+static inline uint32_t /* get_pixel_offset callback for UIF without XOR */
+vc5_get_uif_no_xor_pixel_offset(uint32_t cpp, uint32_t image_h,
+                                uint32_t x, uint32_t y)
+{
+        return vc5_get_uif_pixel_offset(cpp, image_h, x, y, false); /* !do_xor */
+}
+
+/**
+ * Copies \p box pixel by pixel between the tiled \p gpu image and the
+ * linear \p cpu buffer, whose rows are \p cpu_stride bytes apart and start
+ * at the box origin.  \p get_pixel_offset maps image coordinates to a byte
+ * offset in \p gpu.  \p is_load copies gpu -> cpu, otherwise cpu -> gpu.
+ * \p gpu_stride is unused.
+ */
+static inline void
+vc5_move_pixels_general_percpp(void *gpu, uint32_t gpu_stride,
+                               void *cpu, uint32_t cpu_stride,
+                               int cpp, uint32_t image_h,
+                               const struct pipe_box *box,
+                               uint32_t (*get_pixel_offset)(uint32_t cpp,
+                                                            uint32_t image_h,
+                                                            uint32_t x, uint32_t y),
+                               bool is_load)
+{
+        for (uint32_t y = 0; y < box->height; y++) {
+                void *linear_row = cpu + y * cpu_stride;
+                for (int x = 0; x < box->width; x++) {
+                        uint32_t tiled_offset =
+                                get_pixel_offset(cpp, image_h,
+                                                 box->x + x, box->y + y);
+                        void *tiled = gpu + tiled_offset;
+                        void *linear = linear_row + x * cpp;
+
+                        if (false) {
+                                fprintf(stderr, "%3d,%3d -> %d\n",
+                                        box->x + x, box->y + y, tiled_offset);
+                        }
+
+                        memcpy(is_load ? linear : tiled,
+                               is_load ? tiled : linear, cpp);
+                }
+        }
+}
+
+/**
+ * Dispatches to vc5_move_pixels_general_percpp() with \p cpp passed as a
+ * literal constant in each case (presumably so the inlined copy loop can be
+ * specialized per pixel size -- confirm).
+ *
+ * Unsupported \p cpp values hit unreachable(), as they do in
+ * vc5_utile_width()/vc5_utile_height(), rather than silently copying nothing.
+ */
+static inline void
+vc5_move_pixels_general(void *gpu, uint32_t gpu_stride,
+                        void *cpu, uint32_t cpu_stride,
+                        int cpp, uint32_t image_h,
+                        const struct pipe_box *box,
+                        uint32_t (*get_pixel_offset)(uint32_t cpp,
+                                                     uint32_t image_h,
+                                                     uint32_t x, uint32_t y),
+                        bool is_load)
+{
+        switch (cpp) {
+        case 1:
+                vc5_move_pixels_general_percpp(gpu, gpu_stride, cpu, cpu_stride,
+                                               1, image_h, box,
+                                               get_pixel_offset, is_load);
+                break;
+        case 2:
+                vc5_move_pixels_general_percpp(gpu, gpu_stride, cpu, cpu_stride,
+                                               2, image_h, box,
+                                               get_pixel_offset, is_load);
+                break;
+        case 4:
+                vc5_move_pixels_general_percpp(gpu, gpu_stride, cpu, cpu_stride,
+                                               4, image_h, box,
+                                               get_pixel_offset, is_load);
+                break;
+        case 8:
+                vc5_move_pixels_general_percpp(gpu, gpu_stride, cpu, cpu_stride,
+                                               8, image_h, box,
+                                               get_pixel_offset, is_load);
+                break;
+        case 16:
+                vc5_move_pixels_general_percpp(gpu, gpu_stride, cpu, cpu_stride,
+                                               16, image_h, box,
+                                               get_pixel_offset, is_load);
+                break;
+        default:
+                unreachable("unknown cpp");
+        }
+}
+
+/**
+ * Copies \p box between a tiled GPU image and a linear CPU buffer using the
+ * per-pixel address function that matches \p tiling_format.  \p is_load
+ * copies gpu -> cpu, otherwise cpu -> gpu.  Tiling modes without a case
+ * here hit unreachable().
+ */
+static inline void
+vc5_move_tiled_image(void *gpu, uint32_t gpu_stride,
+                     void *cpu, uint32_t cpu_stride,
+                     enum vc5_tiling_mode tiling_format,
+                     int cpp,
+                     uint32_t image_h,
+                     const struct pipe_box *box,
+                     bool is_load)
+{
+        switch (tiling_format) {
+        case VC5_TILING_UIF_XOR:
+                vc5_move_pixels_general(gpu, gpu_stride, cpu, cpu_stride,
+                                        cpp, image_h, box,
+                                        vc5_get_uif_xor_pixel_offset,
+                                        is_load);
+                break;
+        case VC5_TILING_UIF_NO_XOR:
+                vc5_move_pixels_general(gpu, gpu_stride, cpu, cpu_stride,
+                                        cpp, image_h, box,
+                                        vc5_get_uif_no_xor_pixel_offset,
+                                        is_load);
+                break;
+        case VC5_TILING_UBLINEAR_2_COLUMN:
+                vc5_move_pixels_general(gpu, gpu_stride, cpu, cpu_stride,
+                                        cpp, image_h, box,
+                                        vc5_get_ublinear_2_column_pixel_offset,
+                                        is_load);
+                break;
+        case VC5_TILING_UBLINEAR_1_COLUMN:
+                vc5_move_pixels_general(gpu, gpu_stride, cpu, cpu_stride,
+                                        cpp, image_h, box,
+                                        vc5_get_ublinear_1_column_pixel_offset,
+                                        is_load);
+                break;
+        case VC5_TILING_LINEARTILE:
+                vc5_move_pixels_general(gpu, gpu_stride, cpu, cpu_stride,
+                                        cpp, image_h, box,
+                                        vc5_get_lt_pixel_offset,
+                                        is_load);
+                break;
+        default:
+                unreachable("Unsupported tiling format");
+        }
+}
+
+/**
+ * Loads pixel data from the (microtile-aligned) \p box of the tiled image
+ * \p src into the start of the linear buffer \p dst, according to the given
+ * tiling format.
+ *
+ * Here the tiled image is the source, so \p src / \p src_stride are passed
+ * as the gpu side of vc5_move_tiled_image() and \p dst / \p dst_stride as
+ * the cpu side, with is_load set.
+ */
+void
+vc5_load_tiled_image(void *dst, uint32_t dst_stride,
+                     void *src, uint32_t src_stride,
+                     enum vc5_tiling_mode tiling_format, int cpp,
+                     uint32_t image_h,
+                     const struct pipe_box *box)
+{
+        vc5_move_tiled_image(src, src_stride, dst, dst_stride,
+                             tiling_format, cpp, image_h, box, true);
+}
+
+/**
+ * Stores pixel data from the start of the linear buffer \p src into the
+ * (microtile-aligned) \p box of the tiled image \p dst, according to the
+ * given tiling format.
+ *
+ * Here the tiled image is the destination, so \p dst / \p dst_stride are
+ * passed as the gpu side of vc5_move_tiled_image() and \p src /
+ * \p src_stride as the cpu side, with is_load cleared.
+ */
+void
+vc5_store_tiled_image(void *dst, uint32_t dst_stride,
+                      void *src, uint32_t src_stride,
+                      enum vc5_tiling_mode tiling_format, int cpp,
+                      uint32_t image_h,
+                      const struct pipe_box *box)
+{
+        vc5_move_tiled_image(dst, dst_stride, src, src_stride,
+                             tiling_format, cpp, image_h, box, false);
+}
diff --git a/src/gallium/drivers/v3d/v3d_tiling.h b/src/gallium/drivers/v3d/v3d_tiling.h
new file mode 100644
index 00000000000..d3cf48c4527
--- /dev/null
+++ b/src/gallium/drivers/v3d/v3d_tiling.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright © 2014 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef VC5_TILING_H
+#define VC5_TILING_H
+/* NOTE(review): vc5_size_is_lt/vc5_*_utile have no body in v3d_tiling.c. */
+uint32_t vc5_utile_width(int cpp) ATTRIBUTE_CONST;
+uint32_t vc5_utile_height(int cpp) ATTRIBUTE_CONST;
+bool vc5_size_is_lt(uint32_t width, uint32_t height, int cpp) ATTRIBUTE_CONST;
+void vc5_load_utile(void *dst, void *src, uint32_t dst_stride, uint32_t cpp);
+void vc5_store_utile(void *dst, void *src, uint32_t src_stride, uint32_t cpp);
+void vc5_load_tiled_image(void *dst, uint32_t dst_stride,
+                          void *src, uint32_t src_stride,
+                          enum vc5_tiling_mode tiling_format, int cpp,
+                          uint32_t image_h,
+                          const struct pipe_box *box);
+void vc5_store_tiled_image(void *dst, uint32_t dst_stride,
+                           void *src, uint32_t src_stride,
+                           enum vc5_tiling_mode tiling_format, int cpp,
+                           uint32_t image_h,
+                           const struct pipe_box *box);
+
+#endif /* VC5_TILING_H */
diff --git a/src/gallium/drivers/v3d/v3d_uniforms.c b/src/gallium/drivers/v3d/v3d_uniforms.c
new file mode 100644
index 00000000000..c7a39b50a74
--- /dev/null
+++ b/src/gallium/drivers/v3d/v3d_uniforms.c
@@ -0,0 +1,489 @@
+/*
+ * Copyright © 2014-2017 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "util/u_pack_color.h"
+#include "util/format_srgb.h"
+
+#include "v3d_context.h"
+#include "compiler/v3d_compiler.h"
+#include "broadcom/cle/v3d_packet_v33_pack.h"
+
+#if 0 /* compiled out; vc5_write_uniforms() leaves border color as "XXX" */
+
+#define SWIZ(x,y,z,w) {          \
+        PIPE_SWIZZLE_##x, \
+        PIPE_SWIZZLE_##y, \
+        PIPE_SWIZZLE_##z, \
+        PIPE_SWIZZLE_##w  \
+}
+
+static void
+write_texture_border_color(struct vc5_job *job,
+                           struct vc5_cl_out **uniforms,
+                           struct vc5_texture_stateobj *texstate,
+                           uint32_t unit)
+{
+        struct pipe_sampler_state *sampler = texstate->samplers[unit];
+        struct pipe_sampler_view *texture = texstate->textures[unit];
+        struct vc5_resource *rsc = vc5_resource(texture->texture);
+        union util_color uc;
+
+        const struct util_format_description *tex_format_desc =
+                util_format_description(texture->format);
+
+        float border_color[4];
+        for (int i = 0; i < 4; i++)
+                border_color[i] = sampler->border_color.f[i];
+        if (util_format_is_srgb(texture->format)) {
+                for (int i = 0; i < 3; i++)
+                        border_color[i] =
+                                util_format_linear_to_srgb_float(border_color[i]);
+        }
+
+        /* Turn the border color into the layout of channels that it would
+         * have when stored as texture contents.
+         */
+        float storage_color[4];
+        util_format_unswizzle_4f(storage_color,
+                                 border_color,
+                                 tex_format_desc->swizzle);
+
+        /* Now, pack so that when the vc5_format-sampled texture contents are
+         * replaced with our border color, the vc5_get_format_swizzle()
+         * swizzling will get the right channels.
+         */
+        if (util_format_is_depth_or_stencil(texture->format)) {
+                uc.ui[0] = util_pack_z(PIPE_FORMAT_Z24X8_UNORM,
+                                       sampler->border_color.f[0]) << 8;
+        } else {
+                switch (rsc->vc5_format) {
+                default:
+                case VC5_TEXTURE_TYPE_RGBA8888:
+                        util_pack_color(storage_color,
+                                        PIPE_FORMAT_R8G8B8A8_UNORM, &uc);
+                        break;
+                case VC5_TEXTURE_TYPE_RGBA4444:
+                        util_pack_color(storage_color,
+                                        PIPE_FORMAT_A8B8G8R8_UNORM, &uc);
+                        break;
+                case VC5_TEXTURE_TYPE_RGB565:
+                        util_pack_color(storage_color,
+                                        PIPE_FORMAT_B8G8R8A8_UNORM, &uc);
+                        break;
+                case VC5_TEXTURE_TYPE_ALPHA:
+                        uc.ui[0] = float_to_ubyte(storage_color[0]) << 24;
+                        break;
+                case VC5_TEXTURE_TYPE_LUMALPHA:
+                        uc.ui[0] = ((float_to_ubyte(storage_color[1]) << 24) |
+                                    (float_to_ubyte(storage_color[0]) << 0));
+                        break;
+                }
+        }
+
+        cl_aligned_u32(uniforms, uc.ui[0]);
+}
+#endif /* 0 */
+
+/**
+ * Returns fui(1.0f / size) for texture unit \p data, where size is the
+ * texture's width0 for QUNIFORM_TEXRECT_SCALE_X and its height0 otherwise.
+ */
+static uint32_t
+get_texrect_scale(struct vc5_texture_stateobj *texstate,
+                  enum quniform_contents contents,
+                  uint32_t data)
+{
+        struct pipe_sampler_view *view = texstate->textures[data];
+        uint32_t dim = (contents == QUNIFORM_TEXRECT_SCALE_X) ?
+                       view->texture->width0 : view->texture->height0;
+
+        return fui(1.0f / dim);
+}
+
+/**
+ * Returns the texture-size query value selected by \p contents for texture
+ * unit \p data: a base dimension minified to the view's first level, the
+ * resource's array size, or the number of levels the view spans.  Any other
+ * \p contents value hits unreachable().
+ */
+static uint32_t
+get_texture_size(struct vc5_texture_stateobj *texstate,
+                 enum quniform_contents contents,
+                 uint32_t data)
+{
+        struct pipe_sampler_view *sv = texstate->textures[data];
+
+        if (contents == QUNIFORM_TEXTURE_WIDTH)
+                return u_minify(sv->texture->width0, sv->u.tex.first_level);
+        if (contents == QUNIFORM_TEXTURE_HEIGHT)
+                return u_minify(sv->texture->height0, sv->u.tex.first_level);
+        if (contents == QUNIFORM_TEXTURE_DEPTH)
+                return u_minify(sv->texture->depth0, sv->u.tex.first_level);
+        if (contents == QUNIFORM_TEXTURE_ARRAY_SIZE)
+                return sv->texture->array_size;
+        if (contents == QUNIFORM_TEXTURE_LEVELS)
+                return (sv->u.tex.last_level - sv->u.tex.first_level) + 1;
+
+        unreachable("Bad texture size field");
+}
+
+static struct vc5_bo * /* new BO with the shader's UBO ranges, or NULL */
+vc5_upload_ubo(struct vc5_context *vc5,
+               struct vc5_compiled_shader *shader,
+               const uint32_t *gallium_uniforms)
+{
+        if (!shader->prog_data.base->ubo_size) /* no UBO data: no BO */
+                return NULL;
+
+        struct vc5_bo *ubo = vc5_bo_alloc(vc5->screen,
+                                          shader->prog_data.base->ubo_size,
+                                          "ubo");
+        void *data = vc5_bo_map(ubo); /* NOTE(review): result not checked */
+        for (uint32_t i = 0; i < shader->prog_data.base->num_ubo_ranges; i++) {
+                memcpy(data + shader->prog_data.base->ubo_ranges[i].dst_offset,
+                       ((const void *)gallium_uniforms +
+                        shader->prog_data.base->ubo_ranges[i].src_offset),
+                       shader->prog_data.base->ubo_ranges[i].size);
+        }
+
+        return ubo;
+}
+
+/**
+ * Emits the V3D 3.x P0 (CFG_MODE=1) texture parameter as one uniform word.
+ *
+ * \p shader_data carries the bits that depend on the kind of sample the
+ * shader performs; the sampler state of \p unit supplies the rest, and the
+ * two are ORed together.  \p job is unused.
+ */
+static void
+write_texture_p0(struct vc5_job *job,
+                 struct vc5_cl_out **uniforms,
+                 struct vc5_texture_stateobj *texstate,
+                 uint32_t unit,
+                 uint32_t shader_data)
+{
+        struct vc5_sampler_state *sampler =
+                vc5_sampler_state(texstate->samplers[unit]);
+
+        cl_aligned_u32(uniforms, sampler->p0 | shader_data);
+}
+
+/**
+ * Writes the V3D 3.x P1 (CFG_MODE=1) texture parameter.
+ *
+ * \p data holds the texture unit in bits 5 and up and the compiler's packed
+ * P1 bits in the low 5 bits.
+ */
+static void
+write_texture_p1(struct vc5_job *job,
+                 struct vc5_cl_out **uniforms,
+                 struct vc5_texture_stateobj *texstate,
+                 uint32_t data)
+{
+        const uint32_t compiler_p1 = data & 0x1f;
+        const uint32_t unit = data >> 5;
+        struct vc5_sampler_view *sview =
+                vc5_sampler_view(texstate->textures[unit]);
+        struct V3D33_TEXTURE_UNIFORM_PARAMETER_1_CFG_MODE1 unpacked = {
+                .texture_state_record_base_address = texstate->texture_state[unit],
+        };
+        uint32_t packed;
+
+        V3D33_TEXTURE_UNIFORM_PARAMETER_1_CFG_MODE1_pack(&job->indirect,
+                                                         (uint8_t *)&packed,
+                                                         &unpacked);
+
+        cl_aligned_u32(uniforms, compiler_p1 | packed | sview->p1);
+}
+
+/**
+ * Writes the V3D 4.x TMU configuration parameter 0: a reloc against the
+ * sampler view's BO, also adding the texture's BO to \p job.  \p data holds
+ * the texture unit in its top 8 bits and the compiler's packed P0 below.
+ */
+static void
+write_tmu_p0(struct vc5_job *job,
+             struct vc5_cl_out **uniforms,
+             struct vc5_texture_stateobj *texstate,
+             uint32_t data)
+{
+        const uint32_t unit = data >> 24;
+        const uint32_t compiler_p0 = data & 0x00ffffff;
+        struct pipe_sampler_view *psview = texstate->textures[unit];
+        struct vc5_sampler_view *sview = vc5_sampler_view(psview);
+        struct vc5_resource *rsc = vc5_resource(psview->texture);
+
+        cl_aligned_reloc(&job->indirect, uniforms, sview->bo, compiler_p0);
+        vc5_job_add_bo(job, rsc->bo);
+}
+
+/**
+ * Writes the V3D 4.x TMU configuration parameter 1: a reloc against the
+ * sampler state's BO.  \p data holds the texture unit in its top 8 bits and
+ * the compiler's packed P1 in the low 24.
+ */
+static void
+write_tmu_p1(struct vc5_job *job,
+             struct vc5_cl_out **uniforms,
+             struct vc5_texture_stateobj *texstate,
+             uint32_t data)
+{
+        const uint32_t unit = data >> 24;
+        const uint32_t compiler_p1 = data & 0x00ffffff;
+        struct pipe_sampler_state *psampler = texstate->samplers[unit];
+        struct vc5_sampler_state *sampler = vc5_sampler_state(psampler);
+
+        cl_aligned_reloc(&job->indirect, uniforms, sampler->bo, compiler_p1);
+}
+
+struct vc5_cl_reloc
+vc5_write_uniforms(struct vc5_context *vc5, struct vc5_compiled_shader *shader,
+                   struct vc5_constbuf_stateobj *cb,
+                   struct vc5_texture_stateobj *texstate)
+{
+        struct v3d_uniform_list *uinfo = &shader->prog_data.base->uniforms;
+        struct vc5_job *job = vc5->job;
+        const uint32_t *gallium_uniforms = cb->cb[0].user_buffer;
+        struct vc5_bo *ubo = vc5_upload_ubo(vc5, shader, gallium_uniforms);
+
+        /* We always need to return some space for uniforms, because the HW
+         * will be prefetching, even if we don't read any in the program.
+         */
+        vc5_cl_ensure_space(&job->indirect, MAX2(uinfo->count, 1) * 4, 4);
+
+        struct vc5_cl_reloc uniform_stream = cl_get_address(&job->indirect);
+        vc5_bo_reference(uniform_stream.bo);
+
+        struct vc5_cl_out *uniforms =
+                cl_start(&job->indirect);
+
+        for (int i = 0; i < uinfo->count; i++) {
+
+                switch (uinfo->contents[i]) {
+                case QUNIFORM_CONSTANT:
+                        cl_aligned_u32(&uniforms, uinfo->data[i]);
+                        break;
+                case QUNIFORM_UNIFORM:
+                        cl_aligned_u32(&uniforms,
+                                       gallium_uniforms[uinfo->data[i]]);
+                        break;
+                case QUNIFORM_VIEWPORT_X_SCALE:
+                        cl_aligned_f(&uniforms, vc5->viewport.scale[0] * 256.0f);
+                        break;
+                case QUNIFORM_VIEWPORT_Y_SCALE:
+                        cl_aligned_f(&uniforms, vc5->viewport.scale[1] * 256.0f);
+                        break;
+
+                case QUNIFORM_VIEWPORT_Z_OFFSET:
+                        cl_aligned_f(&uniforms, vc5->viewport.translate[2]);
+                        break;
+                case QUNIFORM_VIEWPORT_Z_SCALE:
+                        cl_aligned_f(&uniforms, vc5->viewport.scale[2]);
+                        break;
+
+                case QUNIFORM_USER_CLIP_PLANE:
+                        cl_aligned_f(&uniforms,
+                                     vc5->clip.ucp[uinfo->data[i] / 4][uinfo->data[i] % 4]);
+                        break;
+
+                case QUNIFORM_TMU_CONFIG_P0:
+                        write_tmu_p0(job, &uniforms, texstate,
+                                         uinfo->data[i]);
+                        break;
+
+                case QUNIFORM_TMU_CONFIG_P1:
+                        write_tmu_p1(job, &uniforms, texstate,
+                                         uinfo->data[i]);
+                        break;
+
+                case QUNIFORM_TEXTURE_CONFIG_P1:
+                        write_texture_p1(job, &uniforms, texstate,
+                                         uinfo->data[i]);
+                        break;
+
+#if 0
+                case QUNIFORM_TEXTURE_FIRST_LEVEL:
+                        write_texture_first_level(job, &uniforms, texstate,
+                                                  uinfo->data[i]);
+                        break;
+#endif
+
+                case QUNIFORM_TEXRECT_SCALE_X:
+                case QUNIFORM_TEXRECT_SCALE_Y:
+                        cl_aligned_u32(&uniforms,
+                                       get_texrect_scale(texstate,
+                                                         uinfo->contents[i],
+                                                         uinfo->data[i]));
+                        break;
+
+                case QUNIFORM_TEXTURE_WIDTH:
+                case QUNIFORM_TEXTURE_HEIGHT:
+                case QUNIFORM_TEXTURE_DEPTH:
+                case QUNIFORM_TEXTURE_ARRAY_SIZE:
+                case QUNIFORM_TEXTURE_LEVELS:
+                        cl_aligned_u32(&uniforms,
+                                       get_texture_size(texstate,
+                                                        uinfo->contents[i],
+                                                        uinfo->data[i]));
+                        break;
+
+                case QUNIFORM_STENCIL:
+                        cl_aligned_u32(&uniforms,
+                                       vc5->zsa->stencil_uniforms[uinfo->data[i]] |
+                                       (uinfo->data[i] <= 1 ?
+                                        (vc5->stencil_ref.ref_value[uinfo->data[i]] << 8) :
+                                        0));
+                        break;
+
+                case QUNIFORM_ALPHA_REF:
+                        cl_aligned_f(&uniforms,
+                                     vc5->zsa->base.alpha.ref_value);
+                        break;
+
+                case QUNIFORM_SAMPLE_MASK:
+                        cl_aligned_u32(&uniforms, vc5->sample_mask);
+                        break;
+
+                case QUNIFORM_UBO_ADDR:
+                        if (uinfo->data[i] == 0) {
+                                cl_aligned_reloc(&job->indirect, &uniforms,
+                                                 ubo, 0);
+                        } else {
+                                int ubo_index = uinfo->data[i];
+                                struct vc5_resource *rsc =
+                                        vc5_resource(cb->cb[ubo_index].buffer);
+
+                                cl_aligned_reloc(&job->indirect, &uniforms,
+                                                 rsc->bo,
+                                                 cb->cb[ubo_index].buffer_offset);
+                        }
+                        break;
+
+                case QUNIFORM_TEXTURE_FIRST_LEVEL:
+                        cl_aligned_f(&uniforms,
+                                     texstate->textures[uinfo->data[i]]->u.tex.first_level);
+                        break;
+
+                case QUNIFORM_TEXTURE_BORDER_COLOR:
+                        /* XXX */
+                        break;
+
+                case QUNIFORM_SPILL_OFFSET:
+                        cl_aligned_reloc(&job->indirect, &uniforms,
+                                         vc5->prog.spill_bo, 0);
+                        break;
+
+                case QUNIFORM_SPILL_SIZE_PER_THREAD:
+                        cl_aligned_u32(&uniforms,
+                                       vc5->prog.spill_size_per_thread);
+                        break;
+
+                default:
+                        assert(quniform_contents_is_texture_p0(uinfo->contents[i]));
+
+                        write_texture_p0(job, &uniforms, texstate,
+                                         uinfo->contents[i] -
+                                         QUNIFORM_TEXTURE_CONFIG_P0_0,
+                                         uinfo->data[i]);
+                        break;
+
+                }
+#if 0
+                uint32_t written_val = *((uint32_t *)uniforms - 1);
+                fprintf(stderr, "shader %p[%d]: 0x%08x / 0x%08x (%f)\n",
+                        shader, i, __gen_address_offset(&uniform_stream) + i * 4,
+                        written_val, uif(written_val));
+#endif
+        }
+
+        cl_end(&job->indirect, uniforms);
+
+        vc5_bo_unreference(&ubo);
+
+        return uniform_stream;
+}
+
+/**
+ * Walks the shader's uniform list and works out which VC5_DIRTY_* bits the
+ * uniform stream depends on, storing the result in
+ * shader->uniform_dirty_bits.
+ */
+void
+vc5_set_shader_uniform_dirty_flags(struct vc5_compiled_shader *shader)
+{
+        /* We could flag this on just the stage we're compiling for, but
+         * it's not passed in.
+         */
+        const uint32_t all_tex_dirty = VC5_DIRTY_FRAGTEX | VC5_DIRTY_VERTTEX;
+        uint32_t flags = 0;
+
+        for (int u = 0; u < shader->prog_data.base->uniforms.count; u++) {
+                switch (shader->prog_data.base->uniforms.contents[u]) {
+                case QUNIFORM_TMU_CONFIG_P0:
+                case QUNIFORM_TMU_CONFIG_P1:
+                case QUNIFORM_TEXTURE_CONFIG_P1:
+                case QUNIFORM_TEXTURE_BORDER_COLOR:
+                case QUNIFORM_TEXTURE_FIRST_LEVEL:
+                case QUNIFORM_TEXRECT_SCALE_X:
+                case QUNIFORM_TEXRECT_SCALE_Y:
+                case QUNIFORM_TEXTURE_WIDTH:
+                case QUNIFORM_TEXTURE_HEIGHT:
+                case QUNIFORM_TEXTURE_DEPTH:
+                case QUNIFORM_TEXTURE_ARRAY_SIZE:
+                case QUNIFORM_TEXTURE_LEVELS:
+                case QUNIFORM_SPILL_OFFSET:
+                case QUNIFORM_SPILL_SIZE_PER_THREAD:
+                        flags |= all_tex_dirty;
+                        break;
+
+                case QUNIFORM_UNIFORM:
+                case QUNIFORM_UBO_ADDR:
+                        flags |= VC5_DIRTY_CONSTBUF;
+                        break;
+
+                case QUNIFORM_VIEWPORT_X_SCALE:
+                case QUNIFORM_VIEWPORT_Y_SCALE:
+                case QUNIFORM_VIEWPORT_Z_OFFSET:
+                case QUNIFORM_VIEWPORT_Z_SCALE:
+                        flags |= VC5_DIRTY_VIEWPORT;
+                        break;
+
+                case QUNIFORM_USER_CLIP_PLANE:
+                        flags |= VC5_DIRTY_CLIP;
+                        break;
+
+                case QUNIFORM_STENCIL:
+                case QUNIFORM_ALPHA_REF:
+                        flags |= VC5_DIRTY_ZSA;
+                        break;
+
+                case QUNIFORM_SAMPLE_MASK:
+                        flags |= VC5_DIRTY_SAMPLE_MASK;
+                        break;
+
+                case QUNIFORM_CONSTANT:
+                        /* No state bit is flagged for constants. */
+                        break;
+
+                default:
+                        /* Anything not listed above must be one of the
+                         * texture P0 config uniforms.
+                         */
+                        assert(quniform_contents_is_texture_p0(shader->prog_data.base->uniforms.contents[u]));
+                        flags |= all_tex_dirty;
+                        break;
+                }
+        }
+
+        shader->uniform_dirty_bits = flags;
+}
diff --git a/src/gallium/drivers/v3d/v3dx_context.h b/src/gallium/drivers/v3d/v3dx_context.h
new file mode 100644
index 00000000000..faeda2c0fbb
--- /dev/null
+++ b/src/gallium/drivers/v3d/v3dx_context.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright © 2014-2017 Broadcom
+ * Copyright (C) 2012 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/* This file generates the per-v3d-version function prototypes.  It must only
+ * be included from v3d_context.h.
+ *
+ * Every function name below is wrapped in v3dX(), so the symbol actually
+ * declared depends on how that macro is defined when this file is included.
+ * NOTE(review): the draw code calls v3d33_emit_state() and
+ * v3d41_emit_state(), so v3dX() presumably prepends a "v3d<version>_"
+ * prefix -- confirm against the macro's definition.
+ */
+
+/* Forward declarations so the prototypes below can take pointers to these
+ * types without needing their full definitions.
+ */
+struct v3d_hw;
+struct vc5_format;
+
+/* Entry points operating on a gallium context or on a single job. */
+void v3dX(emit_state)(struct pipe_context *pctx);
+void v3dX(emit_rcl)(struct vc5_job *job);
+void v3dX(draw_init)(struct pipe_context *pctx);
+void v3dX(state_init)(struct pipe_context *pctx);
+
+void v3dX(bcl_epilogue)(struct vc5_context *vc5, struct vc5_job *job);
+
+/* Simulator entry points; each one operates on a struct v3d_hw. */
+void v3dX(simulator_init_regs)(struct v3d_hw *v3d);
+int v3dX(simulator_get_param_ioctl)(struct v3d_hw *v3d,
+                                    struct drm_v3d_get_param *args);
+void v3dX(simulator_flush)(struct v3d_hw *v3d, struct drm_v3d_submit_cl *submit,
+                           uint32_t gmp_ofs);
+
+/* Format lookups: a descriptor for a gallium pipe_format, and the type/bpp
+ * pair (returned through the out-pointers) for an output format value.
+ */
+const struct vc5_format *v3dX(get_format_desc)(enum pipe_format f);
+void v3dX(get_internal_type_bpp_for_output_format)(uint32_t format,
+                                                   uint32_t *type,
+                                                   uint32_t *bpp);
diff --git a/src/gallium/drivers/v3d/v3dx_draw.c b/src/gallium/drivers/v3d/v3dx_draw.c
new file mode 100644
index 00000000000..03ee6b2b196
--- /dev/null
+++ b/src/gallium/drivers/v3d/v3dx_draw.c
@@ -0,0 +1,714 @@
+/*
+ * Copyright © 2014-2017 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "util/u_blitter.h"
+#include "util/u_prim.h"
+#include "util/u_format.h"
+#include "util/u_pack_color.h"
+#include "util/u_prim_restart.h"
+#include "util/u_upload_mgr.h"
+#include "indices/u_primconvert.h"
+
+#include "v3d_context.h"
+#include "v3d_resource.h"
+#include "v3d_cl.h"
+#include "broadcom/compiler/v3d_compiler.h"
+#include "broadcom/common/v3d_macros.h"
+#include "broadcom/cle/v3dx_pack.h"
+
+/**
+ * Does the initial binning command list setup for drawing to a given FBO.
+ *
+ * Returns immediately if the job is already marked as needing a flush.  That
+ * flag is set at the end of this function, so the setup below is skipped on
+ * later calls for the same job for as long as the flag stays set.
+ */
+static void
+vc5_start_draw(struct vc5_context *vc5)
+{
+        struct vc5_job *job = vc5->job;
+
+        if (job->needs_flush)
+                return;
+
+        /* Get space to emit our BCL state, using a branch to jump to a new BO
+         * if necessary.
+         */
+        vc5_cl_ensure_space_with_branch(&job->bcl, 256 /* XXX */);
+
+        job->submit.bcl_start = job->bcl.bo->offset;
+        vc5_job_add_bo(job, job->bcl.bo);
+
+        /* Allocate the job's tile allocation BO (fixed size) and its tile
+         * state data array (TSDA), which is sized per tile.  The per-tile
+         * size is chosen from the device version reported by the screen at
+         * runtime, not from the compile-time V3D_VERSION.
+         */
+        job->tile_alloc = vc5_bo_alloc(vc5->screen, 1024 * 1024, "tile alloc");
+        uint32_t tsda_per_tile_size = vc5->screen->devinfo.ver >= 40 ? 256 : 64;
+        job->tile_state = vc5_bo_alloc(vc5->screen,
+                                       job->draw_tiles_y *
+                                       job->draw_tiles_x *
+                                       tsda_per_tile_size,
+                                       "TSDA");
+
+#if V3D_VERSION < 40
+        /* "Binning mode lists start with a Tile Binning Mode Configuration
+         * item (120)"
+         *
+         * Part1 signals the end of binning config setup, which is why the
+         * PART2 packet is emitted first here.
+         */
+        cl_emit(&job->bcl, TILE_BINNING_MODE_CONFIGURATION_PART2, config) {
+                config.tile_allocation_memory_address =
+                        cl_address(job->tile_alloc, 0);
+                config.tile_allocation_memory_size = job->tile_alloc->size;
+        }
+#endif
+
+        cl_emit(&job->bcl, TILE_BINNING_MODE_CONFIGURATION_PART1, config) {
+#if V3D_VERSION >= 40
+                /* On 4.x the packet takes the framebuffer size in pixels and
+                 * the render target count, each encoded as value minus one.
+                 */
+                config.width_in_pixels_minus_1 = vc5->framebuffer.width - 1;
+                config.height_in_pixels_minus_1 = vc5->framebuffer.height - 1;
+                config.number_of_render_targets_minus_1 =
+                        MAX2(vc5->framebuffer.nr_cbufs, 1) - 1;
+#else /* V3D_VERSION < 40 */
+                /* On 3.x the packet carries the TSDA address and the size in
+                 * tiles instead.
+                 */
+                config.tile_state_data_array_base_address =
+                        cl_address(job->tile_state, 0);
+
+                config.width_in_tiles = job->draw_tiles_x;
+                config.height_in_tiles = job->draw_tiles_y;
+                /* Must be >= 1 */
+                config.number_of_render_targets =
+                        MAX2(vc5->framebuffer.nr_cbufs, 1);
+#endif /* V3D_VERSION < 40 */
+
+                config.multisample_mode_4x = job->msaa;
+
+                config.maximum_bpp_of_all_render_targets = job->internal_bpp;
+        }
+
+        /* There's definitely nothing in the VCD cache we want. */
+        cl_emit(&job->bcl, FLUSH_VCD_CACHE, bin);
+
+        /* Disable any leftover OQ state from another job. */
+        cl_emit(&job->bcl, OCCLUSION_QUERY_COUNTER, counter);
+
+        /* "Binning mode lists must have a Start Tile Binning item (6) after
+         *  any prefix state data before the binning list proper starts."
+         */
+        cl_emit(&job->bcl, START_TILE_BINNING, bin);
+
+        /* Mark the job as started (this is what the early return above
+         * tests) and remember the framebuffer size it was set up for.
+         */
+        job->needs_flush = true;
+        job->draw_width = vc5->framebuffer.width;
+        job->draw_height = vc5->framebuffer.height;
+}
+
+/**
+ * For every sampler view bound in stage_tex, flushes any job that is
+ * currently writing to the view's underlying texture resource.  Unbound
+ * (NULL) slots are ignored.
+ */
+static void
+vc5_predraw_check_textures(struct pipe_context *pctx,
+                           struct vc5_texture_stateobj *stage_tex)
+{
+        struct vc5_context *vc5 = vc5_context(pctx);
+        int slot;
+
+        for (slot = 0; slot < stage_tex->num_textures; slot++) {
+                struct pipe_sampler_view *sview = stage_tex->textures[slot];
+
+                if (sview)
+                        vc5_flush_jobs_writing_resource(vc5, sview->texture);
+        }
+}
+
+/**
+ * Emits the GL shader state for the currently bound CS/VS/FS programs.
+ *
+ * Writes each stage's uniform stream, then packs a GL_SHADER_STATE_RECORD
+ * followed by one GL_SHADER_STATE_ATTRIBUTE_RECORD per vertex element (or a
+ * single dummy record when there are no elements) into job->indirect, and
+ * finally emits a GL_SHADER_STATE packet into job->bcl that points at the
+ * record.  The only field of info read here is info->mode.
+ */
+static void
+vc5_emit_gl_shader_state(struct vc5_context *vc5,
+                         const struct pipe_draw_info *info)
+{
+        struct vc5_job *job = vc5->job;
+        /* VC5_DIRTY_VTXSTATE */
+        struct vc5_vertex_stateobj *vtx = vc5->vtx;
+        /* VC5_DIRTY_VTXBUF */
+        struct vc5_vertexbuf_stateobj *vertexbuf = &vc5->vertexbuf;
+
+        /* Upload the uniforms to the indirect CL first */
+        struct vc5_cl_reloc fs_uniforms =
+                vc5_write_uniforms(vc5, vc5->prog.fs,
+                                   &vc5->constbuf[PIPE_SHADER_FRAGMENT],
+                                   &vc5->fragtex);
+        struct vc5_cl_reloc vs_uniforms =
+                vc5_write_uniforms(vc5, vc5->prog.vs,
+                                   &vc5->constbuf[PIPE_SHADER_VERTEX],
+                                   &vc5->verttex);
+        /* The coordinate shader uses the vertex stage's constant buffers and
+         * textures.
+         */
+        struct vc5_cl_reloc cs_uniforms =
+                vc5_write_uniforms(vc5, vc5->prog.cs,
+                                   &vc5->constbuf[PIPE_SHADER_VERTEX],
+                                   &vc5->verttex);
+
+        /* See GFXH-930 workaround below */
+        uint32_t num_elements_to_emit = MAX2(vtx->num_elements, 1);
+        /* Make room for the shader record plus all of its attribute records
+         * with a single call, and remember the returned offset: the
+         * GL_SHADER_STATE packet at the end points at it.
+         * NOTE(review): the trailing 32 is presumably the required
+         * alignment -- confirm against vc5_cl_ensure_space().
+         */
+        uint32_t shader_rec_offset =
+                vc5_cl_ensure_space(&job->indirect,
+                                    cl_packet_length(GL_SHADER_STATE_RECORD) +
+                                    num_elements_to_emit *
+                                    cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD),
+                                    32);
+
+        cl_emit(&job->indirect, GL_SHADER_STATE_RECORD, shader) {
+                shader.enable_clipping = true;
+                /* VC5_DIRTY_PRIM_MODE | VC5_DIRTY_RASTERIZER */
+                shader.point_size_in_shaded_vertex_data =
+                        (info->mode == PIPE_PRIM_POINTS &&
+                         vc5->rasterizer->base.point_size_per_vertex);
+
+                /* Must be set if the shader modifies Z, discards, or modifies
+                 * the sample mask.  For any of these cases, the fragment
+                 * shader needs to write the Z value (even just discards).
+                 */
+                shader.fragment_shader_does_z_writes =
+                        (vc5->prog.fs->prog_data.fs->writes_z ||
+                         vc5->prog.fs->prog_data.fs->discard);
+
+                shader.fragment_shader_uses_real_pixel_centre_w_in_addition_to_centroid_w2 =
+                        vc5->prog.fs->prog_data.fs->uses_centroid_and_center_w;
+
+                shader.number_of_varyings_in_fragment_shader =
+                        vc5->prog.fs->prog_data.base->num_inputs;
+
+                shader.propagate_nans = true;
+
+                /* Code addresses: each stage's code starts at offset 0 of
+                 * its own BO.
+                 */
+                shader.coordinate_shader_code_address =
+                        cl_address(vc5->prog.cs->bo, 0);
+                shader.vertex_shader_code_address =
+                        cl_address(vc5->prog.vs->bo, 0);
+                shader.fragment_shader_code_address =
+                        cl_address(vc5->prog.fs->bo, 0);
+
+                /* XXX: Use combined input/output size flag in the common
+                 * case.
+                 */
+                shader.coordinate_shader_has_separate_input_and_output_vpm_blocks = true;
+                shader.vertex_shader_has_separate_input_and_output_vpm_blocks = true;
+                /* The input segment sizes are clamped to at least 1. */
+                shader.coordinate_shader_input_vpm_segment_size =
+                        MAX2(vc5->prog.cs->prog_data.vs->vpm_input_size, 1);
+                shader.vertex_shader_input_vpm_segment_size =
+                        MAX2(vc5->prog.vs->prog_data.vs->vpm_input_size, 1);
+
+                shader.coordinate_shader_output_vpm_segment_size =
+                        vc5->prog.cs->prog_data.vs->vpm_output_size;
+                shader.vertex_shader_output_vpm_segment_size =
+                        vc5->prog.vs->prog_data.vs->vpm_output_size;
+
+                shader.coordinate_shader_uniforms_address = cs_uniforms;
+                shader.vertex_shader_uniforms_address = vs_uniforms;
+                shader.fragment_shader_uniforms_address = fs_uniforms;
+
+                /* Threading setup: 4.1+ has a 4-way flag plus a "start in
+                 * final thread section" flag per stage, while earlier
+                 * versions have separate 4-way and 2-way flags.
+                 */
+#if V3D_VERSION >= 41
+                shader.coordinate_shader_4_way_threadable =
+                        vc5->prog.cs->prog_data.vs->base.threads == 4;
+                shader.vertex_shader_4_way_threadable =
+                        vc5->prog.vs->prog_data.vs->base.threads == 4;
+                shader.fragment_shader_4_way_threadable =
+                        vc5->prog.fs->prog_data.fs->base.threads == 4;
+
+                shader.coordinate_shader_start_in_final_thread_section =
+                        vc5->prog.cs->prog_data.vs->base.single_seg;
+                shader.vertex_shader_start_in_final_thread_section =
+                        vc5->prog.vs->prog_data.vs->base.single_seg;
+                shader.fragment_shader_start_in_final_thread_section =
+                        vc5->prog.fs->prog_data.fs->base.single_seg;
+#else
+                shader.coordinate_shader_4_way_threadable =
+                        vc5->prog.cs->prog_data.vs->base.threads == 4;
+                shader.coordinate_shader_2_way_threadable =
+                        vc5->prog.cs->prog_data.vs->base.threads == 2;
+                shader.vertex_shader_4_way_threadable =
+                        vc5->prog.vs->prog_data.vs->base.threads == 4;
+                shader.vertex_shader_2_way_threadable =
+                        vc5->prog.vs->prog_data.vs->base.threads == 2;
+                shader.fragment_shader_4_way_threadable =
+                        vc5->prog.fs->prog_data.fs->base.threads == 4;
+                shader.fragment_shader_2_way_threadable =
+                        vc5->prog.fs->prog_data.fs->base.threads == 2;
+#endif
+
+                shader.vertex_id_read_by_coordinate_shader =
+                        vc5->prog.cs->prog_data.vs->uses_vid;
+                shader.instance_id_read_by_coordinate_shader =
+                        vc5->prog.cs->prog_data.vs->uses_iid;
+                shader.vertex_id_read_by_vertex_shader =
+                        vc5->prog.vs->prog_data.vs->uses_vid;
+                shader.instance_id_read_by_vertex_shader =
+                        vc5->prog.vs->prog_data.vs->uses_iid;
+
+                shader.address_of_default_attribute_values =
+                        cl_address(vtx->default_attribute_values, 0);
+        }
+
+        /* One attribute record per vertex element.  vtx->attrs is indexed at
+         * i * size to find the element's prepacked record, and the per-draw
+         * fields (stride, address, per-stage read counts) are filled in
+         * here.  NOTE(review): cl_emit_with_prepacked() presumably combines
+         * the two -- confirm against its definition.
+         */
+        for (int i = 0; i < vtx->num_elements; i++) {
+                struct pipe_vertex_element *elem = &vtx->pipe[i];
+                struct pipe_vertex_buffer *vb =
+                        &vertexbuf->vb[elem->vertex_buffer_index];
+                struct vc5_resource *rsc = vc5_resource(vb->buffer.resource);
+
+                const uint32_t size =
+                        cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD);
+                cl_emit_with_prepacked(&job->indirect,
+                                       GL_SHADER_STATE_ATTRIBUTE_RECORD,
+                                       &vtx->attrs[i * size], attr) {
+                        attr.stride = vb->stride;
+                        attr.address = cl_address(rsc->bo,
+                                                  vb->buffer_offset +
+                                                  elem->src_offset);
+                        attr.number_of_values_read_by_coordinate_shader =
+                                vc5->prog.cs->prog_data.vs->vattr_sizes[i];
+                        attr.number_of_values_read_by_vertex_shader =
+                                vc5->prog.vs->prog_data.vs->vattr_sizes[i];
+#if V3D_VERSION >= 41
+                        attr.maximum_index = 0xffffff;
+#endif
+                }
+        }
+
+        if (vtx->num_elements == 0) {
+                /* GFXH-930: At least one attribute must be enabled and read
+                 * by CS and VS.  If we have no attributes being consumed by
+                 * the shader, set up a dummy to be loaded into the VPM.
+                 */
+                cl_emit(&job->indirect, GL_SHADER_STATE_ATTRIBUTE_RECORD, attr) {
+                        /* Valid address of data whose value will be unused. */
+                        attr.address = cl_address(job->indirect.bo, 0);
+
+                        attr.type = ATTRIBUTE_FLOAT;
+                        attr.stride = 0;
+                        attr.vec_size = 1;
+
+                        attr.number_of_values_read_by_coordinate_shader = 1;
+                        attr.number_of_values_read_by_vertex_shader = 1;
+                }
+        }
+
+        /* Point the binner at the record written above. */
+        cl_emit(&job->bcl, GL_SHADER_STATE, state) {
+                state.address = cl_address(job->indirect.bo, shader_rec_offset);
+                state.number_of_attribute_arrays = num_elements_to_emit;
+        }
+
+        /* Drop the BO references carried by the uniform relocs.
+         * NOTE(review): this assumes vc5_write_uniforms() hands back a
+         * referenced BO in each reloc -- confirm against that function.
+         */
+        vc5_bo_unreference(&cs_uniforms.bo);
+        vc5_bo_unreference(&vs_uniforms.bo);
+        vc5_bo_unreference(&fs_uniforms.bo);
+
+        job->shader_rec_count++;
+}
+
+/**
+ * Updates the software primitive counters for a draw, since the CL packets
+ * can't record the transform feedback statistics for us.
+ *
+ * Nothing is counted unless the context has active queries.  The
+ * primitives-generated counter is always bumped; the transform feedback
+ * counter is bumped as well when prim_tf is set.
+ */
+static void
+vc5_tf_statistics_record(struct vc5_context *vc5,
+                         const struct pipe_draw_info *info,
+                         bool prim_tf)
+{
+        if (vc5->active_queries) {
+                const uint32_t num_prims =
+                        u_prims_for_vertices(info->mode, info->count);
+
+                vc5->prims_generated += num_prims;
+
+                /* XXX: Only count if we didn't overflow. */
+                if (prim_tf)
+                        vc5->tf_prims_generated += num_prims;
+        }
+}
+
+/**
+ * Folds the bound Z/S state's early-Z (EZ) preference and the fragment
+ * shader's Z-writing behaviour into the job's EZ state, and latches the
+ * job's first decided EZ state.
+ */
+static void
+vc5_update_job_ez(struct vc5_context *vc5, struct vc5_job *job)
+{
+        if (vc5->prog.fs->prog_data.fs->writes_z ||
+            vc5->zsa->ez_state == VC5_EZ_DISABLED) {
+                /* EZ is off for the rest of the job if either the current
+                 * Z/S state disables it (bad Z func or stencil operation),
+                 * or the FS affects the Z of the pixels and so may update
+                 * against the chosen EZ direction (though we could use
+                 * ARB_conservative_depth's hints to avoid the latter).
+                 */
+                job->ez_state = VC5_EZ_DISABLED;
+        } else if (vc5->zsa->ez_state == VC5_EZ_LT_LE ||
+                   vc5->zsa->ez_state == VC5_EZ_GT_GE) {
+                /* The Z/S state picked a direction: adopt it if the job
+                 * hasn't decided yet, and give up on EZ if the job already
+                 * went the other way.
+                 */
+                if (job->ez_state == VC5_EZ_UNDECIDED) {
+                        job->ez_state = vc5->zsa->ez_state;
+                } else if (job->ez_state != vc5->zsa->ez_state) {
+                        job->ez_state = VC5_EZ_DISABLED;
+                }
+        }
+        /* Otherwise (VC5_EZ_UNDECIDED) the Z/S state neither picked a
+         * direction nor disabled EZ, so the job keeps its current state.
+         * This allows EZ optimization for Z func == EQUAL or NEVER.
+         */
+
+        if (job->first_ez_state == VC5_EZ_UNDECIDED)
+                job->first_ez_state = job->ez_state;
+}
+
+/**
+ * Queues one draw call into the binning command list of the current FBO's
+ * job.
+ *
+ * Draws this path does not emit directly are handed off first: indexed
+ * draws whose primitive restart index is not the all-ones value for the
+ * index size go through util_draw_vbo_without_prim_restart(), and primitive
+ * modes from PIPE_PRIM_QUADS upward go through the primconvert helper.
+ * Otherwise this flushes jobs writing to the sampled textures, emits dirty
+ * state and (when needed) the GL shader state, emits the primitive packet,
+ * and records which depth/stencil/color buffers the job has to resolve.
+ */
+static void
+vc5_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
+{
+        struct vc5_context *vc5 = vc5_context(pctx);
+
+        /* For plain direct draws, u_trim_pipe_prim() may adjust info->count
+         * in place (hence the cast); if it returns false the draw is
+         * dropped.
+         */
+        if (!info->count_from_stream_output && !info->indirect &&
+            !info->primitive_restart &&
+            !u_trim_pipe_prim(info->mode, (unsigned*)&info->count))
+                return;
+
+        /* Fall back for weird desktop GL primitive restart values. */
+        if (info->primitive_restart &&
+            info->index_size) {
+                /* Only the all-ones index for the given index size is
+                 * emitted directly.
+                 */
+                uint32_t mask = ~0;
+
+                switch (info->index_size) {
+                case 2:
+                        mask = 0xffff;
+                        break;
+                case 1:
+                        mask = 0xff;
+                        break;
+                }
+
+                if (info->restart_index != mask) {
+                        util_draw_vbo_without_prim_restart(pctx, info);
+                        return;
+                }
+        }
+
+        if (info->mode >= PIPE_PRIM_QUADS) {
+                util_primconvert_save_rasterizer_state(vc5->primconvert, &vc5->rasterizer->base);
+                util_primconvert_draw_vbo(vc5->primconvert, info);
+                perf_debug("Fallback conversion for %d %s vertices\n",
+                           info->count, u_prim_name(info->mode));
+                return;
+        }
+
+        /* Before setting up the draw, flush anything writing to the textures
+         * that we read from.
+         */
+        vc5_predraw_check_textures(pctx, &vc5->verttex);
+        vc5_predraw_check_textures(pctx, &vc5->fragtex);
+
+        struct vc5_job *job = vc5_get_job_for_fbo(vc5);
+
+        /* Get space to emit our draw call into the BCL, using a branch to
+         * jump to a new BO if necessary.
+         */
+        vc5_cl_ensure_space_with_branch(&job->bcl, 256 /* XXX */);
+
+        if (vc5->prim_mode != info->mode) {
+                vc5->prim_mode = info->mode;
+                vc5->dirty |= VC5_DIRTY_PRIM_MODE;
+        }
+
+        vc5_start_draw(vc5);
+        vc5_update_compiled_shaders(vc5, info->mode);
+        vc5_update_job_ez(vc5, job);
+
+#if V3D_VERSION >= 41
+        v3d41_emit_state(pctx);
+#else
+        v3d33_emit_state(pctx);
+#endif
+
+        /* Re-emit the shader state record if any state it is built from, or
+         * any state the three programs' uniforms depend on, is dirty.
+         */
+        if (vc5->dirty & (VC5_DIRTY_VTXBUF |
+                          VC5_DIRTY_VTXSTATE |
+                          VC5_DIRTY_PRIM_MODE |
+                          VC5_DIRTY_RASTERIZER |
+                          VC5_DIRTY_COMPILED_CS |
+                          VC5_DIRTY_COMPILED_VS |
+                          VC5_DIRTY_COMPILED_FS |
+                          vc5->prog.cs->uniform_dirty_bits |
+                          vc5->prog.vs->uniform_dirty_bits |
+                          vc5->prog.fs->uniform_dirty_bits)) {
+                vc5_emit_gl_shader_state(vc5, info);
+        }
+
+        vc5->dirty = 0;
+
+        /* The Base Vertex/Base Instance packet sets those values to nonzero
+         * for the next draw call only.
+         */
+        if (info->index_bias || info->start_instance) {
+                cl_emit(&job->bcl, BASE_VERTEX_BASE_INSTANCE, base) {
+                        base.base_instance = info->start_instance;
+                        base.base_vertex = info->index_bias;
+                }
+        }
+
+        uint32_t prim_tf_enable = 0;
+#if V3D_VERSION < 40
+        /* V3D 3.x: The HW only processes transform feedback on primitives
+         * with the flag set.
+         */
+        if (vc5->streamout.num_targets)
+                prim_tf_enable = (V3D_PRIM_POINTS_TF - V3D_PRIM_POINTS);
+#endif
+
+        vc5_tf_statistics_record(vc5, info, vc5->streamout.num_targets);
+
+        /* Note that the primitive type fields match with OpenGL/gallium
+         * definitions, up to but not including QUADS.
+         */
+        if (info->index_size) {
+                uint32_t index_size = info->index_size;
+                uint32_t offset = info->start * index_size;
+                struct pipe_resource *prsc;
+                if (info->has_user_indices) {
+                        /* User-pointer indices are copied into an upload
+                         * buffer; the call fills in prsc and overwrites
+                         * offset.  NOTE(review): if the upload can fail and
+                         * leave prsc NULL, rsc below is used unchecked --
+                         * confirm against u_upload_data().
+                         */
+                        prsc = NULL;
+                        u_upload_data(vc5->uploader, 0,
+                                      info->count * index_size, 4,
+                                      info->index.user,
+                                      &offset, &prsc);
+                } else {
+                        prsc = info->index.resource;
+                }
+                struct vc5_resource *rsc = vc5_resource(prsc);
+
+#if V3D_VERSION >= 40
+                /* 4.x binds the whole index BO once; the primitive packets
+                 * below then only carry an offset into it.
+                 */
+                cl_emit(&job->bcl, INDEX_BUFFER_SETUP, ib) {
+                        ib.address = cl_address(rsc->bo, 0);
+                        ib.size = rsc->bo->size;
+                }
+#endif
+
+                if (info->instance_count > 1) {
+                        cl_emit(&job->bcl, INDEXED_INSTANCED_PRIMITIVE_LIST, prim) {
+                                /* Index sizes 1/2/4 map to types 0/1/2. */
+                                prim.index_type = ffs(index_size) - 1;
+#if V3D_VERSION >= 40
+                                prim.index_offset = offset;
+#else /* V3D_VERSION < 40 */
+                                prim.maximum_index = (1u << 31) - 1; /* XXX */
+                                prim.address_of_indices_list =
+                                        cl_address(rsc->bo, offset);
+#endif /* V3D_VERSION < 40 */
+                                prim.mode = info->mode | prim_tf_enable;
+                                prim.enable_primitive_restarts = info->primitive_restart;
+
+                                prim.number_of_instances = info->instance_count;
+                                prim.instance_length = info->count;
+                        }
+                } else {
+                        cl_emit(&job->bcl, INDEXED_PRIMITIVE_LIST, prim) {
+                                /* Index sizes 1/2/4 map to types 0/1/2. */
+                                prim.index_type = ffs(index_size) - 1;
+                                prim.length = info->count;
+#if V3D_VERSION >= 40
+                                prim.index_offset = offset;
+#else /* V3D_VERSION < 40 */
+                                prim.maximum_index = (1u << 31) - 1; /* XXX */
+                                prim.address_of_indices_list =
+                                        cl_address(rsc->bo, offset);
+#endif /* V3D_VERSION < 40 */
+                                prim.mode = info->mode | prim_tf_enable;
+                                prim.enable_primitive_restarts = info->primitive_restart;
+                        }
+                }
+
+                /* Drop the reference taken on the temporary upload buffer. */
+                if (info->has_user_indices)
+                        pipe_resource_reference(&prsc, NULL);
+        } else {
+                if (info->instance_count > 1) {
+                        cl_emit(&job->bcl, VERTEX_ARRAY_INSTANCED_PRIMITIVES, prim) {
+                                prim.mode = info->mode | prim_tf_enable;
+                                prim.index_of_first_vertex = info->start;
+                                prim.number_of_instances = info->instance_count;
+                                prim.instance_length = info->count;
+                        }
+                } else {
+                        cl_emit(&job->bcl, VERTEX_ARRAY_PRIMITIVES, prim) {
+                                prim.mode = info->mode | prim_tf_enable;
+                                prim.length = info->count;
+                                prim.index_of_first_vertex = info->start;
+                        }
+                }
+        }
+
+        /* Count the draw exactly once, whether it was indexed or not. */
+        job->draw_calls_queued++;
+
+        /* If depth or stencil is enabled and the job has a Z/S buffer, the
+         * job references its BO and must resolve the buffers that are
+         * enabled.
+         */
+        if (vc5->zsa && job->zsbuf &&
+            (vc5->zsa->base.depth.enabled ||
+             vc5->zsa->base.stencil[0].enabled)) {
+                struct vc5_resource *rsc = vc5_resource(job->zsbuf->texture);
+                vc5_job_add_bo(job, rsc->bo);
+
+                if (vc5->zsa->base.depth.enabled) {
+                        job->resolve |= PIPE_CLEAR_DEPTH;
+                        /* OR the bit in so that a stencil bit recorded by an
+                         * earlier draw or clear is not lost.
+                         */
+                        rsc->initialized_buffers |= PIPE_CLEAR_DEPTH;
+                }
+
+                if (vc5->zsa->base.stencil[0].enabled) {
+                        job->resolve |= PIPE_CLEAR_STENCIL;
+                        rsc->initialized_buffers |= PIPE_CLEAR_STENCIL;
+                }
+        }
+
+        /* Flag every bound color buffer that isn't already flagged for
+         * resolve, and reference its BO from the job.
+         */
+        for (int i = 0; i < VC5_MAX_DRAW_BUFFERS; i++) {
+                uint32_t bit = PIPE_CLEAR_COLOR0 << i;
+
+                if (job->resolve & bit || !job->cbufs[i])
+                        continue;
+                struct vc5_resource *rsc = vc5_resource(job->cbufs[i]->texture);
+
+                job->resolve |= bit;
+                vc5_job_add_bo(job, rsc->bo);
+        }
+
+        if (job->referenced_size > 768 * 1024 * 1024) {
+                perf_debug("Flushing job with %dkb to try to free up memory\n",
+                        job->referenced_size / 1024);
+                vc5_flush(pctx);
+        }
+
+        if (V3D_DEBUG & V3D_DEBUG_ALWAYS_FLUSH)
+                vc5_flush(pctx);
+}
+
+/**
+ * Implements pipe_context::clear by recording the clear values on the job
+ * bound to the current framebuffer.
+ *
+ * \param pctx     context whose current framebuffer is being cleared
+ * \param buffers  mask of PIPE_CLEAR_* bits selecting what to clear
+ * \param color    clear colour, applied to every selected colour buffer
+ * \param depth    clear depth, used when PIPE_CLEAR_DEPTH is set
+ * \param stencil  clear stencil, used when PIPE_CLEAR_STENCIL is set
+ *
+ * If draws are already queued on the job, the job is submitted first and a
+ * fresh one is fetched, since clears cannot be added after draws.
+ */
+static void
+vc5_clear(struct pipe_context *pctx, unsigned buffers,
+          const union pipe_color_union *color, double depth, unsigned stencil)
+{
+        struct vc5_context *vc5 = vc5_context(pctx);
+        struct vc5_job *job = vc5_get_job_for_fbo(vc5);
+
+        /* We can't flag new buffers for clearing once we've queued draws.  We
+         * could avoid this by using the 3d engine to clear.
+         */
+        if (job->draw_calls_queued) {
+                perf_debug("Flushing rendering to process new clear.\n");
+                vc5_job_submit(vc5, job);
+                job = vc5_get_job_for_fbo(vc5);
+        }
+
+        for (int i = 0; i < VC5_MAX_DRAW_BUFFERS; i++) {
+                uint32_t bit = PIPE_CLEAR_COLOR0 << i;
+                if (!(buffers & bit))
+                        continue;
+
+                struct pipe_surface *psurf = vc5->framebuffer.cbufs[i];
+                struct vc5_surface *surf = vc5_surface(psurf);
+                struct vc5_resource *rsc = vc5_resource(psurf->texture);
+
+                union util_color uc;
+                uint32_t internal_size = 4 << surf->internal_bpp;
+
+                /* Build the red/blue-swapped colour in a per-iteration local
+                 * and select it through a per-RT pointer.  The caller's
+                 * colour must stay untouched so that each render target
+                 * starts from the original value: re-pointing "color" itself
+                 * would leak the swap into later render targets, and would
+                 * swap from an already-swapped source on the next swapped
+                 * target.
+                 */
+                union pipe_color_union swapped_color;
+                const union pipe_color_union *rt_color = color;
+                if (vc5->swap_color_rb & (1 << i)) {
+                        swapped_color.f[0] = color->f[2];
+                        swapped_color.f[1] = color->f[1];
+                        swapped_color.f[2] = color->f[0];
+                        swapped_color.f[3] = color->f[3];
+                        rt_color = &swapped_color;
+                }
+
+                /* Pack the clear colour according to the surface's internal
+                 * type and store it in the job's per-RT clear colour.
+                 */
+                switch (surf->internal_type) {
+                case V3D_INTERNAL_TYPE_8:
+                        util_pack_color(rt_color->f,
+                                        PIPE_FORMAT_R8G8B8A8_UNORM, &uc);
+                        memcpy(job->clear_color[i], uc.ui, internal_size);
+                        break;
+                case V3D_INTERNAL_TYPE_8I:
+                case V3D_INTERNAL_TYPE_8UI:
+                        /* Four 8-bit channels in one word. */
+                        job->clear_color[i][0] =
+                                ((rt_color->ui[0] & 0xff) |
+                                 (rt_color->ui[1] & 0xff) << 8 |
+                                 (rt_color->ui[2] & 0xff) << 16 |
+                                 (rt_color->ui[3] & 0xff) << 24);
+                        break;
+                case V3D_INTERNAL_TYPE_16F:
+                        util_pack_color(rt_color->f,
+                                        PIPE_FORMAT_R16G16B16A16_FLOAT, &uc);
+                        memcpy(job->clear_color[i], uc.ui, internal_size);
+                        break;
+                case V3D_INTERNAL_TYPE_16I:
+                case V3D_INTERNAL_TYPE_16UI:
+                        /* Two 16-bit channels per word. */
+                        job->clear_color[i][0] = ((rt_color->ui[0] & 0xffff) |
+                                                  rt_color->ui[1] << 16);
+                        job->clear_color[i][1] = ((rt_color->ui[2] & 0xffff) |
+                                                  rt_color->ui[3] << 16);
+                        break;
+                case V3D_INTERNAL_TYPE_32F:
+                case V3D_INTERNAL_TYPE_32I:
+                case V3D_INTERNAL_TYPE_32UI:
+                        memcpy(job->clear_color[i], rt_color->ui,
+                               internal_size);
+                        break;
+                }
+
+                rsc->initialized_buffers |= bit;
+        }
+
+        unsigned zsclear = buffers & PIPE_CLEAR_DEPTHSTENCIL;
+        if (zsclear) {
+                struct vc5_resource *rsc =
+                        vc5_resource(vc5->framebuffer.zsbuf->texture);
+
+                if (zsclear & PIPE_CLEAR_DEPTH)
+                        job->clear_z = depth;
+                if (zsclear & PIPE_CLEAR_STENCIL)
+                        job->clear_s = stencil;
+
+                rsc->initialized_buffers |= zsclear;
+        }
+
+        /* The clear covers the whole framebuffer, so widen the job's draw
+         * bounds to match and mark the buffers as cleared and to be resolved.
+         */
+        job->draw_min_x = 0;
+        job->draw_min_y = 0;
+        job->draw_max_x = vc5->framebuffer.width;
+        job->draw_max_y = vc5->framebuffer.height;
+        job->cleared |= buffers;
+        job->resolve |= buffers;
+
+        vc5_start_draw(vc5);
+}
+
+/**
+ * pipe_context::clear_render_target hook.  Not implemented: every argument
+ * is ignored and a diagnostic is written to stderr.
+ */
+static void
+vc5_clear_render_target(struct pipe_context *pctx, struct pipe_surface *ps,
+                        const union pipe_color_union *color,
+                        unsigned x, unsigned y, unsigned w, unsigned h,
+                        bool render_condition_enabled)
+{
+        fputs("unimpl: clear RT\n", stderr);
+}
+
+/**
+ * pipe_context::clear_depth_stencil hook.  Not implemented: every argument
+ * is ignored and a diagnostic is written to stderr.
+ */
+static void
+vc5_clear_depth_stencil(struct pipe_context *pctx, struct pipe_surface *ps,
+                        unsigned buffers, double depth, unsigned stencil,
+                        unsigned x, unsigned y, unsigned w, unsigned h,
+                        bool render_condition_enabled)
+{
+        fputs("unimpl: clear DS\n", stderr);
+}
+
+/**
+ * Installs this file's draw and clear entry points on the given context.
+ */
+void
+v3dX(draw_init)(struct pipe_context *pctx)
+{
+        /* Clear hooks. */
+        pctx->clear = vc5_clear;
+        pctx->clear_render_target = vc5_clear_render_target;
+        pctx->clear_depth_stencil = vc5_clear_depth_stencil;
+
+        /* Draw hook. */
+        pctx->draw_vbo = vc5_draw_vbo;
+}
diff --git a/src/gallium/drivers/v3d/v3dx_emit.c b/src/gallium/drivers/v3d/v3dx_emit.c
new file mode 100644
index 00000000000..e2aba356de4
--- /dev/null
+++ b/src/gallium/drivers/v3d/v3dx_emit.c
@@ -0,0 +1,722 @@
+/*
+ * Copyright © 2014-2017 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "util/u_format.h"
+#include "util/u_half.h"
+#include "v3d_context.h"
+#include "broadcom/common/v3d_macros.h"
+#include "broadcom/cle/v3dx_pack.h"
+#include "broadcom/compiler/v3d_compiler.h"
+
+/**
+ * Translates a gallium blend factor into the matching V3D_BLEND_FACTOR_*
+ * value.
+ *
+ * \param factor         gallium blend factor to translate
+ * \param dst_alpha_one  when true, destination alpha is treated as the
+ *                       constant 1: DST_ALPHA becomes ONE and INV_DST_ALPHA
+ *                       becomes ZERO
+ */
+static uint8_t
+vc5_factor(enum pipe_blendfactor factor, bool dst_alpha_one)
+{
+        /* We may get a bad blendfactor when blending is disabled. */
+        if (factor == 0)
+                return V3D_BLEND_FACTOR_ZERO;
+
+        /* Destination alpha known to be 1: fold the two factors that read
+         * it down to constants before the general mapping below.
+         */
+        if (dst_alpha_one) {
+                if (factor == PIPE_BLENDFACTOR_DST_ALPHA)
+                        return V3D_BLEND_FACTOR_ONE;
+                if (factor == PIPE_BLENDFACTOR_INV_DST_ALPHA)
+                        return V3D_BLEND_FACTOR_ZERO;
+        }
+
+        switch (factor) {
+        /* Constants. */
+        case PIPE_BLENDFACTOR_ZERO:
+                return V3D_BLEND_FACTOR_ZERO;
+        case PIPE_BLENDFACTOR_ONE:
+                return V3D_BLEND_FACTOR_ONE;
+
+        /* Source / destination colour. */
+        case PIPE_BLENDFACTOR_SRC_COLOR:
+                return V3D_BLEND_FACTOR_SRC_COLOR;
+        case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+                return V3D_BLEND_FACTOR_INV_SRC_COLOR;
+        case PIPE_BLENDFACTOR_DST_COLOR:
+                return V3D_BLEND_FACTOR_DST_COLOR;
+        case PIPE_BLENDFACTOR_INV_DST_COLOR:
+                return V3D_BLEND_FACTOR_INV_DST_COLOR;
+
+        /* Source / destination alpha. */
+        case PIPE_BLENDFACTOR_SRC_ALPHA:
+                return V3D_BLEND_FACTOR_SRC_ALPHA;
+        case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+                return V3D_BLEND_FACTOR_INV_SRC_ALPHA;
+        case PIPE_BLENDFACTOR_DST_ALPHA:
+                return V3D_BLEND_FACTOR_DST_ALPHA;
+        case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+                return V3D_BLEND_FACTOR_INV_DST_ALPHA;
+
+        /* Blend constant. */
+        case PIPE_BLENDFACTOR_CONST_COLOR:
+                return V3D_BLEND_FACTOR_CONST_COLOR;
+        case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+                return V3D_BLEND_FACTOR_INV_CONST_COLOR;
+        case PIPE_BLENDFACTOR_CONST_ALPHA:
+                return V3D_BLEND_FACTOR_CONST_ALPHA;
+        case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+                return V3D_BLEND_FACTOR_INV_CONST_ALPHA;
+
+        case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+                return V3D_BLEND_FACTOR_SRC_ALPHA_SATURATE;
+
+        default:
+                unreachable("Bad blend factor");
+        }
+}
+
+/**
+ * Returns the half-float border colour value for one output channel of a
+ * sampler/view pair.
+ *
+ * \param devinfo  device info, forwarded to vc5_get_tex_return_size()
+ * \param sampler  sampler state supplying border_color and compare_mode
+ * \param sview    sampler view supplying the texture format
+ * \param chan     output channel index (callers in this file pass 0..3)
+ */
+static inline uint16_t
+swizzled_border_color(const struct v3d_device_info *devinfo,
+                      struct pipe_sampler_state *sampler,
+                      struct vc5_sampler_view *sview,
+                      int chan)
+{
+        const struct util_format_description *desc =
+                util_format_description(sview->base.format);
+        uint8_t swiz = chan;
+
+        /* If we're doing swizzling in the sampler, then only rearrange the
+         * border color for the mismatch between the VC5 texture format and
+         * the PIPE_FORMAT, since GL_ARB_texture_swizzle will be handled by
+         * the sampler's swizzle.
+         *
+         * For swizzling in the shader, we don't do any pre-swizzling of the
+         * border color.
+         */
+        if (vc5_get_tex_return_size(devinfo, sview->base.format,
+                                    sampler->compare_mode) != 32)
+                swiz = desc->swizzle[swiz];
+
+        switch (swiz) {
+        case PIPE_SWIZZLE_X:
+        case PIPE_SWIZZLE_Y:
+        case PIPE_SWIZZLE_Z:
+        case PIPE_SWIZZLE_W:
+                /* Only real channel selectors may index the border colour. */
+                return util_float_to_half(sampler->border_color.f[swiz]);
+        case PIPE_SWIZZLE_1:
+                return util_float_to_half(1.0);
+        case PIPE_SWIZZLE_0:
+        default:
+                /* Any selector that is not a channel or the constant 1 reads
+                 * as zero rather than indexing border_color.f[] out of
+                 * range.
+                 * NOTE(review): presumably the format description can carry
+                 * a "no channel" selector for unused components -- confirm
+                 * that zero is the desired border value in that case.
+                 */
+                return util_float_to_half(0.0);
+        }
+}
+
+#if V3D_VERSION < 40
+/**
+ * Converts a PIPE_SWIZZLE_* selector into the value written to the
+ * swizzle_r/g/b/a fields of TEXTURE_SHADER_STATE: 0 for constant zero, 1 for
+ * constant one, and 2 + the selector for the X/Y/Z/W channels.
+ */
+static uint32_t
+translate_swizzle(unsigned char pipe_swizzle)
+{
+        if (pipe_swizzle == PIPE_SWIZZLE_0)
+                return 0;
+
+        if (pipe_swizzle == PIPE_SWIZZLE_1)
+                return 1;
+
+        if (pipe_swizzle == PIPE_SWIZZLE_X ||
+            pipe_swizzle == PIPE_SWIZZLE_Y ||
+            pipe_swizzle == PIPE_SWIZZLE_Z ||
+            pipe_swizzle == PIPE_SWIZZLE_W)
+                return 2 + pipe_swizzle;
+
+        unreachable("unknown swizzle");
+}
+
+/**
+ * Builds the TEXTURE_SHADER_STATE record for texture unit \p i of a shader
+ * stage and emits it into the job's indirect command list.
+ *
+ * The record merges three sources: fields computed here from the sampler and
+ * view (border colour, LOD clamps, base pointer, swizzle, filter), plus the
+ * pre-packed bytes stored on the sampler view and on the sampler state,
+ * which are OR'd in byte by byte.
+ *
+ * \param vc5        context providing the current job and device info
+ * \param stage_tex  per-stage texture state; texture_state[i] is updated
+ *                   with the BO and offset of the emitted record
+ * \param i          texture unit index; textures[i] must be non-NULL
+ */
+static void
+emit_one_texture(struct vc5_context *vc5, struct vc5_texture_stateobj *stage_tex,
+                 int i)
+{
+        struct vc5_job *job = vc5->job;
+        struct pipe_sampler_state *psampler = stage_tex->samplers[i];
+        struct vc5_sampler_state *sampler = vc5_sampler_state(psampler);
+        struct pipe_sampler_view *psview = stage_tex->textures[i];
+        struct vc5_sampler_view *sview = vc5_sampler_view(psview);
+        struct pipe_resource *prsc = psview->texture;
+        struct vc5_resource *rsc = vc5_resource(prsc);
+        const struct v3d_device_info *devinfo = &vc5->screen->devinfo;
+
+        /* Reserve the space up front so the record's location can be stored
+         * on the stage state before anything is written.
+         */
+        stage_tex->texture_state[i].offset =
+                vc5_cl_ensure_space(&job->indirect,
+                                    cl_packet_length(TEXTURE_SHADER_STATE),
+                                    32);
+        vc5_bo_set_reference(&stage_tex->texture_state[i].bo,
+                             job->indirect.bo);
+
+        uint32_t return_size = vc5_get_tex_return_size(devinfo, psview->format,
+                                                       psampler->compare_mode);
+
+        struct V3D33_TEXTURE_SHADER_STATE unpacked = {
+                /* XXX */
+                .border_color_red = swizzled_border_color(devinfo, psampler,
+                                                          sview, 0),
+                .border_color_green = swizzled_border_color(devinfo, psampler,
+                                                            sview, 1),
+                .border_color_blue = swizzled_border_color(devinfo, psampler,
+                                                           sview, 2),
+                .border_color_alpha = swizzled_border_color(devinfo, psampler,
+                                                            sview, 3),
+
+                /* In the normal texturing path, the LOD gets clamped between
+                 * min/max, and the base_level field (set in the sampler view
+                 * from first_level) only decides where the min/mag switch
+                 * happens, so we need to use the LOD clamps to keep us
+                 * between min and max.
+                 *
+                 * For txf, the LOD clamp is still used, despite GL not
+                 * wanting that.  We will need to have a separate
+                 * TEXTURE_SHADER_STATE that ignores psview->min/max_lod to
+                 * support txf properly.
+                 */
+                .min_level_of_detail = MIN2(psview->u.tex.first_level +
+                                            MAX2(psampler->min_lod, 0),
+                                            psview->u.tex.last_level),
+                .max_level_of_detail = MIN2(psview->u.tex.first_level +
+                                            psampler->max_lod,
+                                            psview->u.tex.last_level),
+
+                .texture_base_pointer = cl_address(rsc->bo,
+                                                   rsc->slices[0].offset),
+
+                .output_32_bit = return_size == 32,
+        };
+
+        /* Set up the sampler swizzle if we're doing 16-bit sampling.  For
+         * 32-bit, we leave swizzling up to the shader compiler.
+         *
+         * Note: Contrary to the docs, the swizzle still applies even if the
+         * return size is 32.  It's just that you probably want to swizzle in
+         * the shader, because you need the Y/Z/W channels to be defined.
+         */
+        if (return_size == 32) {
+                unpacked.swizzle_r = translate_swizzle(PIPE_SWIZZLE_X);
+                unpacked.swizzle_g = translate_swizzle(PIPE_SWIZZLE_Y);
+                unpacked.swizzle_b = translate_swizzle(PIPE_SWIZZLE_Z);
+                unpacked.swizzle_a = translate_swizzle(PIPE_SWIZZLE_W);
+        } else {
+                unpacked.swizzle_r = translate_swizzle(sview->swizzle[0]);
+                unpacked.swizzle_g = translate_swizzle(sview->swizzle[1]);
+                unpacked.swizzle_b = translate_swizzle(sview->swizzle[2]);
+                unpacked.swizzle_a = translate_swizzle(sview->swizzle[3]);
+        }
+
+        int min_img_filter = psampler->min_img_filter;
+        int min_mip_filter = psampler->min_mip_filter;
+        int mag_img_filter = psampler->mag_img_filter;
+
+        /* 32-bit return size: force nearest for the mip, minification and
+         * magnification filters alike.  The minification image filter used
+         * to be left at the sampler's setting because the magnification
+         * filter was assigned twice.
+         */
+        if (return_size == 32) {
+                min_mip_filter = PIPE_TEX_MIPFILTER_NEAREST;
+                min_img_filter = PIPE_TEX_FILTER_NEAREST;
+                mag_img_filter = PIPE_TEX_FILTER_NEAREST;
+        }
+
+        /* Accumulate the filter field from the three filter settings.
+         * unpacked.filter starts at zero from the initializer above.
+         * NOTE(review): the sums presumably land on the hardware's
+         * V3D_TMU_FILTER_* encoding -- confirm against the packet
+         * definition.
+         */
+        bool min_nearest = min_img_filter == PIPE_TEX_FILTER_NEAREST;
+        switch (min_mip_filter) {
+        case PIPE_TEX_MIPFILTER_NONE:
+                unpacked.filter += min_nearest ? 2 : 0;
+                break;
+        case PIPE_TEX_MIPFILTER_NEAREST:
+                unpacked.filter += min_nearest ? 4 : 8;
+                break;
+        case PIPE_TEX_MIPFILTER_LINEAR:
+                unpacked.filter += min_nearest ? 4 : 8;
+                unpacked.filter += 2;
+                break;
+        }
+
+        if (mag_img_filter == PIPE_TEX_FILTER_NEAREST)
+                unpacked.filter++;
+
+        /* Any non-zero anisotropy replaces the filter computed above. */
+        if (psampler->max_anisotropy > 8)
+                unpacked.filter = V3D_TMU_FILTER_ANISOTROPIC_16_1;
+        else if (psampler->max_anisotropy > 4)
+                unpacked.filter = V3D_TMU_FILTER_ANISOTROPIC_8_1;
+        else if (psampler->max_anisotropy > 2)
+                unpacked.filter = V3D_TMU_FILTER_ANISOTROPIC_4_1;
+        else if (psampler->max_anisotropy)
+                unpacked.filter = V3D_TMU_FILTER_ANISOTROPIC_2_1;
+
+        uint8_t packed[cl_packet_length(TEXTURE_SHADER_STATE)];
+        cl_packet_pack(TEXTURE_SHADER_STATE)(&job->indirect, packed, &unpacked);
+
+        /* Merge in the bytes pre-packed on the view and the sampler.  The
+         * index is named "byte" so it does not shadow the texture unit
+         * parameter "i".
+         */
+        for (int byte = 0; byte < ARRAY_SIZE(packed); byte++) {
+                packed[byte] |= sview->texture_shader_state[byte] |
+                                sampler->texture_shader_state[byte];
+        }
+
+        /* TMU indirect structs need to be 32b aligned. */
+        vc5_cl_ensure_space(&job->indirect, ARRAY_SIZE(packed), 32);
+        cl_emit_prepacked(&job->indirect, &packed);
+}
+
+/**
+ * Emits texture state for every bound texture of one shader stage; unbound
+ * (NULL) slots are skipped.
+ */
+static void
+emit_textures(struct vc5_context *vc5, struct vc5_texture_stateobj *stage_tex)
+{
+        int unit;
+
+        for (unit = 0; unit < stage_tex->num_textures; unit++) {
+                if (!stage_tex->textures[unit])
+                        continue;
+
+                emit_one_texture(vc5, stage_tex, unit);
+        }
+}
+#endif /* V3D_VERSION < 40 */
+
+/**
+ * Converts a gallium colour write mask for render target \p rt into the
+ * inverted 4-bit value written to COLOUR_WRITE_MASKS.
+ *
+ * When the context's swap_color_rb bit for \p rt is set, bits 0 and 2 of
+ * the mask trade places before the inversion; bits 1 and 3 are unchanged.
+ */
+static uint32_t
+translate_colormask(struct vc5_context *vc5, uint32_t colormask, int rt)
+{
+        uint32_t mask = colormask;
+
+        if (vc5->swap_color_rb & (1 << rt)) {
+                uint32_t kept = mask & (2 | 8);
+                uint32_t bit0_as_bit2 = (mask & 1) << 2;
+                uint32_t bit2_as_bit0 = (mask & 4) >> 2;
+
+                mask = kept | bit0_as_bit2 | bit2_as_bit0;
+        }
+
+        return ~mask & 0xf;
+}
+
+/**
+ * Emits a BLEND_CONFIG packet into the job's binner command list for render
+ * target \p rt, using that target's entry in \p blend.
+ *
+ * For V3D_VERSION >= 40 the packet carries a render target mask selecting
+ * \p rt; on older versions only rt 0 is accepted (asserted).
+ */
+static void
+emit_rt_blend(struct vc5_context *vc5, struct vc5_job *job,
+              struct pipe_blend_state *blend, int rt)
+{
+        struct pipe_rt_blend_state *rtblend = &blend->rt[rt];
+        bool dst_alpha_one = vc5->blend_dst_alpha_one;
+
+        cl_emit(&job->bcl, BLEND_CONFIG, config) {
+#if V3D_VERSION >= 40
+                config.render_target_mask = 1 << rt;
+#else
+                assert(rt == 0);
+#endif
+
+                /* Colour (RGB) equation. */
+                config.colour_blend_mode = rtblend->rgb_func;
+                config.colour_blend_src_factor =
+                        vc5_factor(rtblend->rgb_src_factor, dst_alpha_one);
+                config.colour_blend_dst_factor =
+                        vc5_factor(rtblend->rgb_dst_factor, dst_alpha_one);
+
+                /* Alpha equation. */
+                config.alpha_blend_mode = rtblend->alpha_func;
+                config.alpha_blend_src_factor =
+                        vc5_factor(rtblend->alpha_src_factor, dst_alpha_one);
+                config.alpha_blend_dst_factor =
+                        vc5_factor(rtblend->alpha_dst_factor, dst_alpha_one);
+        }
+}
+
+void
+v3dX(emit_state)(struct pipe_context *pctx)
+{
+        struct vc5_context *vc5 = vc5_context(pctx);
+        struct vc5_job *job = vc5->job;
+        bool rasterizer_discard = vc5->rasterizer->base.rasterizer_discard;
+
+        if (vc5->dirty & (VC5_DIRTY_SCISSOR | VC5_DIRTY_VIEWPORT |
+                          VC5_DIRTY_RASTERIZER)) {
+                float *vpscale = vc5->viewport.scale;
+                float *vptranslate = vc5->viewport.translate;
+                float vp_minx = -fabsf(vpscale[0]) + vptranslate[0];
+                float vp_maxx = fabsf(vpscale[0]) + vptranslate[0];
+                float vp_miny = -fabsf(vpscale[1]) + vptranslate[1];
+                float vp_maxy = fabsf(vpscale[1]) + vptranslate[1];
+
+                /* Clip to the scissor if it's enabled, but still clip to the
+                 * drawable regardless since that controls where the binner
+                 * tries to put things.
+                 *
+                 * Additionally, always clip the rendering to the viewport,
+                 * since the hardware does guardband clipping, meaning
+                 * primitives would rasterize outside of the view volume.
+                 */
+                uint32_t minx, miny, maxx, maxy;
+                if (!vc5->rasterizer->base.scissor) {
+                        minx = MAX2(vp_minx, 0);
+                        miny = MAX2(vp_miny, 0);
+                        maxx = MIN2(vp_maxx, job->draw_width);
+                        maxy = MIN2(vp_maxy, job->draw_height);
+                } else {
+                        minx = MAX2(vp_minx, vc5->scissor.minx);
+                        miny = MAX2(vp_miny, vc5->scissor.miny);
+                        maxx = MIN2(vp_maxx, vc5->scissor.maxx);
+                        maxy = MIN2(vp_maxy, vc5->scissor.maxy);
+                }
+
+                cl_emit(&job->bcl, CLIP_WINDOW, clip) {
+                        clip.clip_window_left_pixel_coordinate = minx;
+                        clip.clip_window_bottom_pixel_coordinate = miny;
+                        clip.clip_window_width_in_pixels = maxx - minx;
+                        clip.clip_window_height_in_pixels = maxy - miny;
+
+#if V3D_VERSION < 41
+                        /* The HW won't entirely clip out when scissor w/h is
+                         * 0.  Just treat it the same as rasterizer discard.
+                         */
+                        if (clip.clip_window_width_in_pixels == 0 ||
+                            clip.clip_window_height_in_pixels == 0) {
+                                rasterizer_discard = true;
+                                clip.clip_window_width_in_pixels = 1;
+                                clip.clip_window_height_in_pixels = 1;
+                        }
+#endif
+                }
+
+                job->draw_min_x = MIN2(job->draw_min_x, minx);
+                job->draw_min_y = MIN2(job->draw_min_y, miny);
+                job->draw_max_x = MAX2(job->draw_max_x, maxx);
+                job->draw_max_y = MAX2(job->draw_max_y, maxy);
+        }
+
+        if (vc5->dirty & (VC5_DIRTY_RASTERIZER |
+                          VC5_DIRTY_ZSA |
+                          VC5_DIRTY_BLEND |
+                          VC5_DIRTY_COMPILED_FS)) {
+                cl_emit(&job->bcl, CONFIGURATION_BITS, config) {
+                        config.enable_forward_facing_primitive =
+                                !rasterizer_discard &&
+                                !(vc5->rasterizer->base.cull_face &
+                                  PIPE_FACE_FRONT);
+                        config.enable_reverse_facing_primitive =
+                                !rasterizer_discard &&
+                                !(vc5->rasterizer->base.cull_face &
+                                  PIPE_FACE_BACK);
+                        /* This seems backwards, but it's what gets the
+                         * clipflat test to pass.
+                         */
+                        config.clockwise_primitives =
+                                vc5->rasterizer->base.front_ccw;
+
+                        config.enable_depth_offset =
+                                vc5->rasterizer->base.offset_tri;
+
+                        config.rasterizer_oversample_mode =
+                                vc5->rasterizer->base.multisample;
+
+                        config.direct3d_provoking_vertex =
+                                vc5->rasterizer->base.flatshade_first;
+
+                        config.blend_enable = vc5->blend->rt[0].blend_enable;
+
+                        /* Note: EZ state may update based on the compiled FS,
+                         * along with ZSA
+                         */
+                        config.early_z_updates_enable =
+                                (job->ez_state != VC5_EZ_DISABLED);
+                        if (vc5->zsa->base.depth.enabled) {
+                                config.z_updates_enable =
+                                        vc5->zsa->base.depth.writemask;
+                                config.early_z_enable =
+                                        config.early_z_updates_enable;
+                                config.depth_test_function =
+                                        vc5->zsa->base.depth.func;
+                        } else {
+                                config.depth_test_function = PIPE_FUNC_ALWAYS;
+                        }
+
+                        config.stencil_enable =
+                                vc5->zsa->base.stencil[0].enabled;
+                }
+
+        }
+
+        if (vc5->dirty & VC5_DIRTY_RASTERIZER &&
+            vc5->rasterizer->base.offset_tri) {
+                cl_emit(&job->bcl, DEPTH_OFFSET, depth) {
+                        depth.depth_offset_factor =
+                                vc5->rasterizer->offset_factor;
+                        depth.depth_offset_units =
+                                vc5->rasterizer->offset_units;
+                }
+        }
+
+        if (vc5->dirty & VC5_DIRTY_RASTERIZER) {
+                cl_emit(&job->bcl, POINT_SIZE, point_size) {
+                        point_size.point_size = vc5->rasterizer->point_size;
+                }
+
+                cl_emit(&job->bcl, LINE_WIDTH, line_width) {
+                        line_width.line_width = vc5->rasterizer->base.line_width;
+                }
+        }
+
+        if (vc5->dirty & VC5_DIRTY_VIEWPORT) {
+                cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
+                        clip.viewport_half_width_in_1_256th_of_pixel =
+                                vc5->viewport.scale[0] * 256.0f;
+                        clip.viewport_half_height_in_1_256th_of_pixel =
+                                vc5->viewport.scale[1] * 256.0f;
+                }
+
+                cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) {
+                        clip.viewport_z_offset_zc_to_zs =
+                                vc5->viewport.translate[2];
+                        clip.viewport_z_scale_zc_to_zs =
+                                vc5->viewport.scale[2];
+                }
+                cl_emit(&job->bcl, CLIPPER_Z_MIN_MAX_CLIPPING_PLANES, clip) {
+                        clip.minimum_zw = (vc5->viewport.translate[2] -
+                                           vc5->viewport.scale[2]);
+                        clip.maximum_zw = (vc5->viewport.translate[2] +
+                                           vc5->viewport.scale[2]);
+                }
+
+                cl_emit(&job->bcl, VIEWPORT_OFFSET, vp) {
+                        vp.viewport_centre_x_coordinate =
+                                vc5->viewport.translate[0];
+                        vp.viewport_centre_y_coordinate =
+                                vc5->viewport.translate[1];
+                }
+        }
+
+        if (vc5->dirty & VC5_DIRTY_BLEND && vc5->blend->rt[0].blend_enable) {
+                struct pipe_blend_state *blend = vc5->blend;
+
+                if (blend->independent_blend_enable) {
+                        for (int i = 0; i < VC5_MAX_DRAW_BUFFERS; i++)
+                                emit_rt_blend(vc5, job, blend, i);
+                } else {
+                        emit_rt_blend(vc5, job, blend, 0);
+                }
+        }
+
+        if (vc5->dirty & VC5_DIRTY_BLEND) {
+                struct pipe_blend_state *blend = vc5->blend;
+
+                cl_emit(&job->bcl, COLOUR_WRITE_MASKS, mask) {
+                        if (blend->independent_blend_enable) {
+                                mask.render_target_0_per_colour_component_write_masks =
+                                        translate_colormask(vc5, blend->rt[0].colormask, 0);
+                                mask.render_target_1_per_colour_component_write_masks =
+                                        translate_colormask(vc5, blend->rt[1].colormask, 1);
+                                mask.render_target_2_per_colour_component_write_masks =
+                                        translate_colormask(vc5, blend->rt[2].colormask, 2);
+                                mask.render_target_3_per_colour_component_write_masks =
+                                        translate_colormask(vc5, blend->rt[3].colormask, 3);
+                        } else {
+                                mask.render_target_0_per_colour_component_write_masks =
+                                        translate_colormask(vc5, blend->rt[0].colormask, 0);
+                                mask.render_target_1_per_colour_component_write_masks =
+                                        translate_colormask(vc5, blend->rt[0].colormask, 1);
+                                mask.render_target_2_per_colour_component_write_masks =
+                                        translate_colormask(vc5, blend->rt[0].colormask, 2);
+                                mask.render_target_3_per_colour_component_write_masks =
+                                        translate_colormask(vc5, blend->rt[0].colormask, 3);
+                        }
+                }
+        }
+
+        /* GFXH-1431: On V3D 3.x, writing BLEND_CONFIG resets the constant
+         * color.
+         */
+        if (vc5->dirty & VC5_DIRTY_BLEND_COLOR ||
+            (V3D_VERSION < 41 && (vc5->dirty & VC5_DIRTY_BLEND))) {
+                cl_emit(&job->bcl, BLEND_CONSTANT_COLOUR, colour) {
+                        colour.red_f16 = (vc5->swap_color_rb ?
+                                          vc5->blend_color.hf[2] :
+                                          vc5->blend_color.hf[0]);
+                        colour.green_f16 = vc5->blend_color.hf[1];
+                        colour.blue_f16 = (vc5->swap_color_rb ?
+                                           vc5->blend_color.hf[0] :
+                                           vc5->blend_color.hf[2]);
+                        colour.alpha_f16 = vc5->blend_color.hf[3];
+                }
+        }
+
+        if (vc5->dirty & (VC5_DIRTY_ZSA | VC5_DIRTY_STENCIL_REF)) {
+                struct pipe_stencil_state *front = &vc5->zsa->base.stencil[0];
+                struct pipe_stencil_state *back = &vc5->zsa->base.stencil[1];
+
+                if (front->enabled) {
+                        cl_emit_with_prepacked(&job->bcl, STENCIL_CONFIG,
+                                               vc5->zsa->stencil_front, config) {
+                                config.stencil_ref_value =
+                                        vc5->stencil_ref.ref_value[0];
+                        }
+                }
+
+                if (back->enabled) {
+                        cl_emit_with_prepacked(&job->bcl, STENCIL_CONFIG,
+                                               vc5->zsa->stencil_back, config) {
+                                config.stencil_ref_value =
+                                        vc5->stencil_ref.ref_value[1];
+                        }
+                }
+        }
+
+#if V3D_VERSION < 40
+        /* Pre-4.x, we have texture state that depends on both the sampler and
+         * the view, so we merge them together at draw time.
+         */
+        if (vc5->dirty & VC5_DIRTY_FRAGTEX)
+                emit_textures(vc5, &vc5->fragtex);
+
+        if (vc5->dirty & VC5_DIRTY_VERTTEX)
+                emit_textures(vc5, &vc5->verttex);
+#endif
+
+        if (vc5->dirty & VC5_DIRTY_FLAT_SHADE_FLAGS) {
+                bool emitted_any = false;
+
+                for (int i = 0; i < ARRAY_SIZE(vc5->prog.fs->prog_data.fs->flat_shade_flags); i++) {
+                        if (!vc5->prog.fs->prog_data.fs->flat_shade_flags[i])
+                                continue;
+
+                        cl_emit(&job->bcl, FLAT_SHADE_FLAGS, flags) {
+                                flags.varying_offset_v0 = i;
+
+                                if (emitted_any) {
+                                        flags.action_for_flat_shade_flags_of_lower_numbered_varyings =
+                                                V3D_VARYING_FLAGS_ACTION_UNCHANGED;
+                                        flags.action_for_flat_shade_flags_of_higher_numbered_varyings =
+                                                V3D_VARYING_FLAGS_ACTION_UNCHANGED;
+                                } else {
+                                        flags.action_for_flat_shade_flags_of_lower_numbered_varyings =
+                                                ((i == 0) ?
+                                                 V3D_VARYING_FLAGS_ACTION_UNCHANGED :
+                                                 V3D_VARYING_FLAGS_ACTION_ZEROED);
+
+                                        flags.action_for_flat_shade_flags_of_higher_numbered_varyings =
+                                                V3D_VARYING_FLAGS_ACTION_ZEROED;
+                                }
+
+                                flags.flat_shade_flags_for_varyings_v024 =
+                                        vc5->prog.fs->prog_data.fs->flat_shade_flags[i];
+                        }
+
+                        emitted_any = true;
+                }
+
+                if (!emitted_any) {
+                        cl_emit(&job->bcl, ZERO_ALL_FLAT_SHADE_FLAGS, flags);
+                }
+        }
+
+#if V3D_VERSION >= 40
+        if (vc5->dirty & VC5_DIRTY_CENTROID_FLAGS) {
+                bool emitted_any = false;
+
+                for (int i = 0; i < ARRAY_SIZE(vc5->prog.fs->prog_data.fs->centroid_flags); i++) {
+                        if (!vc5->prog.fs->prog_data.fs->centroid_flags[i])
+                                continue;
+
+                        cl_emit(&job->bcl, CENTROID_FLAGS, flags) {
+                                flags.varying_offset_v0 = i;
+
+                                if (emitted_any) {
+                                        flags.action_for_centroid_flags_of_lower_numbered_varyings =
+                                                V3D_VARYING_FLAGS_ACTION_UNCHANGED;
+                                        flags.action_for_centroid_flags_of_higher_numbered_varyings =
+                                                V3D_VARYING_FLAGS_ACTION_UNCHANGED;
+                                } else {
+                                        flags.action_for_centroid_flags_of_lower_numbered_varyings =
+                                                ((i == 0) ?
+                                                 V3D_VARYING_FLAGS_ACTION_UNCHANGED :
+                                                 V3D_VARYING_FLAGS_ACTION_ZEROED);
+
+                                        flags.action_for_centroid_flags_of_higher_numbered_varyings =
+                                                V3D_VARYING_FLAGS_ACTION_ZEROED;
+                                }
+
+                                flags.centroid_flags_for_varyings_v024 =
+                                        vc5->prog.fs->prog_data.fs->centroid_flags[i];
+                        }
+
+                        emitted_any = true;
+                }
+
+                if (!emitted_any) {
+                        cl_emit(&job->bcl, ZERO_ALL_CENTROID_FLAGS, flags);
+                }
+        }
+#endif
+
+        /* Set up the transform feedback data specs (which VPM entries to
+         * output to which buffers).
+         */
+        if (vc5->dirty & (VC5_DIRTY_STREAMOUT |
+                          VC5_DIRTY_RASTERIZER |
+                          VC5_DIRTY_PRIM_MODE)) {
+                struct vc5_streamout_stateobj *so = &vc5->streamout;
+
+                if (so->num_targets) {
+                        bool psiz_per_vertex = (vc5->prim_mode == PIPE_PRIM_POINTS &&
+                                                vc5->rasterizer->base.point_size_per_vertex);
+                        uint16_t *tf_specs = (psiz_per_vertex ?
+                                              vc5->prog.bind_vs->tf_specs_psiz :
+                                              vc5->prog.bind_vs->tf_specs);
+
+#if V3D_VERSION >= 40
+                        job->tf_enabled = (vc5->prog.bind_vs->num_tf_specs != 0 &&
+                                           vc5->active_queries);
+
+                        cl_emit(&job->bcl, TRANSFORM_FEEDBACK_SPECS, tfe) {
+                                tfe.number_of_16_bit_output_data_specs_following =
+                                        vc5->prog.bind_vs->num_tf_specs;
+                                tfe.enable = job->tf_enabled;
+                        };
+#else /* V3D_VERSION < 40 */
+                        cl_emit(&job->bcl, TRANSFORM_FEEDBACK_ENABLE, tfe) {
+                                tfe.number_of_32_bit_output_buffer_address_following =
+                                        so->num_targets;
+                                tfe.number_of_16_bit_output_data_specs_following =
+                                        vc5->prog.bind_vs->num_tf_specs;
+                        };
+#endif /* V3D_VERSION < 40 */
+                        for (int i = 0; i < vc5->prog.bind_vs->num_tf_specs; i++) {
+                                cl_emit_prepacked(&job->bcl, &tf_specs[i]);
+                        }
+                } else if (job->tf_enabled) {
+#if V3D_VERSION >= 40
+                        cl_emit(&job->bcl, TRANSFORM_FEEDBACK_SPECS, tfe) {
+                                tfe.enable = false;
+                        };
+                        job->tf_enabled = false;
+#endif /* V3D_VERSION >= 40 */
+                }
+        }
+
+        /* Set up the transform feedback buffers. */
+        if (vc5->dirty & VC5_DIRTY_STREAMOUT) {
+                struct vc5_streamout_stateobj *so = &vc5->streamout;
+                for (int i = 0; i < so->num_targets; i++) {
+                        const struct pipe_stream_output_target *target =
+                                so->targets[i];
+                        struct vc5_resource *rsc = target ?
+                                vc5_resource(target->buffer) : NULL;
+
+#if V3D_VERSION >= 40
+                        if (!target)
+                                continue;
+
+                        cl_emit(&job->bcl, TRANSFORM_FEEDBACK_BUFFER, output) {
+                                output.buffer_address =
+                                        cl_address(rsc->bo,
+                                                   target->buffer_offset);
+                                output.buffer_size_in_32_bit_words =
+                                        target->buffer_size >> 2;
+                                output.buffer_number = i;
+                        }
+#else /* V3D_VERSION < 40 */
+                        cl_emit(&job->bcl, TRANSFORM_FEEDBACK_OUTPUT_ADDRESS, output) {
+                                if (target) {
+                                        output.address =
+                                                cl_address(rsc->bo,
+                                                           target->buffer_offset);
+                                }
+                        };
+#endif /* V3D_VERSION < 40 */
+                        if (target) {
+                                vc5_job_add_write_resource(vc5->job,
+                                                           target->buffer);
+                        }
+                        /* XXX: buffer_size? */
+                }
+        }
+
+        if (vc5->dirty & VC5_DIRTY_OQ) {
+                cl_emit(&job->bcl, OCCLUSION_QUERY_COUNTER, counter) {
+                        job->oq_enabled = vc5->active_queries && vc5->current_oq;
+                        if (job->oq_enabled) {
+                                counter.address = cl_address(vc5->current_oq, 0);
+                        }
+                }
+        }
+}
diff --git a/src/gallium/drivers/v3d/v3dx_format_table.c b/src/gallium/drivers/v3d/v3dx_format_table.c
new file mode 100644
index 00000000000..458488119c7
--- /dev/null
+++ b/src/gallium/drivers/v3d/v3dx_format_table.c
@@ -0,0 +1,318 @@
+/*
+ * Copyright © 2014-2018 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "util/u_format.h"
+
+#include "v3d_context.h"
+#include "broadcom/cle/v3dx_pack.h"
+#include "broadcom/common/v3d_macros.h"
+#include "v3d_format_table.h"
+
+/* Expands to a four-entry brace initializer of PIPE_SWIZZLE_* values, one
+ * per output channel, built by pasting each argument onto PIPE_SWIZZLE_.
+ */
+#define SWIZ(x,y,z,w) {          \
+        PIPE_SWIZZLE_##x, \
+        PIPE_SWIZZLE_##y, \
+        PIPE_SWIZZLE_##z, \
+        PIPE_SWIZZLE_##w  \
+}
+
+/* Expands to a designated array initializer for the PIPE_FORMAT_<pipe> slot
+ * of a table.  The struct members are filled positionally, in this order:
+ *
+ *   true                          - NOTE(review): presumably the "present"
+ *                                   member tested by get_format_desc();
+ *                                   confirm against struct vc5_format.
+ *   V3D_OUTPUT_IMAGE_FORMAT_<rt>  - output image format; entries pass NO
+ *                                   here (presumably "not renderable").
+ *   TEXTURE_DATA_FORMAT_<tex>     - texture data format.
+ *   swiz                          - a SWIZ()-style initializer.
+ *   return_size, return_channels  - passed through unchanged.
+ *
+ * Because the members are positional, the argument order here must match the
+ * member order of the struct the table is declared with.
+ */
+#define FORMAT(pipe, rt, tex, swiz, return_size, return_channels)       \
+        [PIPE_FORMAT_##pipe] = {                                        \
+                true,                                                   \
+                V3D_OUTPUT_IMAGE_FORMAT_##rt,                           \
+                TEXTURE_DATA_FORMAT_##tex,                              \
+                swiz,                                                   \
+                return_size,                                            \
+                return_channels,                                        \
+        }
+
+/* Shorthand swizzles; each name spells out its four SWIZ() selectors in
+ * order.
+ */
+#define SWIZ_X001	SWIZ(X, 0, 0, 1)
+#define SWIZ_XY01	SWIZ(X, Y, 0, 1)
+#define SWIZ_XYZ1	SWIZ(X, Y, Z, 1)
+#define SWIZ_XYZW	SWIZ(X, Y, Z, W)
+#define SWIZ_YZWX	SWIZ(Y, Z, W, X)
+#define SWIZ_YZW1	SWIZ(Y, Z, W, 1)
+#define SWIZ_ZYXW	SWIZ(Z, Y, X, W)
+#define SWIZ_ZYX1	SWIZ(Z, Y, X, 1)
+#define SWIZ_XXXY	SWIZ(X, X, X, Y)
+#define SWIZ_XXX1	SWIZ(X, X, X, 1)
+#define SWIZ_XXXX	SWIZ(X, X, X, X)
+#define SWIZ_000X	SWIZ(0, 0, 0, X)
+
+/* Hardware format description for each supported gallium format, indexed by
+ * PIPE_FORMAT_*.  Slots with no FORMAT() entry are left zero-initialized.
+ */
+static const struct vc5_format format_table[] = {
+        FORMAT(B8G8R8A8_UNORM,    RGBA8,        RGBA8,       SWIZ_ZYXW, 16, 0),
+        FORMAT(B8G8R8X8_UNORM,    RGBA8,        RGBA8,       SWIZ_ZYX1, 16, 0),
+        FORMAT(B8G8R8A8_SRGB,     SRGB8_ALPHA8, RGBA8,       SWIZ_ZYXW, 16, 0),
+        FORMAT(B8G8R8X8_SRGB,     SRGB8_ALPHA8, RGBA8,       SWIZ_ZYX1, 16, 0),
+        FORMAT(R8G8B8A8_UNORM,    RGBA8,        RGBA8,       SWIZ_XYZW, 16, 0),
+        FORMAT(R8G8B8X8_UNORM,    RGBA8,        RGBA8,       SWIZ_XYZ1, 16, 0),
+        FORMAT(R8G8B8A8_SNORM,    NO,           RGBA8_SNORM, SWIZ_XYZW, 16, 0),
+        FORMAT(R8G8B8X8_SNORM,    NO,           RGBA8_SNORM, SWIZ_XYZ1, 16, 0),
+        FORMAT(R10G10B10A2_UNORM, RGB10_A2,     RGB10_A2,    SWIZ_XYZW, 16, 0),
+        FORMAT(R10G10B10A2_UINT,  RGB10_A2UI,   RGB10_A2UI,  SWIZ_XYZW, 16, 0),
+
+        FORMAT(A4B4G4R4_UNORM,    ABGR4444,     RGBA4,       SWIZ_XYZW, 16, 0),
+
+        FORMAT(A1B5G5R5_UNORM,    ABGR1555,     RGB5_A1,     SWIZ_XYZW, 16, 0),
+        FORMAT(X1B5G5R5_UNORM,    ABGR1555,     RGB5_A1,     SWIZ_XYZ1, 16, 0),
+        FORMAT(B5G6R5_UNORM,      BGR565,       RGB565,      SWIZ_XYZ1, 16, 0),
+
+        FORMAT(R8_UNORM,          R8,           R8,          SWIZ_X001, 16, 0),
+        FORMAT(R8_SNORM,          NO,           R8_SNORM,    SWIZ_X001, 16, 0),
+        FORMAT(R8G8_UNORM,        RG8,          RG8,         SWIZ_XY01, 16, 0),
+        FORMAT(R8G8_SNORM,        NO,           RG8_SNORM,   SWIZ_XY01, 16, 0),
+
+        FORMAT(R16_UNORM,         NO,           R16,         SWIZ_X001, 32, 1),
+        FORMAT(R16_SNORM,         NO,           R16_SNORM,   SWIZ_X001, 32, 1),
+        FORMAT(R16_FLOAT,         R16F,         R16F,        SWIZ_X001, 16, 0),
+        FORMAT(R32_FLOAT,         R32F,         R32F,        SWIZ_X001, 32, 1),
+
+        FORMAT(R16G16_UNORM,      NO,           RG16,        SWIZ_XY01, 32, 2),
+        FORMAT(R16G16_SNORM,      NO,           RG16_SNORM,  SWIZ_XY01, 32, 2),
+        FORMAT(R16G16_FLOAT,      RG16F,        RG16F,       SWIZ_XY01, 16, 0),
+        FORMAT(R32G32_FLOAT,      RG32F,        RG32F,       SWIZ_XY01, 32, 2),
+
+        FORMAT(R16G16B16A16_UNORM, NO,          RGBA16,      SWIZ_XYZW, 32, 4),
+        FORMAT(R16G16B16A16_SNORM, NO,          RGBA16_SNORM, SWIZ_XYZW, 32, 4),
+        FORMAT(R16G16B16A16_FLOAT, RGBA16F,     RGBA16F,     SWIZ_XYZW, 16, 0),
+        FORMAT(R32G32B32A32_FLOAT, RGBA32F,     RGBA32F,     SWIZ_XYZW, 32, 4),
+
+        /* If we don't have L/A/LA16, mesa/st will fall back to RGBA16. */
+        FORMAT(L16_UNORM,         NO,           R16,         SWIZ_XXX1, 32, 1),
+        FORMAT(L16_SNORM,         NO,           R16_SNORM,   SWIZ_XXX1, 32, 1),
+        FORMAT(I16_UNORM,         NO,           R16,         SWIZ_XXXX, 32, 1),
+        FORMAT(I16_SNORM,         NO,           R16_SNORM,   SWIZ_XXXX, 32, 1),
+        FORMAT(A16_UNORM,         NO,           R16,         SWIZ_000X, 32, 1),
+        FORMAT(A16_SNORM,         NO,           R16_SNORM,   SWIZ_000X, 32, 1),
+        FORMAT(L16A16_UNORM,      NO,           RG16,        SWIZ_XXXY, 32, 2),
+        FORMAT(L16A16_SNORM,      NO,           RG16_SNORM,  SWIZ_XXXY, 32, 2),
+
+        FORMAT(A8_UNORM,          NO,           R8,          SWIZ_000X, 16, 0),
+        FORMAT(L8_UNORM,          NO,           R8,          SWIZ_XXX1, 16, 0),
+        FORMAT(I8_UNORM,          NO,           R8,          SWIZ_XXXX, 16, 0),
+        FORMAT(L8A8_UNORM,        NO,           RG8,         SWIZ_XXXY, 16, 0),
+
+        FORMAT(R8_SINT,           R8I,          R8I,         SWIZ_X001, 16, 0),
+        FORMAT(R8_UINT,           R8UI,         R8UI,        SWIZ_X001, 16, 0),
+        FORMAT(R8G8_SINT,         RG8I,         RG8I,        SWIZ_XY01, 16, 0),
+        FORMAT(R8G8_UINT,         RG8UI,        RG8UI,       SWIZ_XY01, 16, 0),
+        FORMAT(R8G8B8A8_SINT,     RGBA8I,       RGBA8I,      SWIZ_XYZW, 16, 0),
+        FORMAT(R8G8B8A8_UINT,     RGBA8UI,      RGBA8UI,     SWIZ_XYZW, 16, 0),
+
+        FORMAT(R16_SINT,          R16I,         R16I,        SWIZ_X001, 16, 0),
+        FORMAT(R16_UINT,          R16UI,        R16UI,       SWIZ_X001, 16, 0),
+        FORMAT(R16G16_SINT,       RG16I,        RG16I,       SWIZ_XY01, 16, 0),
+        FORMAT(R16G16_UINT,       RG16UI,       RG16UI,      SWIZ_XY01, 16, 0),
+        FORMAT(R16G16B16A16_SINT, RGBA16I,      RGBA16I,     SWIZ_XYZW, 16, 0),
+        FORMAT(R16G16B16A16_UINT, RGBA16UI,     RGBA16UI,    SWIZ_XYZW, 16, 0),
+
+        FORMAT(R32_SINT,          R32I,         R32I,        SWIZ_X001, 32, 1),
+        FORMAT(R32_UINT,          R32UI,        R32UI,       SWIZ_X001, 32, 1),
+        FORMAT(R32G32_SINT,       RG32I,        RG32I,       SWIZ_XY01, 32, 2),
+        FORMAT(R32G32_UINT,       RG32UI,       RG32UI,      SWIZ_XY01, 32, 2),
+        FORMAT(R32G32B32A32_SINT, RGBA32I,      RGBA32I,     SWIZ_XYZW, 32, 4),
+        FORMAT(R32G32B32A32_UINT, RGBA32UI,     RGBA32UI,    SWIZ_XYZW, 32, 4),
+
+        FORMAT(A8_SINT,           R8I,          R8I,         SWIZ_000X, 16, 0),
+        FORMAT(A8_UINT,           R8UI,         R8UI,        SWIZ_000X, 16, 0),
+        FORMAT(A16_SINT,          R16I,         R16I,        SWIZ_000X, 16, 0),
+        FORMAT(A16_UINT,          R16UI,        R16UI,       SWIZ_000X, 16, 0),
+        FORMAT(A32_SINT,          R32I,         R32I,        SWIZ_000X, 32, 1),
+        FORMAT(A32_UINT,          R32UI,        R32UI,       SWIZ_000X, 32, 1),
+
+        FORMAT(R11G11B10_FLOAT,   R11F_G11F_B10F, R11F_G11F_B10F, SWIZ_XYZW, 16, 0),
+        FORMAT(R9G9B9E5_FLOAT,    NO,           RGB9_E5,     SWIZ_XYZW, 16, 0),
+
+#if V3D_VERSION >= 40
+        FORMAT(S8_UINT_Z24_UNORM, D24S8,        DEPTH24_X8,  SWIZ_XXXX, 32, 1),
+        FORMAT(X8Z24_UNORM,       D24S8,        DEPTH24_X8,  SWIZ_XXXX, 32, 1),
+        FORMAT(S8X24_UINT,        S8,           R32F,        SWIZ_XXXX, 32, 1),
+        FORMAT(Z32_FLOAT,         D32F,         R32F,        SWIZ_XXXX, 32, 1),
+        FORMAT(Z16_UNORM,         D16,          DEPTH_COMP16,SWIZ_XXXX, 32, 1),
+
+        /* Pretend we support this, but it'll be separate Z32F depth and S8. */
+        FORMAT(Z32_FLOAT_S8X24_UINT, D32F,      R32F,        SWIZ_XXXX, 32, 1),
+#else
+        FORMAT(S8_UINT_Z24_UNORM, ZS_DEPTH24_STENCIL8, DEPTH24_X8, SWIZ_XXXX, 32, 1),
+        FORMAT(X8Z24_UNORM,       ZS_DEPTH24_STENCIL8, DEPTH24_X8, SWIZ_XXXX, 32, 1),
+        FORMAT(S8X24_UINT,        NO,           R32F,        SWIZ_XXXX, 32, 1),
+        FORMAT(Z32_FLOAT,         ZS_DEPTH_COMPONENT32F, R32F, SWIZ_XXXX, 32, 1),
+        FORMAT(Z16_UNORM,         ZS_DEPTH_COMPONENT16,  DEPTH_COMP16, SWIZ_XXXX, 32, 1),
+
+        /* Pretend we support this, but it'll be separate Z32F depth and S8. */
+        FORMAT(Z32_FLOAT_S8X24_UINT, ZS_DEPTH_COMPONENT32F, R32F, SWIZ_XXXX, 32, 1),
+#endif
+
+        FORMAT(ETC2_RGB8,         NO,           RGB8_ETC2,   SWIZ_XYZ1, 16, 0),
+        FORMAT(ETC2_SRGB8,        NO,           RGB8_ETC2,   SWIZ_XYZ1, 16, 0),
+        FORMAT(ETC2_RGB8A1,       NO,           RGB8_PUNCHTHROUGH_ALPHA1, SWIZ_XYZW, 16, 0),
+        FORMAT(ETC2_SRGB8A1,      NO,           RGB8_PUNCHTHROUGH_ALPHA1, SWIZ_XYZW, 16, 0),
+        FORMAT(ETC2_RGBA8,        NO,           RGBA8_ETC2_EAC, SWIZ_XYZW, 16, 0),
+        FORMAT(ETC2_SRGBA8,       NO,           RGBA8_ETC2_EAC, SWIZ_XYZW, 16, 0),
+        FORMAT(ETC2_R11_UNORM,    NO,           R11_EAC,     SWIZ_X001, 16, 0),
+        FORMAT(ETC2_R11_SNORM,    NO,           SIGNED_R11_EAC, SWIZ_X001, 16, 0),
+        FORMAT(ETC2_RG11_UNORM,   NO,           RG11_EAC,    SWIZ_XY01, 16, 0),
+        FORMAT(ETC2_RG11_SNORM,   NO,           SIGNED_RG11_EAC, SWIZ_XY01, 16, 0),
+
+        /* DXT1_RGB has no alpha, so its alpha is forced to 1.  DXT3 and DXT5
+         * encode a real alpha channel, which has to be passed through.
+         */
+        FORMAT(DXT1_RGB,          NO,           BC1,         SWIZ_XYZ1, 16, 0),
+        FORMAT(DXT3_RGBA,         NO,           BC2,         SWIZ_XYZW, 16, 0),
+        FORMAT(DXT5_RGBA,         NO,           BC3,         SWIZ_XYZW, 16, 0),
+};
+
+/* Returns the format_table entry for pipe format f, or NULL when f is past
+ * the end of the table or its slot is not marked present.
+ */
+const struct vc5_format *
+v3dX(get_format_desc)(enum pipe_format f)
+{
+        /* Slots with no FORMAT() entry are zero-initialized, so their
+         * present flag reads as false.
+         */
+        if (f >= ARRAY_SIZE(format_table) || !format_table[f].present)
+                return NULL;
+
+        return &format_table[f];
+}
+
+/* Looks up the internal type and internal bpp that go with a
+ * V3D_OUTPUT_IMAGE_FORMAT_* value and stores them in *type and *bpp.
+ *
+ * Both outputs are always written; formats not listed below get
+ * V3D_INTERNAL_TYPE_8 / V3D_INTERNAL_BPP_32.
+ */
+void
+v3dX(get_internal_type_bpp_for_output_format)(uint32_t format,
+                                              uint32_t *type,
+                                              uint32_t *bpp)
+{
+        /* Fallback values.  We'll be called at RB creation time, even if an
+         * RB with this format isn't supported, so unlisted formats still
+         * need some answer.
+         */
+        uint32_t internal_type = V3D_INTERNAL_TYPE_8;
+        uint32_t internal_bpp = V3D_INTERNAL_BPP_32;
+
+        switch (format) {
+        case V3D_OUTPUT_IMAGE_FORMAT_RGBA8:
+#if V3D_VERSION < 41
+        case V3D_OUTPUT_IMAGE_FORMAT_RGBX8:
+#endif
+        case V3D_OUTPUT_IMAGE_FORMAT_RGB8:
+        case V3D_OUTPUT_IMAGE_FORMAT_RG8:
+        case V3D_OUTPUT_IMAGE_FORMAT_R8:
+        case V3D_OUTPUT_IMAGE_FORMAT_ABGR4444:
+        case V3D_OUTPUT_IMAGE_FORMAT_BGR565:
+        case V3D_OUTPUT_IMAGE_FORMAT_ABGR1555:
+                /* Same as the fallback: TYPE_8 at 32bpp. */
+                break;
+
+        case V3D_OUTPUT_IMAGE_FORMAT_RGBA8I:
+        case V3D_OUTPUT_IMAGE_FORMAT_RG8I:
+        case V3D_OUTPUT_IMAGE_FORMAT_R8I:
+                internal_type = V3D_INTERNAL_TYPE_8I;
+                internal_bpp = V3D_INTERNAL_BPP_32;
+                break;
+
+        case V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI:
+        case V3D_OUTPUT_IMAGE_FORMAT_RG8UI:
+        case V3D_OUTPUT_IMAGE_FORMAT_R8UI:
+                internal_type = V3D_INTERNAL_TYPE_8UI;
+                internal_bpp = V3D_INTERNAL_BPP_32;
+                break;
+
+        case V3D_OUTPUT_IMAGE_FORMAT_SRGB8_ALPHA8:
+        case V3D_OUTPUT_IMAGE_FORMAT_SRGB:
+        case V3D_OUTPUT_IMAGE_FORMAT_RGB10_A2:
+        case V3D_OUTPUT_IMAGE_FORMAT_R11F_G11F_B10F:
+#if V3D_VERSION < 41
+        case V3D_OUTPUT_IMAGE_FORMAT_SRGBX8:
+#endif
+        case V3D_OUTPUT_IMAGE_FORMAT_RGBA16F:
+        case V3D_OUTPUT_IMAGE_FORMAT_RG16F:
+        case V3D_OUTPUT_IMAGE_FORMAT_R16F:
+                /* Note that sRGB RTs are stored in the tile buffer at 16F,
+                 * and the conversion to sRGB happens at tilebuffer
+                 * load/store.
+                 *
+                 * RG16F and R16F also use 64bpp, to make sure the TLB
+                 * doesn't throw away the alpha channel before alpha test
+                 * happens.
+                 */
+                internal_type = V3D_INTERNAL_TYPE_16F;
+                internal_bpp = V3D_INTERNAL_BPP_64;
+                break;
+
+        case V3D_OUTPUT_IMAGE_FORMAT_RGBA16I:
+                internal_type = V3D_INTERNAL_TYPE_16I;
+                internal_bpp = V3D_INTERNAL_BPP_64;
+                break;
+        case V3D_OUTPUT_IMAGE_FORMAT_RG16I:
+        case V3D_OUTPUT_IMAGE_FORMAT_R16I:
+                internal_type = V3D_INTERNAL_TYPE_16I;
+                internal_bpp = V3D_INTERNAL_BPP_32;
+                break;
+
+        case V3D_OUTPUT_IMAGE_FORMAT_RGB10_A2UI:
+        case V3D_OUTPUT_IMAGE_FORMAT_RGBA16UI:
+                internal_type = V3D_INTERNAL_TYPE_16UI;
+                internal_bpp = V3D_INTERNAL_BPP_64;
+                break;
+        case V3D_OUTPUT_IMAGE_FORMAT_RG16UI:
+        case V3D_OUTPUT_IMAGE_FORMAT_R16UI:
+                internal_type = V3D_INTERNAL_TYPE_16UI;
+                internal_bpp = V3D_INTERNAL_BPP_32;
+                break;
+
+        case V3D_OUTPUT_IMAGE_FORMAT_RGBA32I:
+                internal_type = V3D_INTERNAL_TYPE_32I;
+                internal_bpp = V3D_INTERNAL_BPP_128;
+                break;
+        case V3D_OUTPUT_IMAGE_FORMAT_RG32I:
+                internal_type = V3D_INTERNAL_TYPE_32I;
+                internal_bpp = V3D_INTERNAL_BPP_64;
+                break;
+        case V3D_OUTPUT_IMAGE_FORMAT_R32I:
+                internal_type = V3D_INTERNAL_TYPE_32I;
+                internal_bpp = V3D_INTERNAL_BPP_32;
+                break;
+
+        case V3D_OUTPUT_IMAGE_FORMAT_RGBA32UI:
+                internal_type = V3D_INTERNAL_TYPE_32UI;
+                internal_bpp = V3D_INTERNAL_BPP_128;
+                break;
+        case V3D_OUTPUT_IMAGE_FORMAT_RG32UI:
+                internal_type = V3D_INTERNAL_TYPE_32UI;
+                internal_bpp = V3D_INTERNAL_BPP_64;
+                break;
+        case V3D_OUTPUT_IMAGE_FORMAT_R32UI:
+                internal_type = V3D_INTERNAL_TYPE_32UI;
+                internal_bpp = V3D_INTERNAL_BPP_32;
+                break;
+
+        case V3D_OUTPUT_IMAGE_FORMAT_RGBA32F:
+                internal_type = V3D_INTERNAL_TYPE_32F;
+                internal_bpp = V3D_INTERNAL_BPP_128;
+                break;
+        case V3D_OUTPUT_IMAGE_FORMAT_RG32F:
+                internal_type = V3D_INTERNAL_TYPE_32F;
+                internal_bpp = V3D_INTERNAL_BPP_64;
+                break;
+        case V3D_OUTPUT_IMAGE_FORMAT_R32F:
+                internal_type = V3D_INTERNAL_TYPE_32F;
+                internal_bpp = V3D_INTERNAL_BPP_32;
+                break;
+
+        default:
+                /* Keep the fallback values set above. */
+                break;
+        }
+
+        *type = internal_type;
+        *bpp = internal_bpp;
+}
diff --git a/src/gallium/drivers/v3d/v3dx_job.c b/src/gallium/drivers/v3d/v3dx_job.c
new file mode 100644
index 00000000000..5e1a345b170
--- /dev/null
+++ b/src/gallium/drivers/v3d/v3dx_job.c
@@ -0,0 +1,76 @@
+/*
+ * Copyright © 2014-2017 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file v3dx_job.c
+ *
+ * V3D version-specific functions for submitting VC5 render jobs to the
+ * kernel.
+ */
+
+#include "v3d_context.h"
+#include "broadcom/cle/v3dx_pack.h"
+
+/* Appends the end-of-job packets to job->bcl: optional resets of the
+ * occlusion query counter and transform feedback, then the semaphore
+ * increment and the final state flush.
+ */
+void v3dX(bcl_epilogue)(struct vc5_context *vc5, struct vc5_job *job)
+{
+        /* Total length of every packet that may be emitted below, so that
+         * the space for all of them is reserved in one call.
+         */
+        unsigned epilogue_size = (cl_packet_length(OCCLUSION_QUERY_COUNTER) +
+                                  cl_packet_length(INCREMENT_SEMAPHORE) +
+                                  cl_packet_length(FLUSH_ALL_STATE));
+#if V3D_VERSION >= 41
+        epilogue_size += cl_packet_length(TRANSFORM_FEEDBACK_SPECS);
+#endif
+
+        vc5_cl_ensure_space_with_branch(&job->bcl, epilogue_size);
+
+        /* Disable the OQ at the end of the CL, so that the draw calls at the
+         * start of the CL don't inherit the OQ counter.
+         */
+        if (job->oq_enabled)
+                cl_emit(&job->bcl, OCCLUSION_QUERY_COUNTER, counter);
+
+#if V3D_VERSION >= 41
+        /* Disable TF at the end of the CL, so that the next job to be run
+         * doesn't start out trying to write TF primitives.  On V3D 3.x, it's
+         * only the TF primitive mode that triggers TF writes.
+         */
+        if (job->tf_enabled) {
+                cl_emit(&job->bcl, TRANSFORM_FEEDBACK_SPECS, tfe) {
+                        tfe.enable = false;
+                };
+        }
+#endif /* V3D_VERSION >= 41 */
+
+        /* Increment the semaphore indicating that binning is done and
+         * unblocking the render thread.  Note that this doesn't act until
+         * the FLUSH completes.
+         */
+        cl_emit(&job->bcl, INCREMENT_SEMAPHORE, incr);
+
+        /* The FLUSH_ALL emits any unwritten state changes in each tile.  We
+         * can use this to reset any state that needs to be present at the
+         * start of the next tile, as we do with OCCLUSION_QUERY_COUNTER
+         * above.
+         */
+        cl_emit(&job->bcl, FLUSH_ALL_STATE, flush);
+}
diff --git a/src/gallium/drivers/v3d/v3dx_rcl.c b/src/gallium/drivers/v3d/v3dx_rcl.c
new file mode 100644
index 00000000000..3801d03ecee
--- /dev/null
+++ b/src/gallium/drivers/v3d/v3dx_rcl.c
@@ -0,0 +1,782 @@
+/*
+ * Copyright © 2017 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "util/u_format.h"
+#include "v3d_context.h"
+#include "v3d_tiling.h"
+#include "broadcom/common/v3d_macros.h"
+#include "broadcom/cle/v3dx_pack.h"
+
+/* Mask covering the clear/resolve bits of color buffers 0 through 3. */
+#define PIPE_CLEAR_COLOR_BUFFERS (PIPE_CLEAR_COLOR0 | PIPE_CLEAR_COLOR1 | \
+                                  PIPE_CLEAR_COLOR2 | PIPE_CLEAR_COLOR3)
+
+/* Zero-based index of the PIPE_CLEAR_COLOR0 bit (ffs() counts from one). */
+#define PIPE_FIRST_COLOR_BUFFER_BIT (ffs(PIPE_CLEAR_COLOR0) - 1)
+
+/* The HW holds a queued load until tile coordinates arrive, and it can only
+ * hold one of them.  Whenever another load is going to follow, push the
+ * queued one through by emitting tile coordinates plus a dummy (NONE) store.
+ * Nothing is emitted when building for V3D 4.0 and later.
+ */
+static void
+flush_last_load(struct vc5_cl *cl)
+{
+        if (V3D_VERSION < 40) {
+                /* Tile coordinates first, then the dummy store. */
+                cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);
+                cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
+                        store.buffer_to_store = NONE;
+                }
+        }
+}
+
+/* Emits one LOAD_TILE_BUFFER_GENERAL for psurf (or its separate stencil),
+ * then clears pipe_bit in *loads_pending and flushes if loads remain. */
+static void
+load_general(struct vc5_cl *cl, struct pipe_surface *psurf, int buffer,
+             uint32_t pipe_bit, uint32_t *loads_pending)
+{
+        struct vc5_surface *surf = vc5_surface(psurf);
+        bool separate_stencil = surf->separate_stencil && buffer == STENCIL;
+        if (separate_stencil) {
+                psurf = surf->separate_stencil;
+                surf = vc5_surface(psurf);
+        }
+
+        struct vc5_resource *rsc = vc5_resource(psurf->texture);
+
+        cl_emit(cl, LOAD_TILE_BUFFER_GENERAL, load) {
+                load.buffer_to_load = buffer;
+                load.address = cl_address(rsc->bo, surf->offset);
+#if V3D_VERSION >= 40
+                load.memory_format = surf->tiling;
+                if (separate_stencil)
+                        load.input_image_format = V3D_OUTPUT_IMAGE_FORMAT_S8;
+                else
+                        load.input_image_format = surf->format;
+                /* UIF: padded height in UIF blocks; raster: slice stride. */
+                if (surf->tiling == VC5_TILING_UIF_NO_XOR ||
+                    surf->tiling == VC5_TILING_UIF_XOR) {
+                        load.height_in_ub_or_stride =
+                                surf->padded_height_of_output_image_in_uif_blocks;
+                } else if (surf->tiling == VC5_TILING_RASTER) {
+                        struct vc5_resource_slice *slice =
+                                &rsc->slices[psurf->u.tex.level];
+                        load.height_in_ub_or_stride = slice->stride;
+                }
+                /* Multisampled textures load all samples, others sample 0. */
+                if (psurf->texture->nr_samples > 1)
+                        load.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES;
+                else
+                        load.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0;
+#else /* V3D_VERSION < 40 */
+                /* Can't do raw ZSTENCIL loads -- need to load/store them to
+                 * separate buffers for Z and stencil.
+                 */
+                assert(buffer != ZSTENCIL);
+                load.raw_mode = true;
+                load.padded_height_of_output_image_in_uif_blocks =
+                        surf->padded_height_of_output_image_in_uif_blocks;
+#endif /* V3D_VERSION < 40 */
+        }
+        /* This load is no longer pending; flush it if more will follow. */
+        *loads_pending &= ~pipe_bit;
+        if (*loads_pending)
+                flush_last_load(cl);
+}
+
+/* Emits one STORE_TILE_BUFFER_GENERAL for psurf (or its separate stencil),
+ * clearing pipe_bit in *stores_pending and counting a write on its resource.
+ * general_color_clear lets color buffer clears be flagged on this packet. */
+static void
+store_general(struct vc5_job *job,
+              struct vc5_cl *cl, struct pipe_surface *psurf, int buffer,
+              int pipe_bit, uint32_t *stores_pending, bool general_color_clear)
+{
+        struct vc5_surface *surf = vc5_surface(psurf);
+        bool separate_stencil = surf->separate_stencil && buffer == STENCIL;
+        if (separate_stencil) {
+                psurf = surf->separate_stencil;
+                surf = vc5_surface(psurf);
+        }
+
+        *stores_pending &= ~pipe_bit;
+        bool last_store = !(*stores_pending);
+
+        struct vc5_resource *rsc = vc5_resource(psurf->texture);
+        rsc->writes++;
+
+        cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
+                store.buffer_to_store = buffer;
+                store.address = cl_address(rsc->bo, surf->offset);
+#if V3D_VERSION >= 40
+                store.clear_buffer_being_stored =
+                        ((job->cleared & pipe_bit) &&
+                         (general_color_clear ||
+                          !(pipe_bit & PIPE_CLEAR_COLOR_BUFFERS)));
+
+                if (separate_stencil)
+                        store.output_image_format = V3D_OUTPUT_IMAGE_FORMAT_S8;
+                else
+                        store.output_image_format = surf->format;
+
+                store.memory_format = surf->tiling;
+                /* UIF: padded height in UIF blocks; raster: slice stride. */
+                if (surf->tiling == VC5_TILING_UIF_NO_XOR ||
+                    surf->tiling == VC5_TILING_UIF_XOR) {
+                        store.height_in_ub_or_stride =
+                                surf->padded_height_of_output_image_in_uif_blocks;
+                } else if (surf->tiling == VC5_TILING_RASTER) {
+                        struct vc5_resource_slice *slice =
+                                &rsc->slices[psurf->u.tex.level];
+                        store.height_in_ub_or_stride = slice->stride;
+                }
+                /* Multisampled textures store all samples, others sample 0. */
+                if (psurf->texture->nr_samples > 1)
+                        store.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES;
+                else
+                        store.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0;
+#else /* V3D_VERSION < 40 */
+                /* Can't do raw ZSTENCIL stores -- need to load/store them to
+                 * separate buffers for Z and stencil.
+                 */
+                assert(buffer != ZSTENCIL);
+                store.raw_mode = true;
+                if (!last_store) {
+                        store.disable_colour_buffers_clear_on_write = true;
+                        store.disable_z_buffer_clear_on_write = true;
+                        store.disable_stencil_buffer_clear_on_write = true;
+                } else {
+                        store.disable_colour_buffers_clear_on_write =
+                                !(((pipe_bit & PIPE_CLEAR_COLOR_BUFFERS) &&
+                                   general_color_clear &&
+                                   (job->cleared & pipe_bit)));
+                        store.disable_z_buffer_clear_on_write =
+                                !(job->cleared & PIPE_CLEAR_DEPTH);
+                        store.disable_stencil_buffer_clear_on_write =
+                                !(job->cleared & PIPE_CLEAR_STENCIL);
+                }
+                store.padded_height_of_output_image_in_uif_blocks =
+                        surf->padded_height_of_output_image_in_uif_blocks;
+#endif /* V3D_VERSION < 40 */
+        }
+
+        /* There must be a TILE_COORDINATES_IMPLICIT between each store. */
+        if (V3D_VERSION < 40 && !last_store) {
+                cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);
+        }
+}
+
+/* Map the depth/stencil bits of a PIPE_CLEAR_* mask to a buffer selector. */
+static int
+zs_buffer_from_pipe_bits(int pipe_clear_bits)
+{
+        const int zs_bits = pipe_clear_bits & PIPE_CLEAR_DEPTHSTENCIL;
+
+        if (zs_bits == PIPE_CLEAR_DEPTHSTENCIL)
+                return ZSTENCIL;
+        if (zs_bits == PIPE_CLEAR_DEPTH)
+                return Z;
+        if (zs_bits == PIPE_CLEAR_STENCIL)
+                return STENCIL;
+        return NONE;
+}
+
+/* Emits the per-tile loads for every buffer the job resolves but did not
+ * clear; on V3D 4.x the loads are terminated with END_OF_LOADS. */
+static void
+vc5_rcl_emit_loads(struct vc5_job *job, struct vc5_cl *cl)
+{
+        uint32_t loads_pending = job->resolve & ~job->cleared;
+        /* Color buffers; 3.x only uses general loads for MSAA surfaces. */
+        for (int i = 0; i < VC5_MAX_DRAW_BUFFERS; i++) {
+                uint32_t bit = PIPE_CLEAR_COLOR0 << i;
+                if (!(loads_pending & bit))
+                        continue;
+                struct pipe_surface *psurf = job->cbufs[i];
+                if (!psurf || (V3D_VERSION < 40 &&
+                               psurf->texture->nr_samples <= 1)) {
+                        continue;
+                }
+
+                load_general(cl, psurf, RENDER_TARGET_0 + i,
+                             bit, &loads_pending);
+        }
+        /* Depth/stencil; again 3.x only does this for MSAA surfaces. */
+        if ((loads_pending & PIPE_CLEAR_DEPTHSTENCIL) &&
+            (V3D_VERSION >= 40 ||
+             (job->zsbuf && job->zsbuf->texture->nr_samples > 1))) {
+                struct vc5_resource *rsc = vc5_resource(job->zsbuf->texture);
+                if (rsc->separate_stencil &&
+                    (loads_pending & PIPE_CLEAR_STENCIL)) {
+                        load_general(cl, job->zsbuf,
+                                     STENCIL,
+                                     PIPE_CLEAR_STENCIL,
+                                     &loads_pending);
+                }
+                /* Whatever Z/S bits remain pending go in a single load. */
+                if (loads_pending & PIPE_CLEAR_DEPTHSTENCIL) {
+                        load_general(cl, job->zsbuf,
+                                     zs_buffer_from_pipe_bits(loads_pending),
+                                     loads_pending & PIPE_CLEAR_DEPTHSTENCIL,
+                                     &loads_pending);
+                }
+        }
+
+#if V3D_VERSION < 40
+        /* The initial reload will be queued until we get the
+         * tile coordinates.
+         */
+        if (loads_pending) {
+                cl_emit(cl, RELOAD_TILE_COLOUR_BUFFER, load) {
+                        load.disable_colour_buffer_load =
+                                (~loads_pending &
+                                 PIPE_CLEAR_COLOR_BUFFERS) >>
+                                PIPE_FIRST_COLOR_BUFFER_BIT;
+                        load.enable_z_load =
+                                loads_pending & PIPE_CLEAR_DEPTH;
+                        load.enable_stencil_load =
+                                loads_pending & PIPE_CLEAR_STENCIL;
+                }
+        }
+#else /* V3D_VERSION >= 40 */
+        assert(!loads_pending);
+        cl_emit(cl, END_OF_LOADS, end);
+#endif
+}
+
+/* Emits the per-tile stores for every buffer in job->resolve, plus any TLB
+ * color clear that could not be folded into those store packets. */
+static void
+vc5_rcl_emit_stores(struct vc5_job *job, struct vc5_cl *cl)
+{
+        MAYBE_UNUSED bool needs_color_clear = job->cleared & PIPE_CLEAR_COLOR_BUFFERS;
+        MAYBE_UNUSED bool needs_z_clear = job->cleared & PIPE_CLEAR_DEPTH;
+        MAYBE_UNUSED bool needs_s_clear = job->cleared & PIPE_CLEAR_STENCIL;
+
+        /* For clearing color in a TLB general on V3D 3.3:
+         *
+         * - NONE buffer store clears all TLB color buffers.
+         * - color buffer store clears just the TLB color buffer being stored.
+         * - Z/S buffers store may not clear the TLB color buffer.
+         *
+         * And on V3D 4.1, we only have one flag for "clear the buffer being
+         * stored" in the general packet, and a separate packet to clear all
+         * color TLB buffers.
+         *
+         * As a result, we only bother flagging TLB color clears in a general
+         * packet when we don't have to emit a separate packet to clear all
+         * TLB color buffers.
+         */
+        bool general_color_clear = (needs_color_clear &&
+                                    (job->cleared & PIPE_CLEAR_COLOR_BUFFERS) ==
+                                    (job->resolve & PIPE_CLEAR_COLOR_BUFFERS));
+
+        uint32_t stores_pending = job->resolve;
+
+        /* For V3D 4.1, use general stores for all TLB stores.
+         *
+         * For V3D 3.3, we only use general stores to do raw stores for any
+         * MSAA surfaces.  These output UIF tiled images where each 4x MSAA
+         * pixel is a 2x2 quad, and the format will be that of the
+         * internal_type/internal_bpp, rather than the format from GL's
+         * perspective.  Non-MSAA surfaces will use
+         * STORE_MULTI_SAMPLE_RESOLVED_TILE_COLOR_BUFFER_EXTENDED.
+         */
+        for (int i = 0; i < VC5_MAX_DRAW_BUFFERS; i++) {
+                uint32_t bit = PIPE_CLEAR_COLOR0 << i;
+                if (!(job->resolve & bit))
+                        continue;
+                struct pipe_surface *psurf = job->cbufs[i];
+                if (!psurf ||
+                    (V3D_VERSION < 40 && psurf->texture->nr_samples <= 1)) {
+                        continue;
+                }
+
+                store_general(job, cl, psurf, RENDER_TARGET_0 + i, bit,
+                              &stores_pending, general_color_clear);
+        }
+        /* Z/S: two stores when stencil lives in its own resource, else one. */
+        if (job->resolve & PIPE_CLEAR_DEPTHSTENCIL && job->zsbuf &&
+            !(V3D_VERSION < 40 && job->zsbuf->texture->nr_samples <= 1)) {
+                struct vc5_resource *rsc = vc5_resource(job->zsbuf->texture);
+                if (rsc->separate_stencil) {
+                        if (job->resolve & PIPE_CLEAR_DEPTH) {
+                                store_general(job, cl, job->zsbuf, Z,
+                                              PIPE_CLEAR_DEPTH,
+                                              &stores_pending,
+                                              general_color_clear);
+                        }
+
+                        if (job->resolve & PIPE_CLEAR_STENCIL) {
+                                store_general(job, cl, job->zsbuf, STENCIL,
+                                              PIPE_CLEAR_STENCIL,
+                                              &stores_pending,
+                                              general_color_clear);
+                        }
+                } else {
+                        store_general(job, cl, job->zsbuf,
+                                      zs_buffer_from_pipe_bits(job->resolve),
+                                      job->resolve & PIPE_CLEAR_DEPTHSTENCIL,
+                                      &stores_pending, general_color_clear);
+                }
+        }
+        /* Anything not handled above goes out in the 3.x resolved store. */
+        if (stores_pending) {
+#if V3D_VERSION < 40
+                cl_emit(cl, STORE_MULTI_SAMPLE_RESOLVED_TILE_COLOR_BUFFER_EXTENDED, store) {
+                        store.disable_color_buffer_write =
+                                (~stores_pending >>
+                                 PIPE_FIRST_COLOR_BUFFER_BIT) & 0xf;
+                        store.enable_z_write = stores_pending & PIPE_CLEAR_DEPTH;
+                        store.enable_stencil_write = stores_pending & PIPE_CLEAR_STENCIL;
+
+                        /* Note that when set this will clear all of the color
+                         * buffers.
+                         */
+                        store.disable_colour_buffers_clear_on_write =
+                                !needs_color_clear;
+                        store.disable_z_buffer_clear_on_write =
+                                !needs_z_clear;
+                        store.disable_stencil_buffer_clear_on_write =
+                                !needs_s_clear;
+                };
+#else /* V3D_VERSION >= 40 */
+                unreachable("All color buffers should have been stored.");
+#endif /* V3D_VERSION >= 40 */
+        } else if (needs_color_clear && !general_color_clear) {
+                /* If we didn't do our color clears in the general packet,
+                 * then emit a packet to clear all the TLB color buffers now.
+                 */
+#if V3D_VERSION < 40
+                cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
+                        store.buffer_to_store = NONE;
+                }
+#else /* V3D_VERSION >= 40 */
+                cl_emit(cl, CLEAR_TILE_BUFFERS, clear) {
+                        clear.clear_all_render_targets = true;
+                }
+#endif /* V3D_VERSION >= 40 */
+        }
+}
+
+/* Builds the generic per-tile command list in job->indirect and points the
+ * RCL at it.  The last_cbuf argument is not used. */
+static void
+vc5_rcl_emit_generic_per_tile_list(struct vc5_job *job, int last_cbuf)
+{
+        /* Emit the generic list in our indirect state -- the rcl will just
+         * have pointers into it.
+         */
+        struct vc5_cl *cl = &job->indirect;
+        vc5_cl_ensure_space(cl, 200, 1);
+        struct vc5_cl_reloc tile_list_start = cl_get_address(cl);
+
+        if (V3D_VERSION >= 40) {
+                /* V3D 4.x only requires a single tile coordinates, and
+                 * END_OF_LOADS switches us between loading and rendering.
+                 */
+                cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);
+        }
+
+        vc5_rcl_emit_loads(job, cl);
+
+        if (V3D_VERSION < 40) {
+                /* Tile Coordinates triggers the last reload and sets where
+                 * the stores go. There must be one per store packet.
+                 */
+                cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);
+        }
+
+        /* The binner starts out writing tiles assuming that the initial mode
+         * is triangles, so make sure that's the case.
+         */
+        cl_emit(cl, PRIMITIVE_LIST_FORMAT, fmt) {
+                fmt.data_type = LIST_INDEXED;
+                fmt.primitive_type = LIST_TRIANGLES;
+        }
+
+        cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);
+
+        vc5_rcl_emit_stores(job, cl);
+#if V3D_VERSION >= 40
+        cl_emit(cl, END_OF_TILE_MARKER, end);
+#endif
+        cl_emit(cl, RETURN_FROM_SUB_LIST, ret);
+        /* Hand the RCL the start and end addresses of the list just built. */
+        cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
+                branch.start = tile_list_start;
+                branch.end = cl_get_address(cl);
+        }
+}
+
+#if V3D_VERSION >= 40
+/* Fills in the render target config outputs for cbuf, if it is bound. */
+static void
+v3d_setup_render_target(struct vc5_job *job, int cbuf,
+                        uint32_t *rt_bpp, uint32_t *rt_type, uint32_t *rt_clamp)
+{
+        if (job->cbufs[cbuf]) {
+                struct vc5_surface *rt = vc5_surface(job->cbufs[cbuf]);
+                *rt_bpp = rt->internal_bpp;
+                *rt_type = rt->internal_type;
+                *rt_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
+        }
+}
+
+#else /* V3D_VERSION < 40 */
+
+/* Emits the 3.x Z/stencil config packet for surf, or the separate-stencil
+ * variant (z_stencil_id 1) when is_separate_stencil is set. */
+static void
+v3d_emit_z_stencil_config(struct vc5_job *job, struct vc5_surface *surf,
+                          struct vc5_resource *rsc, bool is_separate_stencil)
+{
+        cl_emit(&job->rcl, TILE_RENDERING_MODE_CONFIGURATION_Z_STENCIL_CONFIG, zs) {
+                zs.address = cl_address(rsc->bo, surf->offset);
+                if (!is_separate_stencil) {
+                        zs.internal_type = surf->internal_type;
+                        zs.output_image_format = surf->format;
+                } else {
+                        zs.z_stencil_id = 1; /* Separate stencil */
+                }
+                zs.padded_height_of_output_image_in_uif_blocks =
+                        surf->padded_height_of_output_image_in_uif_blocks;
+
+                assert(surf->tiling != VC5_TILING_RASTER);
+                zs.memory_format = surf->tiling;
+        }
+        /* Count a resource write when the job resolves into this buffer. */
+        if (job->resolve & (is_separate_stencil ?
+                            PIPE_CLEAR_STENCIL :
+                            PIPE_CLEAR_DEPTHSTENCIL)) {
+                rsc->writes++;
+        }
+}
+#endif /* V3D_VERSION < 40 */
+
+#define div_round_up(a, b) (((a) + (b) - 1) / (b)) /* ceil(a / b) */
+
+/* Builds job's RCL: rendering mode config, the initial tile buffer clear, the
+ * generic per-tile list, and the coordinates of each supertile to render. */
+void
+v3dX(emit_rcl)(struct vc5_job *job)
+{
+        /* The RCL list should be empty. */
+        assert(!job->rcl.bo);
+
+        vc5_cl_ensure_space_with_branch(&job->rcl, 200 + 256 *
+                                        cl_packet_length(SUPERTILE_COORDINATES));
+        job->submit.rcl_start = job->rcl.bo->offset;
+        vc5_job_add_bo(job, job->rcl.bo);
+        /* nr_cbufs is one past the highest bound color buffer (gaps allowed). */
+        int nr_cbufs = 0;
+        for (int i = 0; i < VC5_MAX_DRAW_BUFFERS; i++) {
+                if (job->cbufs[i])
+                        nr_cbufs = i + 1;
+        }
+
+        /* Common config must be the first TILE_RENDERING_MODE_CONFIGURATION
+         * and Z_STENCIL_CLEAR_VALUES must be last.  The ones in between are
+         * optional updates to the previous HW state.
+         */
+        cl_emit(&job->rcl, TILE_RENDERING_MODE_CONFIGURATION_COMMON_CONFIGURATION,
+                config) {
+#if V3D_VERSION < 40
+                config.enable_z_store = job->resolve & PIPE_CLEAR_DEPTH;
+                config.enable_stencil_store = job->resolve & PIPE_CLEAR_STENCIL;
+#else /* V3D_VERSION >= 40 */
+                if (job->zsbuf) {
+                        struct vc5_surface *surf = vc5_surface(job->zsbuf);
+                        config.internal_depth_type = surf->internal_type;
+                }
+#endif /* V3D_VERSION >= 40 */
+
+                /* XXX: Early D/S clear */
+
+                switch (job->first_ez_state) {
+                case VC5_EZ_UNDECIDED:
+                case VC5_EZ_LT_LE:
+                        config.early_z_disable = false;
+                        config.early_z_test_and_update_direction =
+                                EARLY_Z_DIRECTION_LT_LE;
+                        break;
+                case VC5_EZ_GT_GE:
+                        config.early_z_disable = false;
+                        config.early_z_test_and_update_direction =
+                                EARLY_Z_DIRECTION_GT_GE;
+                        break;
+                case VC5_EZ_DISABLED:
+                        config.early_z_disable = true;
+                }
+
+                config.image_width_pixels = job->draw_width;
+                config.image_height_pixels = job->draw_height;
+
+                config.number_of_render_targets_minus_1 =
+                        MAX2(nr_cbufs, 1) - 1;
+                config.multisample_mode_4x = job->msaa;
+                config.maximum_bpp_of_all_render_targets = job->internal_bpp;
+        }
+        /* Per color buffer: RT config (3.x only) and its clear color packets. */
+        for (int i = 0; i < nr_cbufs; i++) {
+                struct pipe_surface *psurf = job->cbufs[i];
+                if (!psurf)
+                        continue;
+                struct vc5_surface *surf = vc5_surface(psurf);
+                struct vc5_resource *rsc = vc5_resource(psurf->texture);
+
+                MAYBE_UNUSED uint32_t config_pad = 0;
+                uint32_t clear_pad = 0;
+
+                /* XXX: Set the pad for raster. */
+                if (surf->tiling == VC5_TILING_UIF_NO_XOR ||
+                    surf->tiling == VC5_TILING_UIF_XOR) {
+                        int uif_block_height = vc5_utile_height(rsc->cpp) * 2;
+                        uint32_t implicit_padded_height = (align(job->draw_height, uif_block_height) /
+                                                           uif_block_height);
+                        if (surf->padded_height_of_output_image_in_uif_blocks -
+                            implicit_padded_height < 15) {
+                                config_pad = (surf->padded_height_of_output_image_in_uif_blocks -
+                                              implicit_padded_height);
+                        } else {
+                                config_pad = 15;
+                                clear_pad = surf->padded_height_of_output_image_in_uif_blocks;
+                        }
+                }
+
+#if V3D_VERSION < 40
+                cl_emit(&job->rcl, TILE_RENDERING_MODE_CONFIGURATION_RENDER_TARGET_CONFIG, rt) {
+                        rt.address = cl_address(rsc->bo, surf->offset);
+                        rt.internal_type = surf->internal_type;
+                        rt.output_image_format = surf->format;
+                        rt.memory_format = surf->tiling;
+                        rt.internal_bpp = surf->internal_bpp;
+                        rt.render_target_number = i;
+                        rt.pad = config_pad;
+
+                        if (job->resolve & PIPE_CLEAR_COLOR0 << i)
+                                rsc->writes++;
+                }
+#endif /* V3D_VERSION < 40 */
+
+                cl_emit(&job->rcl, TILE_RENDERING_MODE_CONFIGURATION_CLEAR_COLORS_PART1,
+                        clear) {
+                        clear.clear_color_low_32_bits = job->clear_color[i][0];
+                        clear.clear_color_next_24_bits = job->clear_color[i][1] & 0xffffff;
+                        clear.render_target_number = i;
+                };
+
+                if (surf->internal_bpp >= V3D_INTERNAL_BPP_64) {
+                        cl_emit(&job->rcl, TILE_RENDERING_MODE_CONFIGURATION_CLEAR_COLORS_PART2,
+                                clear) {
+                                clear.clear_color_mid_low_32_bits =
+                                        ((job->clear_color[i][1] >> 24) |
+                                         (job->clear_color[i][2] << 8));
+                                clear.clear_color_mid_high_24_bits =
+                                        ((job->clear_color[i][2] >> 24) |
+                                         ((job->clear_color[i][3] & 0xffff) << 8));
+                                clear.render_target_number = i;
+                        };
+                }
+
+                if (surf->internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) {
+                        cl_emit(&job->rcl, TILE_RENDERING_MODE_CONFIGURATION_CLEAR_COLORS_PART3,
+                                clear) {
+                                clear.uif_padded_height_in_uif_blocks = clear_pad;
+                                clear.clear_color_high_16_bits = job->clear_color[i][3] >> 16;
+                                clear.render_target_number = i;
+                        };
+                }
+        }
+
+#if V3D_VERSION >= 40
+        cl_emit(&job->rcl, TILE_RENDERING_MODE_CONFIGURATION_RENDER_TARGET_CONFIG, rt) {
+                v3d_setup_render_target(job, 0,
+                                        &rt.render_target_0_internal_bpp,
+                                        &rt.render_target_0_internal_type,
+                                        &rt.render_target_0_clamp);
+                v3d_setup_render_target(job, 1,
+                                        &rt.render_target_1_internal_bpp,
+                                        &rt.render_target_1_internal_type,
+                                        &rt.render_target_1_clamp);
+                v3d_setup_render_target(job, 2,
+                                        &rt.render_target_2_internal_bpp,
+                                        &rt.render_target_2_internal_type,
+                                        &rt.render_target_2_clamp);
+                v3d_setup_render_target(job, 3,
+                                        &rt.render_target_3_internal_bpp,
+                                        &rt.render_target_3_internal_type,
+                                        &rt.render_target_3_clamp);
+        }
+#endif
+
+#if V3D_VERSION < 40
+        /* TODO: Don't bother emitting if we don't load/clear Z/S. */
+        if (job->zsbuf) {
+                struct pipe_surface *psurf = job->zsbuf;
+                struct vc5_surface *surf = vc5_surface(psurf);
+                struct vc5_resource *rsc = vc5_resource(psurf->texture);
+
+                v3d_emit_z_stencil_config(job, surf, rsc, false);
+
+                /* Emit the separate stencil packet if we have a resource for
+                 * it.  The HW will only load/store this buffer if the
+                 * Z/Stencil config doesn't have stencil in its format.
+                 */
+                if (surf->separate_stencil) {
+                        v3d_emit_z_stencil_config(job,
+                                                  vc5_surface(surf->separate_stencil),
+                                                  rsc->separate_stencil, true);
+                }
+        }
+#endif /* V3D_VERSION < 40 */
+
+        /* Ends rendering mode config. */
+        cl_emit(&job->rcl, TILE_RENDERING_MODE_CONFIGURATION_Z_STENCIL_CLEAR_VALUES,
+                clear) {
+                clear.z_clear_value = job->clear_z;
+                clear.stencil_vg_mask_clear_value = job->clear_s;
+        };
+
+        /* Always set initial block size before the first branch, which needs
+         * to match the value from binning mode config.
+         */
+        cl_emit(&job->rcl, TILE_LIST_INITIAL_BLOCK_SIZE, init) {
+                init.use_auto_chained_tile_lists = true;
+                init.size_of_first_block_in_chained_tile_lists =
+                        TILE_ALLOCATION_BLOCK_SIZE_64B;
+        }
+
+        uint32_t supertile_w = 1, supertile_h = 1;
+
+        /* If doing multicore binning, we would need to initialize each core's
+         * tile list here.
+         */
+        cl_emit(&job->rcl, MULTICORE_RENDERING_TILE_LIST_SET_BASE, list) {
+                list.address = cl_address(job->tile_alloc, 0);
+        }
+
+        cl_emit(&job->rcl, MULTICORE_RENDERING_SUPERTILE_CONFIGURATION, config) {
+                uint32_t frame_w_in_supertiles, frame_h_in_supertiles;
+                const uint32_t max_supertiles = 256;
+
+                /* Size up our supertiles until we get under the limit. */
+                for (;;) {
+                        frame_w_in_supertiles = div_round_up(job->draw_tiles_x,
+                                                             supertile_w);
+                        frame_h_in_supertiles = div_round_up(job->draw_tiles_y,
+                                                             supertile_h);
+                        if (frame_w_in_supertiles * frame_h_in_supertiles <
+                            max_supertiles) {
+                                break;
+                        }
+
+                        if (supertile_w < supertile_h)
+                                supertile_w++;
+                        else
+                                supertile_h++;
+                }
+
+                config.total_frame_width_in_tiles = job->draw_tiles_x;
+                config.total_frame_height_in_tiles = job->draw_tiles_y;
+
+                config.supertile_width_in_tiles_minus_1 = supertile_w - 1;
+                config.supertile_height_in_tiles_minus_1 = supertile_h - 1;
+
+                config.total_frame_width_in_supertiles = frame_w_in_supertiles;
+                config.total_frame_height_in_supertiles = frame_h_in_supertiles;
+        }
+
+        /* Start by clearing the tile buffer. */
+        cl_emit(&job->rcl, TILE_COORDINATES, coords) {
+                coords.tile_column_number = 0;
+                coords.tile_row_number = 0;
+        }
+
+        /* Emit an initial clear of the tile buffers.  This is necessary for
+         * any buffers that should be cleared (since clearing normally happens
+         * at the *end* of the generic tile list), but it's also nice to clear
+         * everything so the first tile doesn't inherit any contents from some
+         * previous frame.
+         *
+         * Also, implement the GFXH-1742 workaround.  There's a race in the HW
+         * between the RCL updating the TLB's internal type/size and the
+         * spawning of the QPU instances using the TLB's current internal
+         * type/size.  To make sure the QPUs get the right state, we need 1
+         * dummy store in between internal type/size changes on V3D 3.x, and 2
+         * dummy stores on 4.x.
+         */
+#if V3D_VERSION < 40
+        cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) {
+                store.buffer_to_store = NONE;
+        }
+#else
+        for (int i = 0; i < 2; i++) {
+                if (i > 0)
+                        cl_emit(&job->rcl, TILE_COORDINATES, coords);
+                cl_emit(&job->rcl, END_OF_LOADS, end);
+                cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) {
+                        store.buffer_to_store = NONE;
+                }
+                if (i == 0) {
+                        cl_emit(&job->rcl, CLEAR_TILE_BUFFERS, clear) {
+                                clear.clear_z_stencil_buffer = true;
+                                clear.clear_all_render_targets = true;
+                        }
+                }
+                cl_emit(&job->rcl, END_OF_TILE_MARKER, end);
+        }
+#endif
+
+        cl_emit(&job->rcl, FLUSH_VCD_CACHE, flush);
+
+        vc5_rcl_emit_generic_per_tile_list(job, nr_cbufs - 1);
+
+        cl_emit(&job->rcl, WAIT_ON_SEMAPHORE, sem);
+
+        /* XXX: Use Morton order */
+        uint32_t supertile_w_in_pixels = job->tile_width * supertile_w;
+        uint32_t supertile_h_in_pixels = job->tile_height * supertile_h;
+        uint32_t min_x_supertile = job->draw_min_x / supertile_w_in_pixels;
+        uint32_t min_y_supertile = job->draw_min_y / supertile_h_in_pixels;
+
+        uint32_t max_x_supertile = 0;
+        uint32_t max_y_supertile = 0;
+        if (job->draw_max_x != 0 && job->draw_max_y != 0) {
+                max_x_supertile = (job->draw_max_x - 1) / supertile_w_in_pixels;
+                max_y_supertile = (job->draw_max_y - 1) / supertile_h_in_pixels;
+        }
+        /* One SUPERTILE_COORDINATES per supertile inside the draw bounds. */
+        for (int y = min_y_supertile; y <= max_y_supertile; y++) {
+                for (int x = min_x_supertile; x <= max_x_supertile; x++) {
+                        cl_emit(&job->rcl, SUPERTILE_COORDINATES, coords) {
+                                coords.column_number_in_supertiles = x;
+                                coords.row_number_in_supertiles = y;
+                        }
+                }
+        }
+
+        cl_emit(&job->rcl, END_OF_RENDERING, end);
+}
diff --git a/src/gallium/drivers/v3d/v3dx_simulator.c b/src/gallium/drivers/v3d/v3dx_simulator.c
new file mode 100644
index 00000000000..ee8b6f2b9fd
--- /dev/null
+++ b/src/gallium/drivers/v3d/v3dx_simulator.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright © 2014-2017 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * @file v3dx_simulator.c
+ *
+ * Implements the actual HW interaction between the GL driver's VC5 simulator and the simulator.
+ *
+ * The register headers between V3D versions will have conflicting defines, so
+ * all register interactions appear in this file and are compiled per V3D version
+ * we support.
+ */
+
+#ifdef USE_V3D_SIMULATOR
+
+#include "v3d_screen.h"
+#include "v3d_context.h"
+#include "v3d_simulator_wrapper.h"
+
+#define HW_REGISTER_RO(x) (x)
+#define HW_REGISTER_RW(x) (x)
+#if V3D_VERSION >= 41
+#include "libs/core/v3d/registers/4.1.34.0/v3d.h"
+#else
+#include "libs/core/v3d/registers/3.3.0.0/v3d.h"
+#endif
+
+#define V3D_WRITE(reg, val) v3d_hw_write_reg(v3d, reg, val)
+#define V3D_READ(reg) v3d_hw_read_reg(v3d, reg)
+
+static void
+vc5_flush_l3(struct v3d_hw *v3d)
+{
+        if (!v3d_hw_has_gca(v3d))
+                return;
+
+#if V3D_VERSION < 40
+        uint32_t gca_ctrl = V3D_READ(V3D_GCA_CACHE_CTRL);
+
+        V3D_WRITE(V3D_GCA_CACHE_CTRL, gca_ctrl | V3D_GCA_CACHE_CTRL_FLUSH_SET);
+        V3D_WRITE(V3D_GCA_CACHE_CTRL, gca_ctrl & ~V3D_GCA_CACHE_CTRL_FLUSH_SET);
+#endif
+}
+
+/* Invalidates the L2 cache.  This is a read-only cache. */
+static void
+vc5_flush_l2(struct v3d_hw *v3d)
+{
+        V3D_WRITE(V3D_CTL_0_L2CACTL,
+                  V3D_CTL_0_L2CACTL_L2CCLR_SET |
+                  V3D_CTL_0_L2CACTL_L2CENA_SET);
+}
+
+/* Invalidates texture L2 cachelines */
+static void
+vc5_flush_l2t(struct v3d_hw *v3d)
+{
+        V3D_WRITE(V3D_CTL_0_L2TFLSTA, 0);
+        V3D_WRITE(V3D_CTL_0_L2TFLEND, ~0);
+        V3D_WRITE(V3D_CTL_0_L2TCACTL,
+                  V3D_CTL_0_L2TCACTL_L2TFLS_SET |
+                  (0 << V3D_CTL_0_L2TCACTL_L2TFLM_LSB));
+}
+
+/* Invalidates the slice caches.  These are read-only caches. */
+static void
+vc5_flush_slices(struct v3d_hw *v3d)
+{
+        V3D_WRITE(V3D_CTL_0_SLCACTL, ~0);
+}
+
+static void
+vc5_flush_caches(struct v3d_hw *v3d)
+{
+        vc5_flush_l3(v3d);
+        vc5_flush_l2(v3d);
+        vc5_flush_l2t(v3d);
+        vc5_flush_slices(v3d);
+}
+
+int
+v3dX(simulator_get_param_ioctl)(struct v3d_hw *v3d,
+                                struct drm_v3d_get_param *args)
+{
+        static const uint32_t reg_map[] = {
+                [DRM_V3D_PARAM_V3D_UIFCFG] = V3D_HUB_CTL_UIFCFG,
+                [DRM_V3D_PARAM_V3D_HUB_IDENT1] = V3D_HUB_CTL_IDENT1,
+                [DRM_V3D_PARAM_V3D_HUB_IDENT2] = V3D_HUB_CTL_IDENT2,
+                [DRM_V3D_PARAM_V3D_HUB_IDENT3] = V3D_HUB_CTL_IDENT3,
+                [DRM_V3D_PARAM_V3D_CORE0_IDENT0] = V3D_CTL_0_IDENT0,
+                [DRM_V3D_PARAM_V3D_CORE0_IDENT1] = V3D_CTL_0_IDENT1,
+                [DRM_V3D_PARAM_V3D_CORE0_IDENT2] = V3D_CTL_0_IDENT2,
+        };
+        /* Params with no (non-zero) reg_map entry are rejected below. */
+        if (args->param < ARRAY_SIZE(reg_map) && reg_map[args->param]) {
+                args->value = V3D_READ(reg_map[args->param]);
+                return 0;
+        }
+        /* Report the unrecognized param id, not the output value field. */
+        fprintf(stderr, "Unknown DRM_IOCTL_VC5_GET_PARAM(%lld)\n",
+                (long long)args->param);
+        abort();
+}
+
+void
+v3dX(simulator_init_regs)(struct v3d_hw *v3d)
+{
+#if V3D_VERSION == 33
+        /* Set OVRTMUOUT to match kernel behavior.  The texture sampler
+         * uniform configuration's tmu output type field is then used instead
+         * of the hardware default based on the texture type; "2" in the
+         * indirect texture state's output_type field still gives the default.
+         *
+         * NOTE(review): the bit macro is the CTL_1 variant but the register
+         * written is CTL_0 -- presumably the same bit layout; confirm.
+         */
+        V3D_WRITE(V3D_CTL_0_MISCCFG, V3D_CTL_1_MISCCFG_OVRTMUOUT_SET);
+#endif
+}
+
+void
+v3dX(simulator_flush)(struct v3d_hw *v3d, struct drm_v3d_submit_cl *submit,
+                      uint32_t gmp_ofs)
+{
+        /* Completely reset the GMP. */
+        V3D_WRITE(V3D_GMP_0_CFG,
+                  V3D_GMP_0_CFG_PROTENABLE_SET);
+        V3D_WRITE(V3D_GMP_0_TABLE_ADDR, gmp_ofs);
+        V3D_WRITE(V3D_GMP_0_CLEAR_LOAD, ~0);
+        while (V3D_READ(V3D_GMP_0_STATUS) &
+               V3D_GMP_0_STATUS_CFG_BUSY_SET) {
+                ;
+        }
+
+        vc5_flush_caches(v3d);
+
+        if (submit->qma) {
+                V3D_WRITE(V3D_CLE_0_CT0QMA, submit->qma);
+                V3D_WRITE(V3D_CLE_0_CT0QMS, submit->qms);
+        }
+#if V3D_VERSION >= 41
+        if (submit->qts) {
+                V3D_WRITE(V3D_CLE_0_CT0QTS,
+                          V3D_CLE_0_CT0QTS_CTQTSEN_SET |
+                          submit->qts);
+        }
+#endif
+        V3D_WRITE(V3D_CLE_0_CT0QBA, submit->bcl_start);
+        V3D_WRITE(V3D_CLE_0_CT0QEA, submit->bcl_end);
+
+        /* Wait for bin to complete before firing render, as it seems the
+         * simulator doesn't implement the semaphores.
+         */
+        while (V3D_READ(V3D_CLE_0_CT0CA) !=
+               V3D_READ(V3D_CLE_0_CT0EA)) {
+                v3d_hw_tick(v3d);
+        }
+
+        V3D_WRITE(V3D_CLE_0_CT1QBA, submit->rcl_start);
+        V3D_WRITE(V3D_CLE_0_CT1QEA, submit->rcl_end);
+
+        while (V3D_READ(V3D_CLE_0_CT1CA) !=
+               V3D_READ(V3D_CLE_0_CT1EA) ||
+               V3D_READ(V3D_CLE_1_CT1CA) !=
+               V3D_READ(V3D_CLE_1_CT1EA)) {
+                v3d_hw_tick(v3d);
+        }
+}
+
+#endif /* USE_V3D_SIMULATOR */
diff --git a/src/gallium/drivers/v3d/v3dx_state.c b/src/gallium/drivers/v3d/v3dx_state.c
new file mode 100644
index 00000000000..e992796a218
--- /dev/null
+++ b/src/gallium/drivers/v3d/v3dx_state.c
@@ -0,0 +1,951 @@
+/*
+ * Copyright © 2014-2017 Broadcom
+ * Copyright (C) 2012 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "pipe/p_state.h"
+#include "util/u_format.h"
+#include "util/u_framebuffer.h"
+#include "util/u_inlines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_half.h"
+#include "util/u_helpers.h"
+
+#include "v3d_context.h"
+#include "v3d_tiling.h"
+#include "broadcom/common/v3d_macros.h"
+#include "broadcom/cle/v3dx_pack.h"
+
+static void *
+vc5_generic_cso_state_create(const void *src, uint32_t size)
+{
+        /* Heap-allocated byte copy of src; NULL if allocation fails. */
+        void *copy = calloc(1, size);
+        if (copy != NULL)
+                memcpy(copy, src, size);
+        return copy;
+}
+
+static void
+vc5_generic_cso_state_delete(struct pipe_context *pctx, void *hwcso)
+{
+        free(hwcso);
+}
+
+static void
+vc5_set_blend_color(struct pipe_context *pctx,
+                    const struct pipe_blend_color *blend_color)
+{
+        struct vc5_context *vc5 = vc5_context(pctx);
+        vc5->blend_color.f = *blend_color;
+        for (int i = 0; i < 4; i++) {
+                vc5->blend_color.hf[i] =
+                        util_float_to_half(blend_color->color[i]);
+        }
+        vc5->dirty |= VC5_DIRTY_BLEND_COLOR;
+}
+
+static void
+vc5_set_stencil_ref(struct pipe_context *pctx,
+                    const struct pipe_stencil_ref *stencil_ref)
+{
+        struct vc5_context *vc5 = vc5_context(pctx);
+        vc5->stencil_ref = *stencil_ref;
+        vc5->dirty |= VC5_DIRTY_STENCIL_REF;
+}
+
+static void
+vc5_set_clip_state(struct pipe_context *pctx,
+                   const struct pipe_clip_state *clip)
+{
+        struct vc5_context *vc5 = vc5_context(pctx);
+        vc5->clip = *clip;
+        vc5->dirty |= VC5_DIRTY_CLIP;
+}
+
+static void
+vc5_set_sample_mask(struct pipe_context *pctx, unsigned sample_mask)
+{
+        struct vc5_context *vc5 = vc5_context(pctx);
+        vc5->sample_mask = sample_mask & ((1 << VC5_MAX_SAMPLES) - 1);
+        vc5->dirty |= VC5_DIRTY_SAMPLE_MASK;
+}
+
+static uint16_t
+float_to_187_half(float f)
+{
+        return fui(f) >> 16;
+}
+
+static void *
+vc5_create_rasterizer_state(struct pipe_context *pctx,
+                            const struct pipe_rasterizer_state *cso)
+{
+        struct vc5_rasterizer_state *so;
+
+        so = CALLOC_STRUCT(vc5_rasterizer_state);
+        if (!so)
+                return NULL;
+
+        so->base = *cso;
+
+        /* Workaround: HW-2726 PTB does not handle zero-size points (BCM2835,
+         * BCM21553).
+         */
+        so->point_size = MAX2(cso->point_size, .125f);
+
+        if (cso->offset_tri) {
+                so->offset_units = float_to_187_half(cso->offset_units);
+                so->offset_factor = float_to_187_half(cso->offset_scale);
+        }
+
+        return so;
+}
+
+/* Blend state is baked into shaders. */
+static void *
+vc5_create_blend_state(struct pipe_context *pctx,
+                       const struct pipe_blend_state *cso)
+{
+        return vc5_generic_cso_state_create(cso, sizeof(*cso));
+}
+
+static uint32_t
+translate_stencil_op(enum pipe_stencil_op op)
+{
+        switch (op) {
+        case PIPE_STENCIL_OP_KEEP:      return V3D_STENCIL_OP_KEEP;
+        case PIPE_STENCIL_OP_ZERO:      return V3D_STENCIL_OP_ZERO;
+        case PIPE_STENCIL_OP_REPLACE:   return V3D_STENCIL_OP_REPLACE;
+        case PIPE_STENCIL_OP_INCR:      return V3D_STENCIL_OP_INCR;
+        case PIPE_STENCIL_OP_DECR:      return V3D_STENCIL_OP_DECR;
+        case PIPE_STENCIL_OP_INCR_WRAP: return V3D_STENCIL_OP_INCWRAP;
+        case PIPE_STENCIL_OP_DECR_WRAP: return V3D_STENCIL_OP_DECWRAP;
+        case PIPE_STENCIL_OP_INVERT:    return V3D_STENCIL_OP_INVERT;
+        }
+        unreachable("bad stencil op");
+}
+
+static void *
+vc5_create_depth_stencil_alpha_state(struct pipe_context *pctx,
+                                     const struct pipe_depth_stencil_alpha_state *cso)
+{
+        struct vc5_depth_stencil_alpha_state *so;
+
+        so = CALLOC_STRUCT(vc5_depth_stencil_alpha_state);
+        if (!so)
+                return NULL;
+
+        so->base = *cso;
+
+        if (cso->depth.enabled) {
+                switch (cso->depth.func) {
+                case PIPE_FUNC_LESS:
+                case PIPE_FUNC_LEQUAL:
+                        so->ez_state = VC5_EZ_LT_LE;
+                        break;
+                case PIPE_FUNC_GREATER:
+                case PIPE_FUNC_GEQUAL:
+                        so->ez_state = VC5_EZ_GT_GE;
+                        break;
+                case PIPE_FUNC_NEVER:
+                case PIPE_FUNC_EQUAL:
+                        so->ez_state = VC5_EZ_UNDECIDED;
+                        break;
+                default:
+                        so->ez_state = VC5_EZ_DISABLED;
+                        break;
+                }
+
+                /* A stencil test that is not a no-op on either face breaks
+                 * EZ updates; the back face counts only when it is enabled.
+                 */
+                if (cso->stencil[0].enabled &&
+                    (cso->stencil[0].zfail_op != PIPE_STENCIL_OP_KEEP ||
+                     cso->stencil[0].func != PIPE_FUNC_ALWAYS ||
+                     (cso->stencil[1].enabled &&
+                      (cso->stencil[1].zfail_op != PIPE_STENCIL_OP_KEEP ||
+                       cso->stencil[1].func != PIPE_FUNC_ALWAYS)))) {
+                        so->ez_state = VC5_EZ_DISABLED;
+                }
+        }
+
+        const struct pipe_stencil_state *front = &cso->stencil[0];
+        const struct pipe_stencil_state *back = &cso->stencil[1];
+
+        if (front->enabled) {
+                v3dx_pack(&so->stencil_front, STENCIL_CONFIG, config) {
+                        config.front_config = true;
+                        /* If !back->enabled, then the front values should be
+                         * used for both front and back-facing primitives.
+                         */
+                        config.back_config = !back->enabled;
+
+                        config.stencil_write_mask = front->writemask;
+                        config.stencil_test_mask = front->valuemask;
+
+                        config.stencil_test_function = front->func;
+                        config.stencil_pass_op =
+                                translate_stencil_op(front->zpass_op);
+                        config.depth_test_fail_op =
+                                translate_stencil_op(front->zfail_op);
+                        config.stencil_test_fail_op =
+                                translate_stencil_op(front->fail_op);
+                }
+        }
+        if (back->enabled) {
+                v3dx_pack(&so->stencil_back, STENCIL_CONFIG, config) {
+                        config.front_config = false;
+                        config.back_config = true;
+
+                        config.stencil_write_mask = back->writemask;
+                        config.stencil_test_mask = back->valuemask;
+
+                        config.stencil_test_function = back->func;
+                        config.stencil_pass_op =
+                                translate_stencil_op(back->zpass_op);
+                        config.depth_test_fail_op =
+                                translate_stencil_op(back->zfail_op);
+                        config.stencil_test_fail_op =
+                                translate_stencil_op(back->fail_op);
+                }
+        }
+
+        return so;
+}
+
+static void
+vc5_set_polygon_stipple(struct pipe_context *pctx,
+                        const struct pipe_poly_stipple *stipple)
+{
+        struct vc5_context *vc5 = vc5_context(pctx);
+        vc5->stipple = *stipple;
+        vc5->dirty |= VC5_DIRTY_STIPPLE;
+}
+
+static void
+vc5_set_scissor_states(struct pipe_context *pctx,
+                       unsigned start_slot,
+                       unsigned num_scissors,
+                       const struct pipe_scissor_state *scissor)
+{
+        struct vc5_context *vc5 = vc5_context(pctx);
+
+        vc5->scissor = *scissor;
+        vc5->dirty |= VC5_DIRTY_SCISSOR;
+}
+
+static void
+vc5_set_viewport_states(struct pipe_context *pctx,
+                        unsigned start_slot,
+                        unsigned num_viewports,
+                        const struct pipe_viewport_state *viewport)
+{
+        struct vc5_context *vc5 = vc5_context(pctx);
+        vc5->viewport = *viewport;
+        vc5->dirty |= VC5_DIRTY_VIEWPORT;
+}
+
+static void
+vc5_set_vertex_buffers(struct pipe_context *pctx,
+                       unsigned start_slot, unsigned count,
+                       const struct pipe_vertex_buffer *vb)
+{
+        struct vc5_context *vc5 = vc5_context(pctx);
+        struct vc5_vertexbuf_stateobj *so = &vc5->vertexbuf;
+
+        util_set_vertex_buffers_mask(so->vb, &so->enabled_mask, vb,
+                                     start_slot, count);
+        so->count = util_last_bit(so->enabled_mask);
+
+        vc5->dirty |= VC5_DIRTY_VTXBUF;
+}
+
+static void
+vc5_blend_state_bind(struct pipe_context *pctx, void *hwcso)
+{
+        struct vc5_context *vc5 = vc5_context(pctx);
+        vc5->blend = hwcso;
+        vc5->dirty |= VC5_DIRTY_BLEND;
+}
+
+static void
+vc5_rasterizer_state_bind(struct pipe_context *pctx, void *hwcso)
+{
+        struct vc5_context *vc5 = vc5_context(pctx);
+        vc5->rasterizer = hwcso;
+        vc5->dirty |= VC5_DIRTY_RASTERIZER;
+}
+
+static void
+vc5_zsa_state_bind(struct pipe_context *pctx, void *hwcso)
+{
+        struct vc5_context *vc5 = vc5_context(pctx);
+        vc5->zsa = hwcso;
+        vc5->dirty |= VC5_DIRTY_ZSA;
+}
+
+static void *
+vc5_vertex_state_create(struct pipe_context *pctx, unsigned num_elements,
+                        const struct pipe_vertex_element *elements)
+{
+        struct vc5_context *vc5 = vc5_context(pctx);
+        struct vc5_vertex_stateobj *so = CALLOC_STRUCT(vc5_vertex_stateobj);
+
+        if (!so)
+                return NULL;
+
+        memcpy(so->pipe, elements, sizeof(*elements) * num_elements);
+        so->num_elements = num_elements;
+
+        for (int i = 0; i < so->num_elements; i++) {
+                const struct pipe_vertex_element *elem = &elements[i];
+                const struct util_format_description *desc =
+                        util_format_description(elem->src_format);
+                uint32_t r_size = desc->channel[0].size;
+
+                const uint32_t size =
+                        cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD);
+
+                v3dx_pack(&so->attrs[i * size],
+                          GL_SHADER_STATE_ATTRIBUTE_RECORD, attr) {
+                        /* vec_size == 0 means 4 */
+                        attr.vec_size = desc->nr_channels & 3;
+                        attr.signed_int_type = (desc->channel[0].type ==
+                                                UTIL_FORMAT_TYPE_SIGNED);
+
+                        attr.normalized_int_type = desc->channel[0].normalized;
+                        attr.read_as_int_uint = desc->channel[0].pure_integer;
+                        attr.instance_divisor = MIN2(elem->instance_divisor,
+                                                     0xffff);
+
+                        switch (desc->channel[0].type) {
+                        case UTIL_FORMAT_TYPE_FLOAT:
+                                if (r_size == 32) {
+                                        attr.type = ATTRIBUTE_FLOAT;
+                                } else {
+                                        assert(r_size == 16);
+                                        attr.type = ATTRIBUTE_HALF_FLOAT;
+                                }
+                                break;
+
+                        case UTIL_FORMAT_TYPE_SIGNED:
+                        case UTIL_FORMAT_TYPE_UNSIGNED:
+                                switch (r_size) {
+                                case 32:
+                                        attr.type = ATTRIBUTE_INT;
+                                        break;
+                                case 16:
+                                        attr.type = ATTRIBUTE_SHORT;
+                                        break;
+                                case 10:
+                                        attr.type = ATTRIBUTE_INT2_10_10_10;
+                                        break;
+                                case 8:
+                                        attr.type = ATTRIBUTE_BYTE;
+                                        break;
+                                default:
+                                        fprintf(stderr,
+                                                "format %s unsupported\n",
+                                                desc->name);
+                                        attr.type = ATTRIBUTE_BYTE;
+                                        abort();
+                                }
+                                break;
+
+                        default:
+                                fprintf(stderr,
+                                        "format %s unsupported\n",
+                                        desc->name);
+                                abort();
+                        }
+                }
+        }
+
+        /* Set up the default attribute values in case any of the vertex
+         * elements use them.
+         */
+        so->default_attribute_values = vc5_bo_alloc(vc5->screen,
+                                                    VC5_MAX_ATTRIBUTES *
+                                                    4 * sizeof(float),
+                                                    "default attributes");
+        uint32_t *attrs = vc5_bo_map(so->default_attribute_values);
+        for (int i = 0; i < VC5_MAX_ATTRIBUTES; i++) {
+                attrs[i * 4 + 0] = 0;
+                attrs[i * 4 + 1] = 0;
+                attrs[i * 4 + 2] = 0;
+                if (i < so->num_elements &&
+                    util_format_is_pure_integer(so->pipe[i].src_format)) {
+                        attrs[i * 4 + 3] = 1;
+                } else {
+                        attrs[i * 4 + 3] = fui(1.0);
+                }
+        }
+
+        return so;
+}
+
+static void
+vc5_vertex_state_bind(struct pipe_context *pctx, void *hwcso)
+{
+        struct vc5_context *vc5 = vc5_context(pctx);
+        vc5->vtx = hwcso;
+        vc5->dirty |= VC5_DIRTY_VTXSTATE;
+}
+
+static void
+vc5_set_constant_buffer(struct pipe_context *pctx, uint shader, uint index,
+                        const struct pipe_constant_buffer *cb)
+{
+        struct vc5_context *vc5 = vc5_context(pctx);
+        struct vc5_constbuf_stateobj *so = &vc5->constbuf[shader];
+
+        util_copy_constant_buffer(&so->cb[index], cb);
+
+        /* Note that the state tracker can unbind constant buffers by
+         * passing NULL here.
+         */
+        if (unlikely(!cb)) {
+                so->enabled_mask &= ~(1 << index);
+                so->dirty_mask &= ~(1 << index);
+                return;
+        }
+
+        so->enabled_mask |= 1 << index;
+        so->dirty_mask |= 1 << index;
+        vc5->dirty |= VC5_DIRTY_CONSTBUF;
+}
+
+static void
+vc5_set_framebuffer_state(struct pipe_context *pctx,
+                          const struct pipe_framebuffer_state *framebuffer)
+{
+        struct vc5_context *vc5 = vc5_context(pctx);
+        struct pipe_framebuffer_state *cso = &vc5->framebuffer;
+
+        vc5->job = NULL;
+
+        util_copy_framebuffer_state(cso, framebuffer);
+
+        vc5->swap_color_rb = 0;
+        vc5->blend_dst_alpha_one = 0;
+        for (int i = 0; i < vc5->framebuffer.nr_cbufs; i++) {
+                struct pipe_surface *cbuf = vc5->framebuffer.cbufs[i];
+                if (!cbuf)
+                        continue;
+
+                const struct util_format_description *desc =
+                        util_format_description(cbuf->format);
+
+                /* For BGRA8 formats (DRI window system default format), we
+                 * need to swap R and B, since the HW's format is RGBA8.
+                 */
+                if (desc->swizzle[0] == PIPE_SWIZZLE_Z &&
+                    cbuf->format != PIPE_FORMAT_B5G6R5_UNORM) {
+                        vc5->swap_color_rb |= 1 << i;
+                }
+
+                if (desc->swizzle[3] == PIPE_SWIZZLE_1)
+                        vc5->blend_dst_alpha_one |= 1 << i;
+        }
+
+        vc5->dirty |= VC5_DIRTY_FRAMEBUFFER;
+}
+
+static struct vc5_texture_stateobj *
+vc5_get_stage_tex(struct vc5_context *vc5, enum pipe_shader_type shader)
+{
+        /* Looks up the per-stage texture state and, as a side effect, flags
+         * that stage's textures dirty.  Only fragment and vertex are handled.
+         */
+        if (shader == PIPE_SHADER_FRAGMENT) {
+                vc5->dirty |= VC5_DIRTY_FRAGTEX;
+                return &vc5->fragtex;
+        } else if (shader == PIPE_SHADER_VERTEX) {
+                vc5->dirty |= VC5_DIRTY_VERTTEX;
+                return &vc5->verttex;
+        }
+
+        fprintf(stderr, "Unknown shader target %d\n", shader);
+        abort();
+}
+
+static uint32_t translate_wrap(uint32_t pipe_wrap, bool using_nearest)
+{
+        switch (pipe_wrap) {
+        case PIPE_TEX_WRAP_REPEAT:
+                return 0;
+        case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+                return 1;
+        case PIPE_TEX_WRAP_MIRROR_REPEAT:
+                return 2;
+        case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+                return 3;
+        case PIPE_TEX_WRAP_CLAMP:
+                return (using_nearest ? 1 : 3);
+        default:
+                unreachable("Unknown wrap mode");
+        }
+}
+
+
+static void *
+vc5_create_sampler_state(struct pipe_context *pctx,
+                         const struct pipe_sampler_state *cso)
+{
+        MAYBE_UNUSED struct vc5_context *vc5 = vc5_context(pctx);
+        struct vc5_sampler_state *so = CALLOC_STRUCT(vc5_sampler_state);
+
+        if (!so)
+                return NULL;
+
+        memcpy(so, cso, sizeof(*cso));
+        /* Affects how translate_wrap() maps PIPE_TEX_WRAP_CLAMP. */
+        bool either_nearest =
+                (cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST ||
+                 cso->min_img_filter == PIPE_TEX_FILTER_NEAREST);
+
+#if V3D_VERSION >= 40
+        so->bo = vc5_bo_alloc(vc5->screen, cl_packet_length(SAMPLER_STATE),
+                              "sampler");
+        void *map = vc5_bo_map(so->bo);
+
+        v3dx_pack(map, SAMPLER_STATE, sampler) {
+                sampler.wrap_i_border = false;
+
+                sampler.wrap_s = translate_wrap(cso->wrap_s, either_nearest);
+                sampler.wrap_t = translate_wrap(cso->wrap_t, either_nearest);
+                sampler.wrap_r = translate_wrap(cso->wrap_r, either_nearest);
+
+                sampler.fixed_bias = cso->lod_bias;
+                sampler.depth_compare_function = cso->compare_func;
+
+                sampler.min_filter_nearest =
+                        cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
+                sampler.mag_filter_nearest =
+                        cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
+                sampler.mip_filter_nearest =
+                        cso->min_mip_filter != PIPE_TEX_MIPFILTER_LINEAR;
+
+                sampler.min_level_of_detail = MIN2(MAX2(0, cso->min_lod),
+                                                   15);
+                sampler.max_level_of_detail = MIN2(cso->max_lod, 15);
+
+                if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) {
+                        sampler.min_level_of_detail = 0;
+                        sampler.max_level_of_detail = 0;
+                }
+
+                if (cso->max_anisotropy) {
+                        sampler.anisotropy_enable = true;
+
+                        if (cso->max_anisotropy > 8)
+                                sampler.maximum_anisotropy = 3;
+                        else if (cso->max_anisotropy > 4)
+                                sampler.maximum_anisotropy = 2;
+                        else if (cso->max_anisotropy > 2)
+                                sampler.maximum_anisotropy = 1;
+                }
+
+                sampler.border_colour_mode = V3D_BORDER_COLOUR_FOLLOWS;
+                /* XXX: The border colour field is in the TMU blending format
+                 * (32, f16, or i16), and we need to customize it based on
+                 * that.
+                 *
+                 * XXX: for compat alpha formats, we need the alpha field to
+                 * be in the red channel.
+                 */
+                sampler.border_colour_red =
+                        util_float_to_half(cso->border_color.f[0]);
+                sampler.border_colour_green =
+                        util_float_to_half(cso->border_color.f[1]);
+                sampler.border_colour_blue =
+                        util_float_to_half(cso->border_color.f[2]);
+                sampler.border_colour_alpha =
+                        util_float_to_half(cso->border_color.f[3]);
+        }
+
+#else /* V3D_VERSION < 40 */
+        v3dx_pack(&so->p0, TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1, p0) {
+                p0.s_wrap_mode = translate_wrap(cso->wrap_s, either_nearest);
+                p0.t_wrap_mode = translate_wrap(cso->wrap_t, either_nearest);
+                p0.r_wrap_mode = translate_wrap(cso->wrap_r, either_nearest);
+        }
+
+        v3dx_pack(&so->texture_shader_state, TEXTURE_SHADER_STATE, tex) {
+                tex.depth_compare_function = cso->compare_func;
+                tex.fixed_bias = cso->lod_bias;
+        }
+#endif /* V3D_VERSION < 40 */
+        return so;
+}
+
+static void
+vc5_sampler_states_bind(struct pipe_context *pctx,
+                        enum pipe_shader_type shader, unsigned start,
+                        unsigned nr, void **hwcso)
+{
+        struct vc5_texture_stateobj *tex =
+                vc5_get_stage_tex(vc5_context(pctx), shader);
+        unsigned slot = 0, bound = 0;
+
+        assert(start == 0);
+
+        /* Install the new samplers, tracking one past the last non-NULL. */
+        for (; slot < nr; slot++) {
+                tex->samplers[slot] = hwcso[slot];
+                if (hwcso[slot])
+                        bound = slot + 1;
+        }
+
+        /* Clear any previously bound slots beyond the new range. */
+        while (slot < tex->num_samplers)
+                tex->samplers[slot++] = NULL;
+
+        tex->num_samplers = bound;
+}
+
+static void
+vc5_sampler_state_delete(struct pipe_context *pctx,
+                         void *hwcso)
+{
+        struct pipe_sampler_state *psampler = hwcso;
+        struct vc5_sampler_state *sampler = vc5_sampler_state(psampler);
+
+        vc5_bo_unreference(&sampler->bo);
+        free(psampler);
+}
+
+#if V3D_VERSION >= 40
+static uint32_t
+translate_swizzle(unsigned char pipe_swizzle)
+{
+        switch (pipe_swizzle) {
+        case PIPE_SWIZZLE_0:
+                return 0;
+        case PIPE_SWIZZLE_1:
+                return 1;
+        case PIPE_SWIZZLE_X:
+        case PIPE_SWIZZLE_Y:
+        case PIPE_SWIZZLE_Z:
+        case PIPE_SWIZZLE_W:
+                return 2 + pipe_swizzle;
+        default:
+                unreachable("unknown swizzle");
+        }
+}
+#endif
+
+/* pipe_context::create_sampler_view hook.
+ *
+ * Allocates a vc5_sampler_view for prsc, copies the template cso into it,
+ * takes a reference on prsc, and precomputes both the composed
+ * (format + view) swizzle and the packed TEXTURE_SHADER_STATE record.  On
+ * V3D 4.x the record is packed into a freshly allocated BO (so->bo); on
+ * earlier versions it is packed into so->texture_shader_state.
+ *
+ * Returns the new view's base pipe_sampler_view, or NULL if the view struct
+ * could not be allocated.
+ */
+static struct pipe_sampler_view *
+vc5_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc,
+                        const struct pipe_sampler_view *cso)
+{
+        struct vc5_context *vc5 = vc5_context(pctx);
+        struct vc5_screen *screen = vc5->screen;
+        struct vc5_sampler_view *so = CALLOC_STRUCT(vc5_sampler_view);
+        struct vc5_resource *rsc = vc5_resource(prsc);
+
+        if (!so)
+                return NULL;
+
+        so->base = *cso;
+
+        /* The view holds its own reference on the resource; it is dropped
+         * again when the view is destroyed.
+         */
+        pipe_reference(NULL, &prsc->reference);
+
+        /* Compute the sampler view's swizzle up front. This will be plugged
+         * into either the sampler (for 16-bit returns) or the shader's
+         * texture key (for 32)
+         */
+        uint8_t view_swizzle[4] = {
+                cso->swizzle_r,
+                cso->swizzle_g,
+                cso->swizzle_b,
+                cso->swizzle_a
+        };
+        const uint8_t *fmt_swizzle =
+                vc5_get_format_swizzle(&screen->devinfo, so->base.format);
+        util_format_compose_swizzles(fmt_swizzle, view_swizzle, so->swizzle);
+
+        so->base.texture = prsc;
+        so->base.reference.count = 1;
+        so->base.context = pctx;
+
+        /* Multisampled resources are described to the hardware with doubled
+         * width and height.
+         */
+        int msaa_scale = prsc->nr_samples > 1 ? 2 : 1;
+
+#if V3D_VERSION >= 40
+        /* The BO must be sized for the packet that is actually packed into
+         * it below, which is TEXTURE_SHADER_STATE (not SAMPLER_STATE).
+         *
+         * NOTE(review): the result of vc5_bo_alloc() is not checked before
+         * it is mapped -- confirm whether it can fail here.
+         */
+        so->bo = vc5_bo_alloc(vc5->screen,
+                              cl_packet_length(TEXTURE_SHADER_STATE),
+                              "sampler");
+        void *map = vc5_bo_map(so->bo);
+
+        v3dx_pack(map, TEXTURE_SHADER_STATE, tex) {
+#else /* V3D_VERSION < 40 */
+        v3dx_pack(&so->texture_shader_state, TEXTURE_SHADER_STATE, tex) {
+#endif
+
+                tex.image_width = prsc->width0 * msaa_scale;
+                tex.image_height = prsc->height0 * msaa_scale;
+
+#if V3D_VERSION >= 40
+                /* On 4.x, the height of a 1D texture is redefined to be the
+                 * upper 14 bits of the width (which is only usable with txf).
+                 */
+                if (prsc->target == PIPE_TEXTURE_1D ||
+                    prsc->target == PIPE_TEXTURE_1D_ARRAY) {
+                        tex.image_height = tex.image_width >> 14;
+                }
+#endif
+
+                /* 3D textures use the resource depth; everything else uses
+                 * the number of layers selected by the view.
+                 */
+                if (prsc->target == PIPE_TEXTURE_3D) {
+                        tex.image_depth = prsc->depth0;
+                } else {
+                        tex.image_depth = (cso->u.tex.last_layer -
+                                           cso->u.tex.first_layer) + 1;
+                }
+
+                tex.srgb = util_format_is_srgb(cso->format);
+
+                tex.base_level = cso->u.tex.first_level;
+#if V3D_VERSION >= 40
+                tex.max_level = cso->u.tex.last_level;
+                /* Note that we don't have a job to reference the texture's BO
+                 * at state create time, so any time this sampler view is used
+                 * we need to add the texture to the job.
+                 */
+                tex.texture_base_pointer = cl_address(NULL,
+                                                      rsc->bo->offset +
+                                                      rsc->slices[0].offset);
+
+                tex.swizzle_r = translate_swizzle(so->swizzle[0]);
+                tex.swizzle_g = translate_swizzle(so->swizzle[1]);
+                tex.swizzle_b = translate_swizzle(so->swizzle[2]);
+                tex.swizzle_a = translate_swizzle(so->swizzle[3]);
+#endif
+                tex.array_stride_64_byte_aligned = rsc->cube_map_stride / 64;
+
+                if (prsc->nr_samples > 1 && V3D_VERSION < 40) {
+                        /* Using texture views to reinterpret formats on our
+                         * MSAA textures won't work, because we don't lay out
+                         * the bits in memory as it's expected -- for example,
+                         * RGBA8 and RGB10_A2 are compatible in the
+                         * ARB_texture_view spec, but in HW we lay them out as
+                         * 32bpp RGBA8 and 64bpp RGBA16F.  Just assert for now
+                         * to catch failures.
+                         *
+                         * We explicitly allow remapping S8Z24 to RGBA8888 for
+                         * vc5_blit.c's stencil blits.
+                         */
+                        assert((util_format_linear(cso->format) ==
+                                util_format_linear(prsc->format)) ||
+                               (prsc->format == PIPE_FORMAT_S8_UINT_Z24_UNORM &&
+                                cso->format == PIPE_FORMAT_R8G8B8A8_UNORM));
+                        uint32_t output_image_format =
+                                vc5_get_rt_format(&screen->devinfo, cso->format);
+                        uint32_t internal_type;
+                        uint32_t internal_bpp;
+                        vc5_get_internal_type_bpp_for_output_format(&screen->devinfo,
+                                                                    output_image_format,
+                                                                    &internal_type,
+                                                                    &internal_bpp);
+
+                        /* Pick the texture type from the internal type
+                         * reported for the render target format.
+                         */
+                        switch (internal_type) {
+                        case V3D_INTERNAL_TYPE_8:
+                                tex.texture_type = TEXTURE_DATA_FORMAT_RGBA8;
+                                break;
+                        case V3D_INTERNAL_TYPE_16F:
+                                tex.texture_type = TEXTURE_DATA_FORMAT_RGBA16F;
+                                break;
+                        default:
+                                unreachable("Bad MSAA texture type");
+                        }
+
+                        /* sRGB was stored in the tile buffer as linear and
+                         * would have been encoded to sRGB on resolved tile
+                         * buffer store.  Note that this means we would need
+                         * shader code if we wanted to read an MSAA sRGB
+                         * texture without sRGB decode.
+                         */
+                        tex.srgb = false;
+                } else {
+                        tex.texture_type = vc5_get_tex_format(&screen->devinfo,
+                                                              cso->format);
+                }
+
+                /* Since other platform devices may produce UIF images even
+                 * when they're not big enough for V3D to assume they're UIF,
+                 * we force images with level 0 as UIF to be always treated
+                 * that way.
+                 */
+                tex.level_0_is_strictly_uif = (rsc->slices[0].tiling ==
+                                               VC5_TILING_UIF_XOR ||
+                                               rsc->slices[0].tiling ==
+                                               VC5_TILING_UIF_NO_XOR);
+                tex.level_0_xor_enable = (rsc->slices[0].tiling ==
+                                          VC5_TILING_UIF_XOR);
+
+                if (tex.level_0_is_strictly_uif)
+                        tex.level_0_ub_pad = rsc->slices[0].ub_pad;
+
+#if V3D_VERSION >= 40
+                /* Either of these fields being set requires flagging the
+                 * record as extended.
+                 */
+                if (tex.uif_xor_disable ||
+                    tex.level_0_is_strictly_uif) {
+                        tex.extended = true;
+                }
+#endif /* V3D_VERSION >= 40 */
+        }
+
+        return &so->base;
+}
+
+/* pipe_context::sampler_view_destroy hook: releases the view's BO reference
+ * and its reference on the viewed texture, then frees the view.
+ */
+static void
+vc5_sampler_view_destroy(struct pipe_context *pctx,
+                         struct pipe_sampler_view *psview)
+{
+        struct vc5_sampler_view *view = vc5_sampler_view(psview);
+
+        /* Drop the BO referenced by the driver-side view first. */
+        vc5_bo_unreference(&view->bo);
+
+        /* Then give back the reference on the underlying resource. */
+        pipe_resource_reference(&psview->texture, NULL);
+
+        free(psview);
+}
+
+/* pipe_context::set_sampler_views hook.
+ *
+ * Binds views[0..nr) to the given shader stage, unbinds any previously bound
+ * views at or beyond nr, and records the stage's texture count as one past
+ * the highest non-NULL view.  Only start == 0 is supported (asserted).
+ */
+static void
+vc5_set_sampler_views(struct pipe_context *pctx,
+                      enum pipe_shader_type shader,
+                      unsigned start, unsigned nr,
+                      struct pipe_sampler_view **views)
+{
+        struct vc5_context *vc5 = vc5_context(pctx);
+        struct vc5_texture_stateobj *stage_tex = vc5_get_stage_tex(vc5, shader);
+        unsigned slot = 0;
+        unsigned bound_count = 0;
+
+        assert(start == 0);
+
+        /* Take references on the incoming views, tracking the last slot
+         * that actually has a view in it.
+         */
+        while (slot < nr) {
+                if (views[slot] != NULL)
+                        bound_count = slot + 1;
+                pipe_sampler_view_reference(&stage_tex->textures[slot],
+                                            views[slot]);
+                slot++;
+        }
+
+        /* Clear out whatever was bound past the end of the new array. */
+        while (slot < stage_tex->num_textures) {
+                pipe_sampler_view_reference(&stage_tex->textures[slot], NULL);
+                slot++;
+        }
+
+        stage_tex->num_textures = bound_count;
+}
+
+/* pipe_context::create_stream_output_target hook.
+ *
+ * Allocates a zeroed pipe_stream_output_target that references prsc and
+ * records the requested offset and size within that buffer.  Returns NULL
+ * if the allocation fails.
+ */
+static struct pipe_stream_output_target *
+vc5_create_stream_output_target(struct pipe_context *pctx,
+                                struct pipe_resource *prsc,
+                                unsigned buffer_offset,
+                                unsigned buffer_size)
+{
+        struct pipe_stream_output_target *so_target =
+                CALLOC_STRUCT(pipe_stream_output_target);
+
+        if (so_target == NULL)
+                return NULL;
+
+        /* Start with a single reference owned by the caller, and take our
+         * own reference on the backing buffer.
+         */
+        pipe_reference_init(&so_target->reference, 1);
+        pipe_resource_reference(&so_target->buffer, prsc);
+
+        so_target->buffer_size = buffer_size;
+        so_target->buffer_offset = buffer_offset;
+        so_target->context = pctx;
+
+        return so_target;
+}
+
+/* pipe_context::stream_output_target_destroy hook: releases the target's
+ * reference on its buffer and frees the target.
+ */
+static void
+vc5_stream_output_target_destroy(struct pipe_context *pctx,
+                                 struct pipe_stream_output_target *target)
+{
+        struct pipe_resource **held_buffer = &target->buffer;
+
+        /* Passing NULL drops the reference and clears the pointer. */
+        pipe_resource_reference(held_buffer, NULL);
+
+        free(target);
+}
+
+/* pipe_context::set_stream_output_targets hook.
+ *
+ * Replaces the context's bound stream output targets with targets[0..
+ * num_targets), unbinds any previously bound targets beyond that, and flags
+ * the streamout state dirty.  The offsets argument is not used here.
+ */
+static void
+vc5_set_stream_output_targets(struct pipe_context *pctx,
+                              unsigned num_targets,
+                              struct pipe_stream_output_target **targets,
+                              const unsigned *offsets)
+{
+        struct vc5_context *vc5 = vc5_context(pctx);
+        struct vc5_streamout_stateobj *streamout = &vc5->streamout;
+        unsigned slot = 0;
+
+        assert(num_targets <= ARRAY_SIZE(streamout->targets));
+
+        /* Reference the new targets. */
+        while (slot < num_targets) {
+                pipe_so_target_reference(&streamout->targets[slot],
+                                         targets[slot]);
+                slot++;
+        }
+
+        /* Drop any old targets left over past the new count. */
+        while (slot < streamout->num_targets) {
+                pipe_so_target_reference(&streamout->targets[slot], NULL);
+                slot++;
+        }
+
+        streamout->num_targets = num_targets;
+
+        vc5->dirty |= VC5_DIRTY_STREAMOUT;
+}
+
+/* Installs this file's state-handling callbacks into the pipe_context
+ * function table.
+ */
+void
+v3dX(state_init)(struct pipe_context *pctx)
+{
+        /* Constant state objects: create / bind / delete triples. */
+        pctx->create_blend_state = vc5_create_blend_state;
+        pctx->bind_blend_state = vc5_blend_state_bind;
+        pctx->delete_blend_state = vc5_generic_cso_state_delete;
+
+        pctx->create_rasterizer_state = vc5_create_rasterizer_state;
+        pctx->bind_rasterizer_state = vc5_rasterizer_state_bind;
+        pctx->delete_rasterizer_state = vc5_generic_cso_state_delete;
+
+        pctx->create_depth_stencil_alpha_state = vc5_create_depth_stencil_alpha_state;
+        pctx->bind_depth_stencil_alpha_state = vc5_zsa_state_bind;
+        pctx->delete_depth_stencil_alpha_state = vc5_generic_cso_state_delete;
+
+        pctx->create_vertex_elements_state = vc5_vertex_state_create;
+        pctx->bind_vertex_elements_state = vc5_vertex_state_bind;
+        pctx->delete_vertex_elements_state = vc5_generic_cso_state_delete;
+
+        pctx->create_sampler_state = vc5_create_sampler_state;
+        pctx->bind_sampler_states = vc5_sampler_states_bind;
+        pctx->delete_sampler_state = vc5_sampler_state_delete;
+
+        /* Sampler views. */
+        pctx->create_sampler_view = vc5_create_sampler_view;
+        pctx->set_sampler_views = vc5_set_sampler_views;
+        pctx->sampler_view_destroy = vc5_sampler_view_destroy;
+
+        /* Stream output targets. */
+        pctx->create_stream_output_target = vc5_create_stream_output_target;
+        pctx->set_stream_output_targets = vc5_set_stream_output_targets;
+        pctx->stream_output_target_destroy = vc5_stream_output_target_destroy;
+
+        /* Vertex buffers. */
+        pctx->set_vertex_buffers = vc5_set_vertex_buffers;
+
+        /* Parameter-style state setters. */
+        pctx->set_blend_color = vc5_set_blend_color;
+        pctx->set_stencil_ref = vc5_set_stencil_ref;
+        pctx->set_clip_state = vc5_set_clip_state;
+        pctx->set_sample_mask = vc5_set_sample_mask;
+        pctx->set_constant_buffer = vc5_set_constant_buffer;
+        pctx->set_framebuffer_state = vc5_set_framebuffer_state;
+        pctx->set_polygon_stipple = vc5_set_polygon_stipple;
+        pctx->set_scissor_states = vc5_set_scissor_states;
+        pctx->set_viewport_states = vc5_set_viewport_states;
+}
author	Eric Anholt <[email protected]>	2018-05-01 12:24:48 -0700
committer	Eric Anholt <[email protected]>	2018-05-16 21:19:07 +0100
commit	8c47ebbd232704ab048eab2572e2b2a44f38957a (patch)
tree	8946780fc424b3aa39e0b32ac875047605770a49 /src/gallium/drivers/v3d
parent	c4c488a2aeb24c0f468664c0cacd0d01111a4e46 (diff)