5 files changed, 295 insertions, 15 deletions
diff --git a/src/gallium/drivers/vc4/Makefile.sources b/src/gallium/drivers/vc4/Makefile.sources
index 6fb40c20562..24b577ae9f3 100644
--- a/src/gallium/drivers/vc4/Makefile.sources
+++ b/src/gallium/drivers/vc4/Makefile.sources
@@ -21,6 +21,7 @@ C_SOURCES := \
 	vc4_job.c \
 	vc4_nir_lower_blend.c \
 	vc4_nir_lower_io.c \
+	vc4_nir_lower_txf_ms.c \
 	vc4_opt_algebraic.c \
 	vc4_opt_constant_folding.c \
 	vc4_opt_copy_propagation.c \
diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c b/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c
new file mode 100644
index 00000000000..54873e6186a
--- /dev/null
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c
@@ -0,0 +1,172 @@
+/*
+ * Copyright © 2015 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "vc4_qir.h"
+#include "kernel/vc4_packet.h"
+#include "tgsi/tgsi_info.h"
+#include "glsl/nir/nir_builder.h"
+
+/** @file vc4_nir_lower_txf_ms.c
+ * Walks the NIR generated by TGSI-to-NIR and lowers each nir_texop_txf_ms:
+ * the address math is done in the shader and a plain nir_texop_txf is used.
+ *
+ * MSAA textures are laid out as 32x32-aligned blocks of RGBA8888 or Z24S8.
+ * We can't load them through the normal sampler path because of the lack of
+ * linear support in the hardware.  So, we treat MSAA textures as a giant UBO
+ * and do the math in the shader.
+ */
+
+/* Lowers one txf_ms to a txf whose X coord is the sample's byte offset. */
+static void
+vc4_nir_lower_txf_ms_instr(struct vc4_compile *c, nir_builder *b,
+                           nir_tex_instr *txf_ms)
+{
+        if (txf_ms->op != nir_texop_txf_ms)
+                return;
+
+        b->cursor = nir_before_instr(&txf_ms->instr);
+
+        /* The replacement fetch reuses the original's sampler and flags. */
+        nir_tex_instr *txf = nir_tex_instr_create(c->s, 1);
+        txf->op = nir_texop_txf;
+        txf->sampler = txf_ms->sampler;
+        txf->sampler_index = txf_ms->sampler_index;
+        txf->coord_components = txf_ms->coord_components;
+        txf->is_shadow = txf_ms->is_shadow;
+        txf->is_new_style_shadow = txf_ms->is_new_style_shadow;
+
+        /* Only a coordinate and a sample index are expected as sources. */
+        nir_ssa_def *coord = NULL, *sample_index = NULL;
+        for (int i = 0; i < txf_ms->num_srcs; i++) {
+                assert(txf_ms->src[i].src.is_ssa);
+
+                if (txf_ms->src[i].src_type == nir_tex_src_coord)
+                        coord = txf_ms->src[i].src.ssa;
+                else if (txf_ms->src[i].src_type == nir_tex_src_ms_index)
+                        sample_index = txf_ms->src[i].src.ssa;
+                else
+                        unreachable("Unknown txf_ms src\n");
+        }
+        assert(coord);
+        assert(sample_index);
+
+        nir_ssa_def *x = nir_channel(b, coord, 0);
+        nir_ssa_def *y = nir_channel(b, coord, 1);
+
+        uint32_t tile_w = 32, tile_h = 32;
+        uint32_t tile_w_shift = 5, tile_h_shift = 5;
+        uint32_t tile_size = (tile_h * tile_w *
+                              VC4_MAX_SAMPLES * sizeof(uint32_t));
+        unsigned unit = txf_ms->sampler_index;
+        uint32_t w_tiles = align(c->key->tex[unit].msaa_width, tile_w) / tile_w;
+
+        /* Offset of the tile holding the texel; a tile row is w_tiles long. */
+        nir_ssa_def *tile_col = nir_ushr(b, x, nir_imm_int(b, tile_w_shift));
+        nir_ssa_def *tile_row = nir_ushr(b, y, nir_imm_int(b, tile_h_shift));
+        nir_ssa_def *tile_col_offset =
+                nir_imul(b, tile_col, nir_imm_int(b, tile_size));
+        nir_ssa_def *tile_row_offset =
+                nir_imul(b, tile_row, nir_imm_int(b, w_tiles * tile_size));
+        nir_ssa_def *tile_addr = nir_iadd(b, tile_col_offset, tile_row_offset);
+
+        /* Offset of the 2x2 subspan (in-tile x and y with bit 0 cleared). */
+        nir_ssa_def *subspan_x =
+                nir_iand(b, x, nir_imm_int(b, (tile_w - 1) & ~1));
+        nir_ssa_def *subspan_y =
+                nir_iand(b, y, nir_imm_int(b, (tile_h - 1) & ~1));
+        nir_ssa_def *subspan_x_offset =
+                nir_imul(b, subspan_x,
+                         nir_imm_int(b, 2 * VC4_MAX_SAMPLES * sizeof(uint32_t)));
+        nir_ssa_def *subspan_y_offset =
+                nir_imul(b, subspan_y,
+                         nir_imm_int(b, (tile_w * VC4_MAX_SAMPLES *
+                                         sizeof(uint32_t))));
+        nir_ssa_def *subspan_addr =
+                nir_iadd(b, subspan_x_offset, subspan_y_offset);
+
+        /* Bit 2 comes from the low bit of x and bit 3 from the low bit of y. */
+        nir_ssa_def *pixel_x_bit =
+                nir_iand(b, nir_ishl(b, x, nir_imm_int(b, 2)),
+                         nir_imm_int(b, (1 << 2)));
+        nir_ssa_def *pixel_y_bit =
+                nir_iand(b, nir_ishl(b, y, nir_imm_int(b, 3)),
+                         nir_imm_int(b, (1 << 3)));
+        nir_ssa_def *pixel_addr = nir_ior(b, pixel_x_bit, pixel_y_bit);
+
+        /* The sample index lands in the bits above the pixel bits. */
+        nir_ssa_def *sample_addr = nir_ishl(b, sample_index, nir_imm_int(b, 4));
+
+        nir_ssa_def *addr = nir_iadd(b,
+                                     nir_ior(b, sample_addr, pixel_addr),
+                                     nir_iadd(b, subspan_addr, tile_addr));
+
+        /* Emit the new fetch and retire the original instruction. */
+        txf->src[0].src_type = nir_tex_src_coord;
+        txf->src[0].src = nir_src_for_ssa(nir_vec2(b, addr, nir_imm_int(b, 0)));
+        nir_ssa_dest_init(&txf->instr, &txf->dest, 4, NULL);
+        nir_builder_instr_insert(b, &txf->instr);
+        nir_ssa_def_rewrite_uses(&txf_ms->dest.ssa,
+                                 nir_src_for_ssa(&txf->dest.ssa));
+        nir_instr_remove(&txf_ms->instr);
+}
+
+/* Per-block callback: hands each texture instruction to the lowering. */
+static bool
+vc4_nir_lower_txf_ms_block(nir_block *block, void *arg)
+{
+        struct vc4_compile *compile = arg;
+        nir_builder builder;
+
+        nir_builder_init(&builder, nir_cf_node_get_function(&block->cf_node));
+
+        /* The _safe iterator matters: lowering removes the instruction. */
+        nir_foreach_instr_safe(block, instr) {
+                if (instr->type != nir_instr_type_tex)
+                        continue;
+                vc4_nir_lower_txf_ms_instr(compile, &builder,
+                                           nir_instr_as_tex(instr));
+        }
+
+        return true;
+}
+
+/* Lowers every block of @impl, then reports the metadata it preserved. */
+static bool
+vc4_nir_lower_txf_ms_impl(struct vc4_compile *c, nir_function_impl *impl)
+{
+        nir_foreach_block(impl, vc4_nir_lower_txf_ms_block, c);
+
+        /* Instructions are swapped in place; no blocks are added or removed. */
+        nir_metadata_preserve(impl, (nir_metadata_dominance |
+                                     nir_metadata_block_index));
+        return true;
+}
+
+/* Pass entry point: lowers txf_ms in each overload of c->s with an impl. */
+void
+vc4_nir_lower_txf_ms(struct vc4_compile *c)
+{
+        nir_foreach_overload(c->s, overload)
+                if (overload->impl)
+                        vc4_nir_lower_txf_ms_impl(c, overload->impl);
+}
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index dda2d84b5b3..31968bb5db9 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -294,6 +294,76 @@ ntq_umul(struct vc4_compile *c, struct qreg src0, struct qreg src1)
                                         qir_uniform_ui(c, 24)));
 }
 
+/* Scales the upper 24 bits of @src to a float by a factor of 1/0xffffff. */
+static struct qreg
+ntq_scale_depth_texture(struct vc4_compile *c, struct qreg src)
+{
+        struct qreg z24 = qir_SHR(c, src, qir_uniform_ui(c, 8));
+        return qir_FMUL(c, qir_ITOF(c, z24), qir_uniform_f(c, 1.0f/0xffffff));
+}
+
+/**
+ * Emits a lowered TXF_MS: a direct read from an MSAA texture.
+ *
+ * The addressing math was already done in NIR, so the single coordinate
+ * source is an offset that we read like a UBO, then unpack and swizzle.
+ */
+static void
+ntq_emit_txf(struct vc4_compile *c, nir_tex_instr *instr)
+{
+        const unsigned unit = instr->sampler_index;
+        const uint32_t tile_width = 32, tile_height = 32;
+        const uint32_t tile_size = (tile_height * tile_width *
+                                    VC4_MAX_SAMPLES * sizeof(uint32_t));
+
+        /* Byte size of the texture once padded out to whole tiles. */
+        uint32_t w_tiles = (align(c->key->tex[unit].msaa_width, tile_width) /
+                            tile_width);
+        uint32_t h_tiles = (align(c->key->tex[unit].msaa_height, tile_height) /
+                            tile_height);
+        uint32_t size = w_tiles * h_tiles * tile_size;
+
+        assert(instr->num_srcs == 1);
+        assert(instr->src[0].src_type == nir_tex_src_coord);
+        struct qreg addr = ntq_get_src(c, instr->src[0].src, 0);
+
+        /* Clamp to [0, size - 4], as required by kernel validation. */
+        addr = qir_MAX(c, addr, qir_uniform_ui(c, 0));
+        addr = qir_MIN(c, addr, qir_uniform_ui(c, size - 4));
+
+        qir_TEX_DIRECT(c, addr, qir_uniform(c, QUNIFORM_TEXTURE_MSAA_ADDR, unit));
+        struct qreg tex = qir_TEX_RESULT(c);
+        c->num_texture_samples++;
+
+        struct qreg texture_output[4];
+        enum pipe_format format = c->key->tex[unit].format;
+        if (!util_format_is_depth_or_stencil(format)) {
+                /* Color: unpack the four channels, then order per format. */
+                struct qreg channels[4];
+                for (int i = 0; i < 4; i++)
+                        channels[i] = qir_UNPACK_8_F(c, tex, i);
+
+                const uint8_t *format_swiz = vc4_get_format_swizzle(format);
+                for (int i = 0; i < 4; i++) {
+                        texture_output[i] =
+                                get_swizzled_channel(c, channels,
+                                                     format_swiz[i]);
+                }
+        } else {
+                /* Depth: all four channels carry the same scaled value. */
+                struct qreg scaled = ntq_scale_depth_texture(c, tex);
+                for (int i = 0; i < 4; i++)
+                        texture_output[i] = scaled;
+        }
+
+        /* Finally apply the swizzle configured for this texture unit. */
+        struct qreg *dest = ntq_get_dest(c, &instr->dest);
+        for (int i = 0; i < 4; i++) {
+                dest[i] = get_swizzled_channel(c, texture_output,
+                                               c->key->tex[unit].swizzle[i]);
+        }
+}
+
 static void
 ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
 {
@@ -301,6 +371,11 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
         bool is_txb = false, is_txl = false, has_proj = false;
         unsigned unit = instr->sampler_index;
 
+        if (instr->op == nir_texop_txf) {
+                ntq_emit_txf(c, instr);
+                return;
+        }
+
         for (unsigned i = 0; i < instr->num_srcs; i++) {
                 switch (instr->src[i].src_type) {
                 case nir_tex_src_coord:
@@ -396,11 +471,7 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
 
         struct qreg unpacked[4];
         if (util_format_is_depth_or_stencil(format)) {
-                struct qreg depthf = qir_ITOF(c, qir_SHR(c, tex,
-                                                         qir_uniform_ui(c, 8)));
-                struct qreg normalized = qir_FMUL(c, depthf,
-                                                  qir_uniform_f(c, 1.0f/0xffffff));
-
+                struct qreg normalized = ntq_scale_depth_texture(c, tex);
                 struct qreg depth_output;
 
                 struct qreg one = qir_uniform_f(c, 1.0f);
@@ -1712,6 +1783,7 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,
                 nir_lower_clip_vs(c->s, c->key->ucp_enables);
 
         vc4_nir_lower_io(c);
+        vc4_nir_lower_txf_ms(c);
         nir_lower_idiv(c->s);
         nir_lower_load_const_to_scalar(c->s);
 
@@ -1947,12 +2019,19 @@ vc4_setup_shared_key(struct vc4_context *vc4, struct vc4_key *key,
                 struct pipe_sampler_state *sampler_state =
                         texstate->samplers[i];
 
-                if (sampler) {
-                        key->tex[i].format = sampler->format;
-                        key->tex[i].swizzle[0] = sampler->swizzle_r;
-                        key->tex[i].swizzle[1] = sampler->swizzle_g;
-                        key->tex[i].swizzle[2] = sampler->swizzle_b;
-                        key->tex[i].swizzle[3] = sampler->swizzle_a;
+                if (!sampler)
+                        continue;
+
+                key->tex[i].format = sampler->format;
+                key->tex[i].swizzle[0] = sampler->swizzle_r;
+                key->tex[i].swizzle[1] = sampler->swizzle_g;
+                key->tex[i].swizzle[2] = sampler->swizzle_b;
+                key->tex[i].swizzle[3] = sampler->swizzle_a;
+
+                if (sampler->texture->nr_samples) {
+                        key->tex[i].msaa_width = sampler->texture->width0;
+                        key->tex[i].msaa_height = sampler->texture->height0;
+                } else if (sampler){
                         key->tex[i].compare_mode = sampler_state->compare_mode;
                         key->tex[i].compare_func = sampler_state->compare_func;
                         key->tex[i].wrap_s = sampler_state->wrap_s;
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index 4e406d60d72..d53095ed222 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -233,6 +233,8 @@ enum quniform_contents {
         /** A reference to a texture config parameter 2 cubemap stride uniform */
         QUNIFORM_TEXTURE_CONFIG_P2,
 
+        QUNIFORM_TEXTURE_MSAA_ADDR,
+
         QUNIFORM_UBO_ADDR,
 
         QUNIFORM_TEXRECT_SCALE_X,
@@ -287,11 +289,18 @@ struct vc4_key {
         struct vc4_uncompiled_shader *shader_state;
         struct {
                 enum pipe_format format;
-                unsigned compare_mode:1;
-                unsigned compare_func:3;
-                unsigned wrap_s:3;
-                unsigned wrap_t:3;
                 uint8_t swizzle[4];
+                union {
+                        struct {
+                                unsigned compare_mode:1;
+                                unsigned compare_func:3;
+                                unsigned wrap_s:3;
+                                unsigned wrap_t:3;
+                        };
+                        struct {
+                                uint16_t msaa_width, msaa_height;
+                        };
+                };
         } tex[VC4_MAX_TEXTURE_SAMPLERS];
         uint8_t ucp_enables;
 };
@@ -490,6 +499,7 @@ nir_ssa_def *vc4_nir_get_state_uniform(struct nir_builder *b,
                                        enum quniform_contents contents);
 nir_ssa_def *vc4_nir_get_swizzled_channel(struct nir_builder *b,
                                           nir_ssa_def **srcs, int swiz);
+void vc4_nir_lower_txf_ms(struct vc4_compile *c);
 void qir_lower_uniforms(struct vc4_compile *c);
 
 void qpu_schedule_instructions(struct vc4_compile *c);
diff --git a/src/gallium/drivers/vc4/vc4_uniforms.c b/src/gallium/drivers/vc4/vc4_uniforms.c
index 5dfdd73f7bd..262531f1bd7 100644
--- a/src/gallium/drivers/vc4/vc4_uniforms.c
+++ b/src/gallium/drivers/vc4/vc4_uniforms.c
@@ -71,6 +71,18 @@ write_texture_p2(struct vc4_context *vc4,
                VC4_SET_FIELD((data >> 16) & 1, VC4_TEX_P2_BSLOD));
 }
 
+/* Emits a cl_aligned_reloc() for the BO backing the texture bound to @unit. */
+static void
+write_texture_msaa_addr(struct vc4_context *vc4,
+                        struct vc4_cl_out **uniforms,
+                        struct vc4_texture_stateobj *texstate, uint32_t unit)
+{
+        struct pipe_sampler_view *view = texstate->textures[unit];
+
+        cl_aligned_reloc(vc4, &vc4->uniforms, uniforms,
+                         vc4_resource(view->texture)->bo, 0);
+}
+
 
 #define SWIZ(x,y,z,w) {          \
         UTIL_FORMAT_SWIZZLE_##x, \
@@ -244,6 +256,11 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader,
                         cl_aligned_reloc(vc4, &vc4->uniforms, &uniforms, ubo, 0);
                         break;
 
+                case QUNIFORM_TEXTURE_MSAA_ADDR:
+                        write_texture_msaa_addr(vc4, &uniforms,
+                                                texstate, uinfo->data[i]);
+                        break;
+
                 case QUNIFORM_TEXTURE_BORDER_COLOR:
                         write_texture_border_color(vc4, &uniforms,
                                                    texstate, uinfo->data[i]);
@@ -349,6 +366,7 @@ vc4_set_shader_uniform_dirty_flags(struct vc4_compiled_shader *shader)
                 case QUNIFORM_TEXTURE_CONFIG_P1:
                 case QUNIFORM_TEXTURE_CONFIG_P2:
                 case QUNIFORM_TEXTURE_BORDER_COLOR:
+                case QUNIFORM_TEXTURE_MSAA_ADDR:
                 case QUNIFORM_TEXRECT_SCALE_X:
                 case QUNIFORM_TEXRECT_SCALE_Y:
                         dirty |= VC4_DIRTY_TEXSTATE;