aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRob Clark <[email protected]>2017-11-09 10:57:55 -0500
committerRob Clark <[email protected]>2017-11-12 12:28:59 -0500
commit9edfc369c04d131b664f6c94a0e249a81a5c0da5 (patch)
treef691703325622560e3ed26a5847fdaf7cde6686f
parenteaae81058cdd0ed103c55be7a1722546d63c86da (diff)
freedreno/ir3: image support
Signed-off-by: Rob Clark <[email protected]>
-rw-r--r--src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c273
-rw-r--r--src/gallium/drivers/freedreno/ir3/ir3_legalize.c3
-rw-r--r--src/gallium/drivers/freedreno/ir3/ir3_nir.c9
-rw-r--r--src/gallium/drivers/freedreno/ir3/ir3_shader.c37
-rw-r--r--src/gallium/drivers/freedreno/ir3/ir3_shader.h15
5 files changed, 337 insertions, 0 deletions
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 640805a4f68..bd3e0d0cd4a 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -254,6 +254,12 @@ compile_init(struct ir3_compiler *compiler,
constoff += align(cnt, 4) / 4;
}
+ if (so->const_layout.image_dims.count > 0) {
+ unsigned cnt = so->const_layout.image_dims.count;
+ so->constbase.image_dims = constoff;
+ constoff += align(cnt, 4) / 4;
+ }
+
unsigned num_driver_params = 0;
if (so->type == SHADER_VERTEX) {
num_driver_params = IR3_DP_VS_COUNT;
@@ -1575,6 +1581,254 @@ emit_intrinsic_atomic_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
return atomic;
}
+/* Images get mapped into SSBO/image state (for store/atomic) and texture
+ * state block (for load). To simplify things, invert the image id and
+ * map it from end of state block, ie. image 0 becomes num-1, image 1
+ * becomes num-2, etc. This potentially avoids needing to re-emit texture
+ * state when switching shaders.
+ *
+ * TODO is max # of samplers and SSBOs the same. This shouldn't be hard-
+ * coded. Also, since all the gl shader stages (ie. everything but CS)
+ * share the same SSBO/image state block, this might require some more
+ * logic if we supported images in anything other than FS..
+ */
+static unsigned
+get_image_slot(struct ir3_context *ctx, const nir_variable *var)
+{
+ /* TODO figure out real limit per generation, and don't hardcode: */
+ const unsigned max_samplers = 16;
+ return max_samplers - var->data.driver_location - 1;
+}
+
+static unsigned
+get_image_coords(const nir_variable *var)
+{
+ switch (glsl_get_sampler_dim(glsl_without_array(var->type))) {
+ case GLSL_SAMPLER_DIM_1D:
+ case GLSL_SAMPLER_DIM_BUF:
+ return 1;
+ break;
+ case GLSL_SAMPLER_DIM_2D:
+ case GLSL_SAMPLER_DIM_RECT:
+ case GLSL_SAMPLER_DIM_EXTERNAL:
+ case GLSL_SAMPLER_DIM_MS:
+ return 2;
+ case GLSL_SAMPLER_DIM_3D:
+ case GLSL_SAMPLER_DIM_CUBE:
+ return 3;
+ default:
+ unreachable("bad sampler dim");
+ return 0;
+ }
+}
+
+static type_t
+get_image_type(const nir_variable *var)
+{
+ switch (glsl_get_sampler_result_type(glsl_without_array(var->type))) {
+ case GLSL_TYPE_UINT:
+ return TYPE_U32;
+ case GLSL_TYPE_INT:
+ return TYPE_S32;
+ case GLSL_TYPE_FLOAT:
+ return TYPE_F32;
+ default:
+ unreachable("bad sampler type.");
+ return 0;
+ }
+}
+
+static struct ir3_instruction *
+get_image_offset(struct ir3_context *ctx, const nir_variable *var,
+ struct ir3_instruction * const *coords, bool byteoff)
+{
+ struct ir3_block *b = ctx->block;
+ struct ir3_instruction *offset;
+ unsigned ncoords = get_image_coords(var);
+
+ /* to calculate the byte offset (yes, uggg) we need (up to) three
+ * const values to know the bytes per pixel, and y and z stride:
+ */
+ unsigned cb = regid(ctx->so->constbase.image_dims, 0) +
+ ctx->so->const_layout.image_dims.off[var->data.driver_location];
+
+ debug_assert(ctx->so->const_layout.image_dims.mask &
+ (1 << var->data.driver_location));
+
+ /* offset = coords.x * bytes_per_pixel: */
+ offset = ir3_MUL_S(b, coords[0], 0, create_uniform(ctx, cb + 0), 0);
+ if (ncoords > 1) {
+ /* offset += coords.y * y_pitch: */
+ offset = ir3_MAD_S24(b, create_uniform(ctx, cb + 1), 0,
+ coords[1], 0, offset, 0);
+ }
+ if (ncoords > 2) {
+ /* offset += coords.z * z_pitch: */
+ offset = ir3_MAD_S24(b, create_uniform(ctx, cb + 2), 0,
+ coords[2], 0, offset, 0);
+ }
+
+ if (!byteoff) {
+ /* Some cases, like atomics, seem to use dword offset instead
+ * of byte offsets.. blob just puts an extra shr.b in there
+ * in those cases:
+ */
+ offset = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
+ }
+
+ return create_collect(b, (struct ir3_instruction*[]){
+ offset,
+ create_immed(b, 0),
+ }, 2);
+}
+
+/* src[] = { coord, sample_index }. const_index[] = {} */
+static void
+emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr,
+ struct ir3_instruction **dst)
+{
+ struct ir3_block *b = ctx->block;
+ const nir_variable *var = intr->variables[0]->var;
+ struct ir3_instruction *sam;
+ struct ir3_instruction * const *coords = get_src(ctx, &intr->src[0]);
+ unsigned ncoords = get_image_coords(var);
+ unsigned tex_idx = get_image_slot(ctx, var);
+ type_t type = get_image_type(var);
+ unsigned flags = 0;
+
+ if (ncoords == 3)
+ flags |= IR3_INSTR_3D;
+
+ sam = ir3_SAM(b, OPC_ISAM, type, TGSI_WRITEMASK_XYZW, flags,
+ tex_idx, tex_idx, create_collect(b, coords, ncoords), NULL);
+
+ split_dest(b, dst, sam, 0, 4);
+}
+
+/* src[] = { coord, sample_index, value }. const_index[] = {} */
+static void
+emit_intrinsic_store_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+ struct ir3_block *b = ctx->block;
+ const nir_variable *var = intr->variables[0]->var;
+ struct ir3_instruction *stib, *offset;
+ struct ir3_instruction * const *value = get_src(ctx, &intr->src[2]);
+ struct ir3_instruction * const *coords = get_src(ctx, &intr->src[0]);
+ unsigned ncoords = get_image_coords(var);
+ unsigned tex_idx = get_image_slot(ctx, var);
+
+ /* src0 is value
+ * src1 is coords
+ * src2 is 64b byte offset
+ */
+
+ offset = get_image_offset(ctx, var, coords, true);
+
+ /* NOTE: stib seems to take byte offset, but stgb.typed can be used
+ * too and takes a dword offset.. not quite sure yet why blob uses
+ * one over the other in various cases.
+ */
+
+ stib = ir3_STIB(b, create_immed(b, tex_idx), 0,
+ create_collect(b, value, 4), 0,
+ create_collect(b, coords, ncoords), 0,
+ offset, 0);
+ stib->cat6.iim_val = 4;
+ stib->cat6.d = ncoords;
+ stib->cat6.type = get_image_type(var);
+ stib->cat6.typed = true;
+ mark_write(ctx, stib);
+
+ array_insert(b, b->keeps, stib);
+}
+
+static void
+emit_intrinsic_image_size(struct ir3_context *ctx, nir_intrinsic_instr *intr,
+ struct ir3_instruction **dst)
+{
+ struct ir3_block *b = ctx->block;
+ const nir_variable *var = intr->variables[0]->var;
+ unsigned ncoords = get_image_coords(var);
+ unsigned tex_idx = get_image_slot(ctx, var);
+ struct ir3_instruction *sam, *lod;
+ unsigned flags = 0;
+
+ if (ncoords == 3)
+ flags = IR3_INSTR_3D;
+
+ lod = create_immed(b, 0);
+ sam = ir3_SAM(b, OPC_GETSIZE, TYPE_U32, TGSI_WRITEMASK_XYZW, flags,
+ tex_idx, tex_idx, lod, NULL);
+
+ split_dest(b, dst, sam, 0, ncoords);
+}
+
+/* src[] = { coord, sample_index, value, compare }. const_index[] = {} */
+static struct ir3_instruction *
+emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+ struct ir3_block *b = ctx->block;
+ const nir_variable *var = intr->variables[0]->var;
+ struct ir3_instruction *atomic, *image, *src0, *src1, *src2;
+ struct ir3_instruction * const *coords = get_src(ctx, &intr->src[0]);
+ unsigned ncoords = get_image_coords(var);
+
+ image = create_immed(b, get_image_slot(ctx, var));
+
+ /* src0 is value (or uvec2(value, compare))
+ * src1 is coords
+ * src2 is 64b byte offset
+ */
+ src0 = get_src(ctx, &intr->src[2])[0];
+ src1 = create_collect(b, coords, ncoords);
+ src2 = get_image_offset(ctx, var, coords, false);
+
+ switch (intr->intrinsic) {
+ case nir_intrinsic_image_atomic_add:
+ atomic = ir3_ATOMIC_ADD_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+ break;
+ case nir_intrinsic_image_atomic_min:
+ atomic = ir3_ATOMIC_MIN_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+ break;
+ case nir_intrinsic_image_atomic_max:
+ atomic = ir3_ATOMIC_MAX_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+ break;
+ case nir_intrinsic_image_atomic_and:
+ atomic = ir3_ATOMIC_AND_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+ break;
+ case nir_intrinsic_image_atomic_or:
+ atomic = ir3_ATOMIC_OR_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+ break;
+ case nir_intrinsic_image_atomic_xor:
+ atomic = ir3_ATOMIC_XOR_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+ break;
+ case nir_intrinsic_image_atomic_exchange:
+ atomic = ir3_ATOMIC_XCHG_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+ break;
+ case nir_intrinsic_image_atomic_comp_swap:
+ /* for cmpxchg, src0 is [ui]vec2(data, compare): */
+ src0 = create_collect(b, (struct ir3_instruction*[]){
+ src0,
+ get_src(ctx, &intr->src[3])[0],
+ }, 2);
+ atomic = ir3_ATOMIC_CMPXCHG_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+ break;
+ default:
+ unreachable("boo");
+ }
+
+ atomic->cat6.iim_val = 1;
+ atomic->cat6.d = ncoords;
+ atomic->cat6.type = get_image_type(var);
+ atomic->cat6.typed = true;
+ mark_write(ctx, atomic);
+
+ /* even if nothing consume the result, we can't DCE the instruction: */
+ array_insert(b, b->keeps, atomic);
+
+ return atomic;
+}
+
static void
emit_intrinsic_barrier(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
@@ -1747,6 +2001,25 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
case nir_intrinsic_shared_atomic_comp_swap:
dst[0] = emit_intrinsic_atomic_shared(ctx, intr);
break;
+ case nir_intrinsic_image_load:
+ emit_intrinsic_load_image(ctx, intr, dst);
+ break;
+ case nir_intrinsic_image_store:
+ emit_intrinsic_store_image(ctx, intr);
+ break;
+ case nir_intrinsic_image_size:
+ emit_intrinsic_image_size(ctx, intr, dst);
+ break;
+ case nir_intrinsic_image_atomic_add:
+ case nir_intrinsic_image_atomic_min:
+ case nir_intrinsic_image_atomic_max:
+ case nir_intrinsic_image_atomic_and:
+ case nir_intrinsic_image_atomic_or:
+ case nir_intrinsic_image_atomic_xor:
+ case nir_intrinsic_image_atomic_exchange:
+ case nir_intrinsic_image_atomic_comp_swap:
+ dst[0] = emit_intrinsic_atomic_image(ctx, intr);
+ break;
case nir_intrinsic_barrier:
case nir_intrinsic_memory_barrier:
case nir_intrinsic_group_memory_barrier:
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
index a206837ef84..3f12b68ada1 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
@@ -187,6 +187,9 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
*/
ctx->has_samp = true;
regmask_set(&needs_sy, n->regs[0]);
+ } else if (n->opc == OPC_RESINFO) {
+ regmask_set(&needs_ss, n->regs[0]);
+ ir3_NOP(block)->flags |= IR3_INSTR_SS;
} else if (is_load(n)) {
/* seems like ldlv needs (ss) bit instead?? which is odd but
* makes a bunch of flat-varying tests start working on a4xx.
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_nir.c
index 19d05b462e5..7dd24e5f4ee 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_nir.c
@@ -237,6 +237,15 @@ ir3_nir_scan_driver_consts(nir_shader *shader,
layout->ssbo_size.count;
layout->ssbo_size.count += 1; /* one const per */
break;
+ case nir_intrinsic_image_store:
+ idx = intr->variables[0]->var->data.driver_location;
+ if (layout->image_dims.mask & (1 << idx))
+ break;
+ layout->image_dims.mask |= (1 << idx);
+ layout->ssbo_size.off[idx] =
+ layout->image_dims.count;
+ layout->image_dims.count += 3; /* three const per */
+ break;
default:
break;
}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
index 26f291de894..61a336ed7dd 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
@@ -628,6 +628,38 @@ emit_ssbo_sizes(struct fd_context *ctx, const struct ir3_shader_variant *v,
}
static void
+emit_image_dims(struct fd_context *ctx, const struct ir3_shader_variant *v,
+ struct fd_ringbuffer *ring, struct fd_shaderimg_stateobj *si)
+{
+ uint32_t offset = v->constbase.image_dims;
+ if (v->constlen > offset) {
+ uint32_t dims[align(v->const_layout.image_dims.count, 4)];
+ unsigned mask = v->const_layout.image_dims.mask;
+
+ while (mask) {
+ struct pipe_image_view *img;
+ struct fd_resource *rsc;
+ unsigned index = u_bit_scan(&mask);
+ unsigned off = v->const_layout.image_dims.off[index];
+
+ img = &si->si[index];
+ rsc = fd_resource(img->resource);
+
+ dims[off + 0] = rsc->cpp;
+ if (img->resource->target != PIPE_BUFFER) {
+ unsigned lvl = img->u.tex.level;
+ dims[off + 1] = rsc->slices[lvl].pitch * rsc->cpp;
+ dims[off + 2] = rsc->slices[lvl].size0;
+ }
+ }
+
+ fd_wfi(ctx->batch, ring);
+ ctx->emit_const(ring, v->type, offset * 4,
+ 0, ARRAY_SIZE(dims), dims, NULL);
+ }
+}
+
+static void
emit_immediates(struct fd_context *ctx, const struct ir3_shader_variant *v,
struct fd_ringbuffer *ring)
{
@@ -752,6 +784,11 @@ emit_common_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *rin
struct fd_shaderbuf_stateobj *sb = &ctx->shaderbuf[t];
emit_ssbo_sizes(ctx, v, ring, sb);
}
+
+ if (dirty & (FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_IMAGE)) {
+ struct fd_shaderimg_stateobj *si = &ctx->shaderimg[t];
+ emit_image_dims(ctx, v, ring, si);
+ }
}
void
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
index dd68e69d16c..3886cce5571 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
@@ -63,6 +63,9 @@ enum ir3_driver_param {
*
* + SSBO sizes: only needed if shader has a get_buffer_size intrinsic
* for a given SSBO
+ *
+ * + Image dimensions: needed to calculate pixel offset, but only for
+ * images that have a image_store intrinsic
*/
struct ir3_driver_const_layout {
struct {
@@ -74,6 +77,17 @@ struct ir3_driver_const_layout {
*/
uint32_t off[PIPE_MAX_SHADER_BUFFERS];
} ssbo_size;
+
+ struct {
+ uint32_t mask; /* bitmask of images that have image_store */
+ uint32_t count; /* number of consts allocated */
+ /* three const allocated per image which has image_store:
+ * + cpp (bytes per pixel)
+ * + pitch (y pitch)
+ * + array_pitch (z pitch)
+ */
+ uint32_t off[PIPE_MAX_SHADER_IMAGES];
+ } image_dims;
};
/* Configuration key used to identify a shader variant.. different
@@ -295,6 +309,7 @@ struct ir3_shader_variant {
unsigned ubo;
/* NOTE that a3xx might need a section for SSBO addresses too */
unsigned ssbo_sizes;
+ unsigned image_dims;
unsigned driver_param;
unsigned tfbo;
unsigned immediate;