summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c161
-rw-r--r--src/gallium/drivers/freedreno/ir3/ir3_cp.c9
-rw-r--r--src/gallium/drivers/freedreno/ir3/ir3_legalize.c9
3 files changed, 177 insertions, 2 deletions
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 0d642772f9e..640805a4f68 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -1432,6 +1432,149 @@ emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
return atomic;
}
+/* src[] = { offset }. const_index[] = { base } */
+static void
+emit_intrinsic_load_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr,
+ struct ir3_instruction **dst)
+{
+ struct ir3_block *b = ctx->block;
+ struct ir3_instruction *ldl, *offset;
+ unsigned base;
+
+ offset = get_src(ctx, &intr->src[0])[0];
+ base = intr->const_index[0];
+
+ ldl = ir3_LDL(b, offset, 0, create_immed(b, intr->num_components), 0);
+ ldl->cat6.src_offset = base;
+ ldl->cat6.type = TYPE_U32;
+ ldl->regs[0]->wrmask = MASK(intr->num_components);
+
+ mark_read(ctx, ldl);
+
+ split_dest(b, dst, ldl, 0, intr->num_components);
+}
+
+/* src[] = { value, offset }. const_index[] = { base, write_mask } */
+static void
+emit_intrinsic_store_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+ struct ir3_block *b = ctx->block;
+ struct ir3_instruction *stl, *offset;
+ struct ir3_instruction * const *value;
+ unsigned base, wrmask;
+
+ value = get_src(ctx, &intr->src[0]);
+ offset = get_src(ctx, &intr->src[1])[0];
+
+ base = intr->const_index[0];
+ wrmask = intr->const_index[1];
+
+ /* Combine groups of consecutive enabled channels in one write
+ * message. We use ffs to find the first enabled channel and then ffs on
+ * the bit-inverse, down-shifted writemask to determine the length of
+ * the block of enabled bits.
+ *
+ * (trick stolen from i965's fs_visitor::nir_emit_cs_intrinsic())
+ */
+ while (wrmask) {
+ unsigned first_component = ffs(wrmask) - 1;
+ unsigned length = ffs(~(wrmask >> first_component)) - 1;
+
+ stl = ir3_STL(b, offset, 0,
+ create_collect(b, &value[first_component], length), 0,
+ create_immed(b, length), 0);
+ stl->cat6.dst_offset = first_component + base;
+ stl->cat6.type = TYPE_U32;
+
+ mark_write(ctx, stl);
+ array_insert(b, b->keeps, stl);
+
+ /* Clear the bits in the writemask that we just wrote, then try
+ * again to see if more channels are left.
+ */
+ wrmask &= (15 << (first_component + length));
+ }
+}
+
+/*
+ * CS shared variable atomic intrinsics
+ *
+ * All of the shared variable atomic memory operations read a value from
+ * memory, compute a new value using one of the operations below, write the
+ * new value to memory, and return the original value read.
+ *
+ * All operations take 2 sources except CompSwap that takes 3. These
+ * sources represent:
+ *
+ * 0: The offset into the shared variable storage region that the atomic
+ * operation will operate on.
+ * 1: The data parameter to the atomic function (i.e. the value to add
+ * in shared_atomic_add, etc).
+ * 2: For CompSwap only: the second data parameter.
+ */
+static struct ir3_instruction *
+emit_intrinsic_atomic_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+ struct ir3_block *b = ctx->block;
+ struct ir3_instruction *atomic, *src0, *src1;
+ type_t type = TYPE_U32;
+
+ src0 = get_src(ctx, &intr->src[0])[0]; /* offset */
+ src1 = get_src(ctx, &intr->src[1])[0]; /* value */
+
+ switch (intr->intrinsic) {
+ case nir_intrinsic_shared_atomic_add:
+ atomic = ir3_ATOMIC_ADD(b, src0, 0, src1, 0);
+ break;
+ case nir_intrinsic_shared_atomic_imin:
+ atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
+ type = TYPE_S32;
+ break;
+ case nir_intrinsic_shared_atomic_umin:
+ atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
+ break;
+ case nir_intrinsic_shared_atomic_imax:
+ atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
+ type = TYPE_S32;
+ break;
+ case nir_intrinsic_shared_atomic_umax:
+ atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
+ break;
+ case nir_intrinsic_shared_atomic_and:
+ atomic = ir3_ATOMIC_AND(b, src0, 0, src1, 0);
+ break;
+ case nir_intrinsic_shared_atomic_or:
+ atomic = ir3_ATOMIC_OR(b, src0, 0, src1, 0);
+ break;
+ case nir_intrinsic_shared_atomic_xor:
+ atomic = ir3_ATOMIC_XOR(b, src0, 0, src1, 0);
+ break;
+ case nir_intrinsic_shared_atomic_exchange:
+ atomic = ir3_ATOMIC_XCHG(b, src0, 0, src1, 0);
+ break;
+ case nir_intrinsic_shared_atomic_comp_swap:
+ /* for cmpxchg, src1 is [ui]vec2(data, compare): */
+ src1 = create_collect(b, (struct ir3_instruction*[]){
+ get_src(ctx, &intr->src[2])[0],
+ src1,
+ }, 2);
+ atomic = ir3_ATOMIC_CMPXCHG(b, src0, 0, src1, 0);
+ break;
+ default:
+ unreachable("boo");
+ }
+
+ atomic->cat6.iim_val = 1;
+ atomic->cat6.d = 1;
+ atomic->cat6.type = type;
+ mark_write(ctx, atomic);
+
+ /* even if nothing consume the result, we can't DCE the instruction: */
+ array_insert(b, b->keeps, atomic);
+
+ return atomic;
+}
+
static void
emit_intrinsic_barrier(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
@@ -1586,6 +1729,24 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
case nir_intrinsic_ssbo_atomic_comp_swap:
dst[0] = emit_intrinsic_atomic_ssbo(ctx, intr);
break;
+ case nir_intrinsic_load_shared:
+ emit_intrinsic_load_shared(ctx, intr, dst);
+ break;
+ case nir_intrinsic_store_shared:
+ emit_intrinsic_store_shared(ctx, intr);
+ break;
+ case nir_intrinsic_shared_atomic_add:
+ case nir_intrinsic_shared_atomic_imin:
+ case nir_intrinsic_shared_atomic_umin:
+ case nir_intrinsic_shared_atomic_imax:
+ case nir_intrinsic_shared_atomic_umax:
+ case nir_intrinsic_shared_atomic_and:
+ case nir_intrinsic_shared_atomic_or:
+ case nir_intrinsic_shared_atomic_xor:
+ case nir_intrinsic_shared_atomic_exchange:
+ case nir_intrinsic_shared_atomic_comp_swap:
+ dst[0] = emit_intrinsic_atomic_shared(ctx, intr);
+ break;
case nir_intrinsic_barrier:
case nir_intrinsic_memory_barrier:
case nir_intrinsic_group_memory_barrier:
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
index 8c907eb5a53..61b4b201215 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
@@ -194,11 +194,20 @@ static bool valid_flags(struct ir3_instruction *instr, unsigned n,
if (is_store(instr) && (n == 1))
return false;
+ if ((instr->opc == OPC_LDL) && (n != 1))
+ return false;
+
+ if ((instr->opc == OPC_STL) && (n != 2))
+ return false;
+
/* disallow CP into anything but the SSBO slot argument for
* atomics:
*/
if (is_atomic(instr->opc) && (n != 0))
return false;
+
+ if (is_atomic(instr->opc) && !(instr->flags & IR3_INSTR_G))
+ return false;
}
break;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
index d6850eb12a0..a206837ef84 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
@@ -191,13 +191,18 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
/* seems like ldlv needs (ss) bit instead?? which is odd but
* makes a bunch of flat-varying tests start working on a4xx.
*/
- if (n->opc == OPC_LDLV)
+ if ((n->opc == OPC_LDLV) || (n->opc == OPC_LDL))
regmask_set(&needs_ss, n->regs[0]);
else
regmask_set(&needs_sy, n->regs[0]);
+ } else if (is_atomic(n->opc)) {
+ if (n->flags & IR3_INSTR_G)
+ regmask_set(&needs_sy, n->regs[0]);
+ else
+ regmask_set(&needs_ss, n->regs[0]);
}
- if ((n->opc == OPC_LDGB) || (n->opc == OPC_STGB) || is_atomic(n->opc))
+ if (is_ssbo(n->opc) || (is_atomic(n->opc) && (n->flags & IR3_INSTR_G)))
ctx->has_ssbo = true;
/* both tex/sfu appear to not always immediately consume