freedreno/ir3: Add intrinsics that map to LDLW/STLW

These intrinsics will let us do all the offset calculations in nir, which
is nicer to work with and lets nir_opt_algebraic eat it all up.

Signed-off-by: Kristian H. Kristensen <[email protected]>
author: Kristian H. Kristensen <[email protected]> 2019-10-10 15:15:37 -0700
committer: Kristian H. Kristensen <[email protected]> 2019-10-17 13:43:53 -0700
commit: 0324706764b9d0a1a6a6c1af13fc7cfb01500d80 (patch)
tree: 81b4ecc6c121604486e17f4342a37a25d7b26739 /src
parent: 436d125adfab09d2c31835fc9dca49c308039c06 (diff)
2 files changed, 83 insertions, 0 deletions
diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index 40430b32849..ae62a85d39b 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -771,6 +771,14 @@ intrinsic("ssbo_atomic_xor_ir3",        src_comp=[1, 1, 1, 1],    dest_comp=1)
 intrinsic("ssbo_atomic_exchange_ir3",   src_comp=[1, 1, 1, 1],    dest_comp=1)
 intrinsic("ssbo_atomic_comp_swap_ir3",  src_comp=[1, 1, 1, 1, 1], dest_comp=1)
 
+# IR3-specific load/store intrinsics. These access a buffer used to pass data
+# between geometry stages - perhaps it's explicit access to the vertex cache.
+
+# src[] = { value, offset }. ir3 emits an STLW per run of enabled WRMASK channels.
+store("shared_ir3", 2, [BASE, WRMASK, ALIGN_MUL, ALIGN_OFFSET])
+# src[] = { offset }. ir3 emits a single LDLW, passing BASE as an immediate.
+load("shared_ir3", 1, [BASE, ALIGN_MUL, ALIGN_OFFSET], [CAN_ELIMINATE])
+
 # Intrinsics used by the Midgard/Bifrost blend pipeline. These are defined
 # within a blend shader to read/write the raw value from the tile buffer,
 # without applying any format conversion in the process. If the shader needs
diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c
index 118e7781c6a..2cc720652fc 100644
--- a/src/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/freedreno/ir3/ir3_compiler_nir.c
@@ -843,6 +843,75 @@ emit_intrinsic_store_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 	}
 }
 
+/* Lower load_shared_ir3 to LDLW. src[] = { offset }. const_index[] = { base } */
+static void
+emit_intrinsic_load_shared_ir3(struct ir3_context *ctx, nir_intrinsic_instr *intr,
+		struct ir3_instruction **dst)
+{
+	struct ir3_block *block = ctx->block;
+	struct ir3_instruction *off = ir3_get_src(ctx, &intr->src[0])[0];
+	const unsigned base = nir_intrinsic_base(intr);
+	const unsigned ncomp = intr->num_components;
+	struct ir3_instruction *ldlw;
+
+	ldlw = ir3_LDLW(block, off, 0,
+			create_immed(block, ncomp), 0,
+			create_immed(block, base), 0);
+	ldlw->regs[0]->wrmask = MASK(ncomp);
+	ldlw->cat6.type = utype_dst(intr->dest);
+
+	/* Classed as a shared read; it conflicts with shared writes. */
+	ldlw->barrier_conflict = IR3_BARRIER_SHARED_W;
+	ldlw->barrier_class = IR3_BARRIER_SHARED_R;
+
+	/* Hand the result to ir3_split_dest to fill dst for ncomp components. */
+	ir3_split_dest(block, dst, ldlw, 0, ncomp);
+}
+
+/* Emit STLW store(s). src[] = { value, offset }. const_index[] = { base, write_mask } */
+static void
+emit_intrinsic_store_shared_ir3(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+	struct ir3_block *b = ctx->block;
+	struct ir3_instruction *store, *offset;
+	struct ir3_instruction * const *value;
+	unsigned base, wrmask;
+
+	value  = ir3_get_src(ctx, &intr->src[0]);
+	offset = ir3_get_src(ctx, &intr->src[1])[0];
+
+	base   = nir_intrinsic_base(intr);
+	wrmask = nir_intrinsic_write_mask(intr);
+
+	/* Combine groups of consecutive enabled channels in one write
+	 * message. We use ffs to find the first enabled channel and then ffs on
+	 * the bit-inverse, down-shifted writemask to determine the length of
+	 * the block of enabled bits.
+	 *
+	 * (trick stolen from i965's fs_visitor::nir_emit_cs_intrinsic())
+	 */
+	while (wrmask) {
+		unsigned first_component = ffs(wrmask) - 1;
+		unsigned length = ffs(~(wrmask >> first_component)) - 1;
+		/* One STLW per run; its data operand collects the run's channels. */
+		store = ir3_STLW(b, offset, 0,
+			ir3_create_collect(ctx, &value[first_component], length), 0,
+			create_immed(b, length), 0);
+		/* The run's first channel index is added to BASE as the dst offset. */
+		store->cat6.dst_offset = first_component + base;
+		store->cat6.type = utype_src(intr->src[0]);
+		store->barrier_class = IR3_BARRIER_SHARED_W;
+		store->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
+		/* Record in the block's keeps array (presumably so it is never DCE'd). */
+		array_insert(b, b->keeps, store);
+
+		/* Clear only the bits we just wrote (no 4-channel assumption, so
+		 * wider write masks keep their upper channels), then loop again.
+		 */
+		wrmask &= ~(((1u << length) - 1) << first_component);
+	}
+}
+
 /*
  * CS shared variable atomic intrinsics
  *
@@ -1582,6 +1651,12 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 
 		break;
 	}
+	case nir_intrinsic_load_shared_ir3:
+		emit_intrinsic_load_shared_ir3(ctx, intr, dst);
+		break;
+	case nir_intrinsic_store_shared_ir3:
+		emit_intrinsic_store_shared_ir3(ctx, intr);
+		break;
 	default:
 		ir3_context_error(ctx, "Unhandled intrinsic type: %s\n",
 				nir_intrinsic_infos[intr->intrinsic].name);
author	Kristian H. Kristensen <[email protected]>	2019-10-10 15:15:37 -0700
committer	Kristian H. Kristensen <[email protected]>	2019-10-17 13:43:53 -0700
commit	0324706764b9d0a1a6a6c1af13fc7cfb01500d80 (patch)
tree	81b4ecc6c121604486e17f4342a37a25d7b26739 /src
parent	436d125adfab09d2c31835fc9dca49c308039c06 (diff)