Diffstat (limited to 'src/gallium/drivers')
-rw-r--r--   src/gallium/drivers/freedreno/ir3/ir3.c               |   6
-rw-r--r--   src/gallium/drivers/freedreno/ir3/ir3.h               |  74
-rw-r--r--   src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c  | 110
-rw-r--r--   src/gallium/drivers/freedreno/ir3/ir3_depth.c         |   4
-rw-r--r--   src/gallium/drivers/freedreno/ir3/ir3_sched.c         |  91
5 files changed, 227 insertions, 58 deletions
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c
index 6db0a2a20cd..01a7bbc7dc6 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3.c
@@ -817,6 +817,12 @@ struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr)
 	return new_instr;
 }
 
+/* Add a false dependency to instruction, to ensure it is scheduled first: */
+void ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep)
+{
+	array_insert(instr, instr->deps, dep);
+}
+
 struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
 		int num, int flags)
 {
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
index 90f8e3c44d3..6ef0683ab00 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -326,6 +326,40 @@ struct ir3_instruction {
 	 */
 	struct ir3_instruction *address;
 
+	/* Tracking for additional dependent instructions.  Used to handle
+	 * barriers, WAR hazards for arrays/SSBOs/etc.
+	 */
+	DECLARE_ARRAY(struct ir3_instruction *, deps);
+
+	/*
+	 * From PoV of instruction scheduling, not execution (ie. ignores global/
+	 * local distinction):
+	 *                               shared  image  atomic  SSBO  everything
+	 *   barrier()/               -   R/W     R/W    R/W     R/W       X
+	 *     groupMemoryBarrier()
+	 *   memoryBarrier()          -           R/W            R/W
+	 *                               (but only images declared coherent?)
+	 *   memoryBarrierAtomic()    -                  R/W
+	 *   memoryBarrierBuffer()    -                          R/W
+	 *   memoryBarrierImage()     -           R/W
+	 *   memoryBarrierShared()    -   R/W
+	 *
+	 * TODO I think for SSBO/image/shared, in cases where we can determine
+	 * which variable is accessed, we don't need to care about accesses to
+	 * different variables (unless declared coherent??)
+	 */
+	enum {
+		IR3_BARRIER_EVERYTHING = 1 << 0,
+		IR3_BARRIER_SHARED_R   = 1 << 1,
+		IR3_BARRIER_SHARED_W   = 1 << 2,
+		IR3_BARRIER_IMAGE_R    = 1 << 3,
+		IR3_BARRIER_IMAGE_W    = 1 << 4,
+		IR3_BARRIER_BUFFER_R   = 1 << 5,
+		IR3_BARRIER_BUFFER_W   = 1 << 6,
+		IR3_BARRIER_ARRAY_R    = 1 << 7,
+		IR3_BARRIER_ARRAY_W    = 1 << 8,
+	} barrier_class, barrier_conflict;
+
 	/* Entry in ir3_block's instruction list: */
 	struct list_head node;
 
@@ -417,16 +451,13 @@ struct ir3_array {
 
 	nir_register *r;
 
-	/* We track the last write and last access (read or write) to
-	 * setup dependencies on instructions that read or write the
-	 * array.  Reads can be re-ordered wrt. other reads, but should
-	 * not be re-ordered wrt. to writes.  Writes cannot be reordered
-	 * wrt. any other access to the array.
-	 *
-	 * So array reads depend on last write, and array writes depend
-	 * on the last access.
+	/* To avoid array write's from getting DCE'd, keep track of the
+	 * most recent write.  Any array access depends on the most
+	 * recent write.  This way, nothing depends on writes after the
+	 * last read.  But all the writes that happen before that have
+	 * something depending on them
 	 */
-	struct ir3_instruction *last_write, *last_access;
+	struct ir3_instruction *last_write;
 
 	/* extra stuff used in RA pass: */
 	unsigned base;      /* base vreg name */
@@ -493,6 +524,7 @@ struct ir3_instruction * ir3_instr_create(struct ir3_block *block, opc_t opc);
 struct ir3_instruction * ir3_instr_create2(struct ir3_block *block, opc_t opc, int nreg);
 struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr);
+void ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep);
 const char *ir3_instr_name(struct ir3_instruction *instr);
 struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
@@ -907,25 +939,36 @@ static inline unsigned ir3_cat3_absneg(opc_t opc)
 static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr)
 {
+	unsigned cnt = instr->regs_count + instr->deps_count;
 	if (instr->address)
-		return instr->regs_count + 1;
-	return instr->regs_count;
+		cnt++;
+	return cnt;
 }
 
 static inline struct ir3_instruction * __ssa_src_n(struct ir3_instruction *instr, unsigned n)
 {
-	if (n == (instr->regs_count + 0))
+	if (n == (instr->regs_count + instr->deps_count))
 		return instr->address;
+	if (n >= instr->regs_count)
+		return instr->deps[n - instr->regs_count];
 	return ssa(instr->regs[n]);
 }
 
+static inline bool __is_false_dep(struct ir3_instruction *instr, unsigned n)
+{
+	if (n == (instr->regs_count + instr->deps_count))
+		return false;
+	if (n >= instr->regs_count)
+		return true;
+	return false;
+}
+
 #define __src_cnt(__instr) ((__instr)->address ? (__instr)->regs_count : (__instr)->regs_count - 1)
 
 /* iterator for an instruction's SSA sources (instr), also returns src #: */
 #define foreach_ssa_src_n(__srcinst, __n, __instr) \
-	if ((__instr)->regs_count) \
-		for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt; __n++) \
-			if ((__srcinst = __ssa_src_n(__instr, __n)))
+	for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt; __n++) \
+		if ((__srcinst = __ssa_src_n(__instr, __n)))
 
 /* iterator for an instruction's SSA sources (instr): */
 #define foreach_ssa_src(__srcinst, __instr) \
@@ -950,6 +993,7 @@ void ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so);
 void ir3_group(struct ir3 *ir);
 
 /* scheduling: */
+void ir3_sched_add_deps(struct ir3 *ir);
 int ir3_sched(struct ir3 *ir);
 
 /* register assignment: */
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index bd3e0d0cd4a..3fd2e50d82f 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -74,20 +74,6 @@ struct ir3_context {
 	/* Compute shader inputs: */
 	struct ir3_instruction *local_invocation_id, *work_group_id;
 
-	/* For SSBO's and atomics, we need to preserve order, such
-	 * that reads don't overtake writes, and the order of writes
-	 * is preserved.  Atomics are considered as a write.
-	 *
-	 * To do this, we track last write and last access, in a
-	 * similar way to ir3_array.  But since we don't know whether
-	 * the same SSBO is bound to multiple slots, so we simply
-	 * track this globally rather than per-SSBO.
-	 *
-	 * TODO should we track this per block instead?  I guess it
-	 * shouldn't matter much?
-	 */
-	struct ir3_instruction *last_write, *last_access;
-
 	/* mapping from nir_register to defining instruction: */
 	struct hash_table *def_ht;
 
@@ -345,6 +331,8 @@ create_array_load(struct ir3_context *ctx, struct ir3_array *arr, int n,
 	mov = ir3_instr_create(block, OPC_MOV);
 	mov->cat1.src_type = TYPE_U32;
 	mov->cat1.dst_type = TYPE_U32;
+	mov->barrier_class = IR3_BARRIER_ARRAY_R;
+	mov->barrier_conflict = IR3_BARRIER_ARRAY_W;
 	ir3_reg_create(mov, 0, 0);
 	src = ir3_reg_create(mov, 0, IR3_REG_ARRAY |
 			COND(address, IR3_REG_RELATIV));
@@ -356,8 +344,6 @@ create_array_load(struct ir3_context *ctx, struct ir3_array *arr, int n,
 	if (address)
 		ir3_instr_set_address(mov, address);
 
-	arr->last_access = mov;
-
 	return mov;
 }
 
@@ -373,9 +359,11 @@ create_array_store(struct ir3_context *ctx, struct ir3_array *arr, int n,
 	mov = ir3_instr_create(block, OPC_MOV);
 	mov->cat1.src_type = TYPE_U32;
 	mov->cat1.dst_type = TYPE_U32;
+	mov->barrier_class = IR3_BARRIER_ARRAY_W;
+	mov->barrier_conflict = IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W;
 	dst = ir3_reg_create(mov, 0, IR3_REG_ARRAY |
 			COND(address, IR3_REG_RELATIV));
-	dst->instr = arr->last_access;
+	dst->instr = arr->last_write;
 	dst->size = arr->length;
 	dst->array.id = arr->id;
 	dst->array.offset = n;
@@ -384,7 +372,7 @@ create_array_store(struct ir3_context *ctx, struct ir3_array *arr, int n,
 	if (address)
 		ir3_instr_set_address(mov, address);
 
-	arr->last_write = arr->last_access = mov;
+	arr->last_write = mov;
 
 	return mov;
 }
@@ -1236,22 +1224,6 @@ emit_intrinsic_load_ubo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
 	}
 }
 
-static void
-mark_read(struct ir3_context *ctx, struct ir3_instruction *instr)
-{
-	instr->regs[0]->instr = ctx->last_write;
-	instr->regs[0]->flags |= IR3_REG_SSA;
-	ctx->last_access = instr;
-}
-
-static void
-mark_write(struct ir3_context *ctx, struct ir3_instruction *instr)
-{
-	instr->regs[0]->instr = ctx->last_access;
-	instr->regs[0]->flags |= IR3_REG_SSA;
-	ctx->last_write = ctx->last_access = instr;
-}
-
 /* src[] = { buffer_index, offset }.  No const_index */
 static void
 emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
@@ -1280,7 +1252,8 @@ emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
 	ldgb->cat6.iim_val = intr->num_components;
 	ldgb->cat6.d = 4;
 	ldgb->cat6.type = TYPE_U32;
-	mark_read(ctx, ldgb);
+	ldgb->barrier_class = IR3_BARRIER_BUFFER_R;
+	ldgb->barrier_conflict = IR3_BARRIER_BUFFER_W;
 
 	split_dest(b, dst, ldgb, 0, intr->num_components);
 }
@@ -1320,7 +1293,8 @@ emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 	stgb->cat6.iim_val = ncomp;
 	stgb->cat6.d = 4;
 	stgb->cat6.type = TYPE_U32;
-	mark_write(ctx, stgb);
+	stgb->barrier_class = IR3_BARRIER_BUFFER_W;
+	stgb->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
 
 	array_insert(b, b->keeps, stgb);
 }
@@ -1430,7 +1404,8 @@ emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 	atomic->cat6.iim_val = 1;
 	atomic->cat6.d = 4;
 	atomic->cat6.type = type;
-	mark_write(ctx, atomic);
+	atomic->barrier_class = IR3_BARRIER_BUFFER_W;
+	atomic->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
 
 	/* even if nothing consume the result, we can't DCE the instruction: */
 	array_insert(b, b->keeps, atomic);
@@ -1455,7 +1430,8 @@ emit_intrinsic_load_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr,
 	ldl->cat6.type = TYPE_U32;
 	ldl->regs[0]->wrmask = MASK(intr->num_components);
-	mark_read(ctx, ldl);
+	ldl->barrier_class = IR3_BARRIER_SHARED_R;
+	ldl->barrier_conflict = IR3_BARRIER_SHARED_W;
 
 	split_dest(b, dst, ldl, 0, intr->num_components);
 }
@@ -1491,8 +1467,9 @@ emit_intrinsic_store_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 			create_immed(b, length), 0);
 	stl->cat6.dst_offset = first_component + base;
 	stl->cat6.type = TYPE_U32;
+	stl->barrier_class = IR3_BARRIER_SHARED_W;
+	stl->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
 
-	mark_write(ctx, stl);
 	array_insert(b, b->keeps, stl);
 
 	/* Clear the bits in the writemask that we just wrote, then try
@@ -1573,7 +1550,8 @@ emit_intrinsic_atomic_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 	atomic->cat6.iim_val = 1;
 	atomic->cat6.d = 1;
 	atomic->cat6.type = type;
-	mark_write(ctx, atomic);
+	atomic->barrier_class = IR3_BARRIER_SHARED_W;
+	atomic->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
 
 	/* even if nothing consume the result, we can't DCE the instruction: */
 	array_insert(b, b->keeps, atomic);
@@ -1702,6 +1680,9 @@ emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr,
 	sam = ir3_SAM(b, OPC_ISAM, type, TGSI_WRITEMASK_XYZW, flags,
 			tex_idx, tex_idx, create_collect(b, coords, ncoords), NULL);
 
+	sam->barrier_class = IR3_BARRIER_IMAGE_R;
+	sam->barrier_conflict = IR3_BARRIER_IMAGE_W;
+
 	split_dest(b, dst, sam, 0, 4);
 }
 
@@ -1737,7 +1718,8 @@ emit_intrinsic_store_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 	stib->cat6.d = ncoords;
 	stib->cat6.type = get_image_type(var);
 	stib->cat6.typed = true;
-	mark_write(ctx, stib);
+	stib->barrier_class = IR3_BARRIER_IMAGE_W;
+	stib->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
 
 	array_insert(b, b->keeps, stib);
 }
@@ -1821,7 +1803,8 @@ emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 	atomic->cat6.d = ncoords;
 	atomic->cat6.type = get_image_type(var);
 	atomic->cat6.typed = true;
-	mark_write(ctx, atomic);
+	atomic->barrier_class = IR3_BARRIER_IMAGE_W;
+	atomic->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
 
 	/* even if nothing consume the result, we can't DCE the instruction: */
 	array_insert(b, b->keeps, atomic);
@@ -1841,23 +1824,62 @@ emit_intrinsic_barrier(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 		barrier->cat7.g = true;
 		barrier->cat7.l = true;
 		barrier->flags = IR3_INSTR_SS | IR3_INSTR_SY;
+		barrier->barrier_class = IR3_BARRIER_EVERYTHING;
 		break;
 	case nir_intrinsic_memory_barrier:
+		barrier = ir3_FENCE(b);
+		barrier->cat7.g = true;
+		barrier->cat7.r = true;
+		barrier->cat7.w = true;
+		barrier->barrier_class = IR3_BARRIER_IMAGE_W |
+				IR3_BARRIER_BUFFER_W;
+		barrier->barrier_conflict =
+				IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W |
+				IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
+		break;
 	case nir_intrinsic_memory_barrier_atomic_counter:
 	case nir_intrinsic_memory_barrier_buffer:
 		barrier = ir3_FENCE(b);
 		barrier->cat7.g = true;
 		barrier->cat7.r = true;
 		barrier->cat7.w = true;
+		barrier->barrier_class = IR3_BARRIER_BUFFER_W;
+		barrier->barrier_conflict = IR3_BARRIER_BUFFER_R |
+				IR3_BARRIER_BUFFER_W;
 		break;
-	case nir_intrinsic_group_memory_barrier:
 	case nir_intrinsic_memory_barrier_image:
+		// TODO double check if this should have .g set
+		barrier = ir3_FENCE(b);
+		barrier->cat7.g = true;
+		barrier->cat7.r = true;
+		barrier->cat7.w = true;
+		barrier->barrier_class = IR3_BARRIER_IMAGE_W;
+		barrier->barrier_conflict = IR3_BARRIER_IMAGE_R |
+				IR3_BARRIER_IMAGE_W;
+		break;
 	case nir_intrinsic_memory_barrier_shared:
 		barrier = ir3_FENCE(b);
 		barrier->cat7.g = true;
 		barrier->cat7.l = true;
 		barrier->cat7.r = true;
 		barrier->cat7.w = true;
+		barrier->barrier_class = IR3_BARRIER_SHARED_W;
+		barrier->barrier_conflict = IR3_BARRIER_SHARED_R |
+				IR3_BARRIER_SHARED_W;
+		break;
+	case nir_intrinsic_group_memory_barrier:
+		barrier = ir3_FENCE(b);
+		barrier->cat7.g = true;
+		barrier->cat7.l = true;
+		barrier->cat7.r = true;
+		barrier->cat7.w = true;
+		barrier->barrier_class = IR3_BARRIER_SHARED_W |
+				IR3_BARRIER_IMAGE_W |
+				IR3_BARRIER_BUFFER_W;
+		barrier->barrier_conflict =
+				IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W |
+				IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W |
+				IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
 		break;
 	default:
 		unreachable("boo");
@@ -3301,6 +3323,8 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 		ir3_print(ir);
 	}
 
+	ir3_sched_add_deps(ir);
+
 	/* Group left/right neighbors, inserting mov's where needed to
 	 * solve conflicts:
 	 */
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_depth.c b/src/gallium/drivers/freedreno/ir3/ir3_depth.c
index be39027b6a0..55ca5333b47 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_depth.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_depth.c
@@ -55,6 +55,10 @@ int ir3_delayslots(struct ir3_instruction *assigner,
 		struct ir3_instruction *consumer, unsigned n)
 {
+	/* don't count false-dependencies: */
+	if (__is_false_dep(consumer, n))
+		return 0;
+
 	/* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
 	 * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
 	 * handled with sync bits
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
index b56da304f92..9492e9ba650 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_sched.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
@@ -669,3 +669,94 @@ int ir3_sched(struct ir3 *ir)
 		return -1;
 	return 0;
 }
+
+/* does instruction 'prior' need to be scheduled before 'instr'? */
+static bool
+depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
+{
+	/* TODO for dependencies that are related to a specific object, ie
+	 * a specific SSBO/image/array, we could relax this constraint to
+	 * make accesses to unrelated objects not depend on each other (at
+	 * least as long as not declared coherent)
+	 */
+	if ((instr->barrier_class & IR3_BARRIER_EVERYTHING) ||
+			(prior->barrier_class & IR3_BARRIER_EVERYTHING))
+		return true;
+	return !!(instr->barrier_class & prior->barrier_conflict);
+}
+
+static void
+add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
+{
+	struct list_head *prev = instr->node.prev;
+	struct list_head *next = instr->node.next;
+
+	/* add dependencies on previous instructions that must be scheduled
+	 * prior to the current instruction
+	 */
+	while (prev != &block->instr_list) {
+		struct ir3_instruction *pi =
+			LIST_ENTRY(struct ir3_instruction, prev, node);
+
+		prev = prev->prev;
+
+		if (is_meta(pi))
+			continue;
+
+		if (instr->barrier_class == pi->barrier_class) {
+			ir3_instr_add_dep(instr, pi);
+			break;
+		}
+
+		if (depends_on(instr, pi))
+			ir3_instr_add_dep(instr, pi);
+	}
+
+	/* add dependencies on this instruction to following instructions
+	 * that must be scheduled after the current instruction:
+	 */
+	while (next != &block->instr_list) {
+		struct ir3_instruction *ni =
+			LIST_ENTRY(struct ir3_instruction, next, node);
+
+		next = next->next;
+
+		if (is_meta(ni))
+			continue;
+
+		if (instr->barrier_class == ni->barrier_class) {
+			ir3_instr_add_dep(ni, instr);
+			break;
+		}
+
+		if (depends_on(ni, instr))
+			ir3_instr_add_dep(ni, instr);
+	}
+}
+
+/* before scheduling a block, we need to add any necessary false-dependencies
+ * to ensure that:
+ *
+ *  (1) barriers are scheduled in the right order wrt instructions related
+ *      to the barrier
+ *
+ *  (2) reads that come before a write actually get scheduled before the
+ *      write
+ */
+static void
+calculate_deps(struct ir3_block *block)
+{
+	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+		if (instr->barrier_class) {
+			add_barrier_deps(block, instr);
+		}
+	}
+}
+
+void
+ir3_sched_add_deps(struct ir3 *ir)
+{
+	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+		calculate_deps(block);
+	}
+}
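
The core of the new dependency model is the pair of bitmasks added to ir3_instruction: barrier_class describes what kind of access an instruction performs, and barrier_conflict describes which kinds of earlier accesses it must not be reordered across. depends_on() in ir3_sched.c simply tests one against the other. Below is a simplified, self-contained sketch of that conflict test; the struct and helper are illustrative stand-ins, not the real ir3 types, though the bit values and the test itself mirror the patch:

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for the barrier bits added to ir3_instruction in this patch: */
enum {
	IR3_BARRIER_EVERYTHING = 1 << 0,
	IR3_BARRIER_BUFFER_R   = 1 << 5,
	IR3_BARRIER_BUFFER_W   = 1 << 6,
};

struct fake_instr {
	unsigned barrier_class;     /* what this instruction does */
	unsigned barrier_conflict;  /* what must not be reordered across it */
};

/* Same test as depends_on() in ir3_sched.c: 'instr' must stay after 'prior'
 * if either is a full barrier, or if what 'instr' does is listed in what
 * 'prior' forbids being reordered across it.
 */
static bool depends_on(const struct fake_instr *instr, const struct fake_instr *prior)
{
	if ((instr->barrier_class & IR3_BARRIER_EVERYTHING) ||
	    (prior->barrier_class & IR3_BARRIER_EVERYTHING))
		return true;
	return !!(instr->barrier_class & prior->barrier_conflict);
}

int main(void)
{
	/* roughly what emit_intrinsic_store_ssbo()/load_ssbo() set up: */
	struct fake_instr stgb = {
		IR3_BARRIER_BUFFER_W,
		IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W,
	};
	struct fake_instr ldgb = {
		IR3_BARRIER_BUFFER_R,
		IR3_BARRIER_BUFFER_W,
	};

	/* an SSBO load must not be hoisted above an earlier SSBO store: */
	printf("ldgb after stgb: %d\n", depends_on(&ldgb, &stgb)); /* 1 */
	/* two SSBO loads never conflict, so they can be reordered freely: */
	printf("ldgb after ldgb: %d\n", depends_on(&ldgb, &ldgb)); /* 0 */
	return 0;
}

With these masks, add_barrier_deps() keeps a load after any earlier store or atomic that lists BUFFER_R in its conflict mask, while independent reads stay free to reorder.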
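The false dependencies recorded in deps[] ride along with the normal SSA sources: after this patch, __ssa_src_cnt()/__ssa_src_n() return the real register sources first, then the deps[] entries, then the address register, and __is_false_dep() is what lets ir3_delayslots() report zero delay slots for the extra entries. A rough model of that indexing, with invented names and fixed-size arrays purely for illustration:

#include <stdio.h>

/* Simplified model of the source ordering used by __ssa_src_n():
 * register sources, then false deps added by ir3_instr_add_dep(),
 * then the address register (if any).
 */
struct toy_instr {
	int regs[4];  unsigned regs_count;
	int deps[4];  unsigned deps_count;
	int address;  int has_address;
};

static unsigned src_cnt(const struct toy_instr *i)
{
	return i->regs_count + i->deps_count + (i->has_address ? 1 : 0);
}

static int src_n(const struct toy_instr *i, unsigned n, int *is_false_dep)
{
	*is_false_dep = 0;
	if (i->has_address && (n == i->regs_count + i->deps_count))
		return i->address;                 /* address comes last */
	if (n >= i->regs_count) {
		*is_false_dep = 1;                 /* same test as __is_false_dep() */
		return i->deps[n - i->regs_count];
	}
	return i->regs[n];                         /* ordinary register source */
}

int main(void)
{
	struct toy_instr instr = {
		.regs = { 10, 11 }, .regs_count = 2,
		.deps = { 20 },     .deps_count = 1,
		.address = 30,      .has_address = 1,
	};
	for (unsigned n = 0; n < src_cnt(&instr); n++) {
		int fd, s = src_n(&instr, n, &fd);
		printf("src %u = %d%s\n", n, s, fd ? " (false dep)" : "");
	}
	return 0;
}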