Diffstat (limited to 'src/gallium/drivers/freedreno')
-rw-r--r--  src/gallium/drivers/freedreno/ir3/ir3.c              |   6
-rw-r--r--  src/gallium/drivers/freedreno/ir3/ir3.h              |  74
-rw-r--r--  src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c | 110
-rw-r--r--  src/gallium/drivers/freedreno/ir3/ir3_depth.c        |   4
-rw-r--r--  src/gallium/drivers/freedreno/ir3/ir3_sched.c        |  91
5 files changed, 227 insertions, 58 deletions
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c
index 6db0a2a20cd..01a7bbc7dc6 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3.c
@@ -817,6 +817,12 @@ struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr)
return new_instr;
}
+/* Add a false dependency to an instruction, to ensure that 'dep' is scheduled before 'instr': */
+void ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep)
+{
+ array_insert(instr, instr->deps, dep);
+}
+
struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
int num, int flags)
{
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
index 90f8e3c44d3..6ef0683ab00 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -326,6 +326,40 @@ struct ir3_instruction {
*/
struct ir3_instruction *address;
+ /* Tracking for additional dependent instructions. Used to handle
+ * barriers, WAR hazards for arrays/SSBOs/etc.
+ */
+ DECLARE_ARRAY(struct ir3_instruction *, deps);
+
+ /*
+  * From PoV of instruction scheduling, not execution (ie. ignores
+  * global/local distinction):
+  *
+  *                             shared   image   atomic  SSBO   everything
+  *   barrier()/            -    R/W      R/W     R/W     R/W       X
+  *     groupMemoryBarrier()
+  *   memoryBarrier()       -             R/W             R/W
+  *       (but only images declared coherent?)
+  *   memoryBarrierAtomic() -                     R/W
+  *   memoryBarrierBuffer() -                             R/W
+  *   memoryBarrierImage()  -             R/W
+  *   memoryBarrierShared() -    R/W
+  *
+  * TODO I think for SSBO/image/shared, in cases where we can determine
+  * which variable is accessed, we don't need to care about accesses to
+  * different variables (unless declared coherent??)
+  */
+ enum {
+ IR3_BARRIER_EVERYTHING = 1 << 0,
+ IR3_BARRIER_SHARED_R = 1 << 1,
+ IR3_BARRIER_SHARED_W = 1 << 2,
+ IR3_BARRIER_IMAGE_R = 1 << 3,
+ IR3_BARRIER_IMAGE_W = 1 << 4,
+ IR3_BARRIER_BUFFER_R = 1 << 5,
+ IR3_BARRIER_BUFFER_W = 1 << 6,
+ IR3_BARRIER_ARRAY_R = 1 << 7,
+ IR3_BARRIER_ARRAY_W = 1 << 8,
+ } barrier_class, barrier_conflict;
+
/* Entry in ir3_block's instruction list: */
struct list_head node;
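
A worked example of how these two masks are intended to combine, assuming the depends_on() test added in ir3_sched.c below, i.e. instr->barrier_class & prior->barrier_conflict (IR3_BARRIER_ prefixes dropped for brevity):

    /* SSBO load:  barrier_class = BUFFER_R, barrier_conflict = BUFFER_W
     * SSBO store: barrier_class = BUFFER_W, barrier_conflict = BUFFER_R | BUFFER_W
     *
     *   load  after store: BUFFER_R & (BUFFER_R | BUFFER_W) != 0  -> ordered
     *   store after load:  BUFFER_W &  BUFFER_W             != 0  -> ordered
     *   load  after load:  BUFFER_R &  BUFFER_W             == 0  -> reorderable
     */
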
@@ -417,16 +451,13 @@ struct ir3_array {
nir_register *r;
- /* We track the last write and last access (read or write) to
- * setup dependencies on instructions that read or write the
- * array. Reads can be re-ordered wrt. other reads, but should
- * not be re-ordered wrt. to writes. Writes cannot be reordered
- * wrt. any other access to the array.
- *
- * So array reads depend on last write, and array writes depend
- * on the last access.
+ /* To avoid array writes from getting DCE'd, keep track of the
+ * most recent write. Any array access depends on the most
+ * recent write. This way, nothing depends on writes after the
+ * last read, but all the writes that happen before it have
+ * something depending on them.
+ */
- struct ir3_instruction *last_write, *last_access;
+ struct ir3_instruction *last_write;
/* extra stuff used in RA pass: */
unsigned base; /* base vreg name */
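
To make the new scheme concrete, a hypothetical access sequence and the links it produces: read-after-write ordering stays in the SSA graph via last_write, while the write-after-read ordering that last_access used to provide now comes from false dependencies added before scheduling:

    a[0] = x;    /* st1: last_write = st1                  */
    ... = a[1];  /* ld1: src link -> st1                   */
    a[1] = y;    /* st2: dst link -> st1; last_write = st2 */
    /* the ld1 -> st2 (WAR) ordering is added later by
     * ir3_sched_add_deps() via IR3_BARRIER_ARRAY_R/W      */
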
@@ -493,6 +524,7 @@ struct ir3_instruction * ir3_instr_create(struct ir3_block *block, opc_t opc);
struct ir3_instruction * ir3_instr_create2(struct ir3_block *block,
opc_t opc, int nreg);
struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr);
+void ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep);
const char *ir3_instr_name(struct ir3_instruction *instr);
struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
@@ -907,25 +939,36 @@ static inline unsigned ir3_cat3_absneg(opc_t opc)
static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr)
{
+ unsigned cnt = instr->regs_count + instr->deps_count;
if (instr->address)
- return instr->regs_count + 1;
- return instr->regs_count;
+ cnt++;
+ return cnt;
}
static inline struct ir3_instruction * __ssa_src_n(struct ir3_instruction *instr, unsigned n)
{
- if (n == (instr->regs_count + 0))
+ if (n == (instr->regs_count + instr->deps_count))
return instr->address;
+ if (n >= instr->regs_count)
+ return instr->deps[n - instr->regs_count];
return ssa(instr->regs[n]);
}
+static inline bool __is_false_dep(struct ir3_instruction *instr, unsigned n)
+{
+ if (n == (instr->regs_count + instr->deps_count))
+ return false;
+ if (n >= instr->regs_count)
+ return true;
+ return false;
+}
+
#define __src_cnt(__instr) ((__instr)->address ? (__instr)->regs_count : (__instr)->regs_count - 1)
/* iterator for an instruction's SSA sources (instr), also returns src #: */
#define foreach_ssa_src_n(__srcinst, __n, __instr) \
- if ((__instr)->regs_count) \
- for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt; __n++) \
- if ((__srcinst = __ssa_src_n(__instr, __n)))
+ for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt; __n++) \
+ if ((__srcinst = __ssa_src_n(__instr, __n)))
/* iterator for an instruction's SSA sources (instr): */
#define foreach_ssa_src(__srcinst, __instr) \
@@ -950,6 +993,7 @@ void ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so);
void ir3_group(struct ir3 *ir);
/* scheduling: */
+void ir3_sched_add_deps(struct ir3 *ir);
int ir3_sched(struct ir3 *ir);
/* register assignment: */
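
Since false deps are folded into the SSA-source iteration, passes that only care about real dataflow can filter them out; a sketch:

    struct ir3_instruction *src;
    foreach_ssa_src_n(src, n, instr) {
        if (__is_false_dep(instr, n))
            continue;   /* scheduling-only edge, no register dataflow */
        /* ... process the real SSA source ... */
    }

The same predicate is what lets ir3_delayslots() in ir3_depth.c below assign zero delay to these edges.
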
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index bd3e0d0cd4a..3fd2e50d82f 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -74,20 +74,6 @@ struct ir3_context {
/* Compute shader inputs: */
struct ir3_instruction *local_invocation_id, *work_group_id;
- /* For SSBO's and atomics, we need to preserve order, such
- * that reads don't overtake writes, and the order of writes
- * is preserved. Atomics are considered as a write.
- *
- * To do this, we track last write and last access, in a
- * similar way to ir3_array. But since we don't know whether
- * the same SSBO is bound to multiple slots, so we simply
- * track this globally rather than per-SSBO.
- *
- * TODO should we track this per block instead? I guess it
- * shouldn't matter much?
- */
- struct ir3_instruction *last_write, *last_access;
-
/* mapping from nir_register to defining instruction: */
struct hash_table *def_ht;
@@ -345,6 +331,8 @@ create_array_load(struct ir3_context *ctx, struct ir3_array *arr, int n,
mov = ir3_instr_create(block, OPC_MOV);
mov->cat1.src_type = TYPE_U32;
mov->cat1.dst_type = TYPE_U32;
+ mov->barrier_class = IR3_BARRIER_ARRAY_R;
+ mov->barrier_conflict = IR3_BARRIER_ARRAY_W;
ir3_reg_create(mov, 0, 0);
src = ir3_reg_create(mov, 0, IR3_REG_ARRAY |
COND(address, IR3_REG_RELATIV));
@@ -356,8 +344,6 @@ create_array_load(struct ir3_context *ctx, struct ir3_array *arr, int n,
if (address)
ir3_instr_set_address(mov, address);
- arr->last_access = mov;
-
return mov;
}
@@ -373,9 +359,11 @@ create_array_store(struct ir3_context *ctx, struct ir3_array *arr, int n,
mov = ir3_instr_create(block, OPC_MOV);
mov->cat1.src_type = TYPE_U32;
mov->cat1.dst_type = TYPE_U32;
+ mov->barrier_class = IR3_BARRIER_ARRAY_W;
+ mov->barrier_conflict = IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W;
dst = ir3_reg_create(mov, 0, IR3_REG_ARRAY |
COND(address, IR3_REG_RELATIV));
- dst->instr = arr->last_access;
+ dst->instr = arr->last_write;
dst->size = arr->length;
dst->array.id = arr->id;
dst->array.offset = n;
@@ -384,7 +372,7 @@ create_array_store(struct ir3_context *ctx, struct ir3_array *arr, int n,
if (address)
ir3_instr_set_address(mov, address);
- arr->last_write = arr->last_access = mov;
+ arr->last_write = mov;
return mov;
}
@@ -1236,22 +1224,6 @@ emit_intrinsic_load_ubo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
}
}
-static void
-mark_read(struct ir3_context *ctx, struct ir3_instruction *instr)
-{
- instr->regs[0]->instr = ctx->last_write;
- instr->regs[0]->flags |= IR3_REG_SSA;
- ctx->last_access = instr;
-}
-
-static void
-mark_write(struct ir3_context *ctx, struct ir3_instruction *instr)
-{
- instr->regs[0]->instr = ctx->last_access;
- instr->regs[0]->flags |= IR3_REG_SSA;
- ctx->last_write = ctx->last_access = instr;
-}
-
/* src[] = { buffer_index, offset }. No const_index */
static void
emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
@@ -1280,7 +1252,8 @@ emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
ldgb->cat6.iim_val = intr->num_components;
ldgb->cat6.d = 4;
ldgb->cat6.type = TYPE_U32;
- mark_read(ctx, ldgb);
+ ldgb->barrier_class = IR3_BARRIER_BUFFER_R;
+ ldgb->barrier_conflict = IR3_BARRIER_BUFFER_W;
split_dest(b, dst, ldgb, 0, intr->num_components);
}
@@ -1320,7 +1293,8 @@ emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
stgb->cat6.iim_val = ncomp;
stgb->cat6.d = 4;
stgb->cat6.type = TYPE_U32;
- mark_write(ctx, stgb);
+ stgb->barrier_class = IR3_BARRIER_BUFFER_W;
+ stgb->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
array_insert(b, b->keeps, stgb);
}
@@ -1430,7 +1404,8 @@ emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
atomic->cat6.iim_val = 1;
atomic->cat6.d = 4;
atomic->cat6.type = type;
- mark_write(ctx, atomic);
+ atomic->barrier_class = IR3_BARRIER_BUFFER_W;
+ atomic->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
/* even if nothing consumes the result, we can't DCE the instruction: */
array_insert(b, b->keeps, atomic);
@@ -1455,7 +1430,8 @@ emit_intrinsic_load_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr,
ldl->cat6.type = TYPE_U32;
ldl->regs[0]->wrmask = MASK(intr->num_components);
- mark_read(ctx, ldl);
+ ldl->barrier_class = IR3_BARRIER_SHARED_R;
+ ldl->barrier_conflict = IR3_BARRIER_SHARED_W;
split_dest(b, dst, ldl, 0, intr->num_components);
}
@@ -1491,8 +1467,9 @@ emit_intrinsic_store_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
create_immed(b, length), 0);
stl->cat6.dst_offset = first_component + base;
stl->cat6.type = TYPE_U32;
+ stl->barrier_class = IR3_BARRIER_SHARED_W;
+ stl->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
- mark_write(ctx, stl);
array_insert(b, b->keeps, stl);
/* Clear the bits in the writemask that we just wrote, then try
@@ -1573,7 +1550,8 @@ emit_intrinsic_atomic_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
atomic->cat6.iim_val = 1;
atomic->cat6.d = 1;
atomic->cat6.type = type;
- mark_write(ctx, atomic);
+ atomic->barrier_class = IR3_BARRIER_SHARED_W;
+ atomic->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
/* even if nothing consumes the result, we can't DCE the instruction: */
array_insert(b, b->keeps, atomic);
@@ -1702,6 +1680,9 @@ emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr,
sam = ir3_SAM(b, OPC_ISAM, type, TGSI_WRITEMASK_XYZW, flags,
tex_idx, tex_idx, create_collect(b, coords, ncoords), NULL);
+ sam->barrier_class = IR3_BARRIER_IMAGE_R;
+ sam->barrier_conflict = IR3_BARRIER_IMAGE_W;
+
split_dest(b, dst, sam, 0, 4);
}
@@ -1737,7 +1718,8 @@ emit_intrinsic_store_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
stib->cat6.d = ncoords;
stib->cat6.type = get_image_type(var);
stib->cat6.typed = true;
- mark_write(ctx, stib);
+ stib->barrier_class = IR3_BARRIER_IMAGE_W;
+ stib->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
array_insert(b, b->keeps, stib);
}
@@ -1821,7 +1803,8 @@ emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
atomic->cat6.d = ncoords;
atomic->cat6.type = get_image_type(var);
atomic->cat6.typed = true;
- mark_write(ctx, atomic);
+ atomic->barrier_class = IR3_BARRIER_IMAGE_W;
+ atomic->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
/* even if nothing consumes the result, we can't DCE the instruction: */
array_insert(b, b->keeps, atomic);
@@ -1841,23 +1824,62 @@ emit_intrinsic_barrier(struct ir3_context *ctx, nir_intrinsic_instr *intr)
barrier->cat7.g = true;
barrier->cat7.l = true;
barrier->flags = IR3_INSTR_SS | IR3_INSTR_SY;
+ barrier->barrier_class = IR3_BARRIER_EVERYTHING;
break;
case nir_intrinsic_memory_barrier:
+ barrier = ir3_FENCE(b);
+ barrier->cat7.g = true;
+ barrier->cat7.r = true;
+ barrier->cat7.w = true;
+ barrier->barrier_class = IR3_BARRIER_IMAGE_W |
+ IR3_BARRIER_BUFFER_W;
+ barrier->barrier_conflict =
+ IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W |
+ IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
+ break;
case nir_intrinsic_memory_barrier_atomic_counter:
case nir_intrinsic_memory_barrier_buffer:
barrier = ir3_FENCE(b);
barrier->cat7.g = true;
barrier->cat7.r = true;
barrier->cat7.w = true;
+ barrier->barrier_class = IR3_BARRIER_BUFFER_W;
+ barrier->barrier_conflict = IR3_BARRIER_BUFFER_R |
+ IR3_BARRIER_BUFFER_W;
break;
- case nir_intrinsic_group_memory_barrier:
case nir_intrinsic_memory_barrier_image:
+ // TODO double check if this should have .g set
+ barrier = ir3_FENCE(b);
+ barrier->cat7.g = true;
+ barrier->cat7.r = true;
+ barrier->cat7.w = true;
+ barrier->barrier_class = IR3_BARRIER_IMAGE_W;
+ barrier->barrier_conflict = IR3_BARRIER_IMAGE_R |
+ IR3_BARRIER_IMAGE_W;
+ break;
case nir_intrinsic_memory_barrier_shared:
barrier = ir3_FENCE(b);
barrier->cat7.g = true;
barrier->cat7.l = true;
barrier->cat7.r = true;
barrier->cat7.w = true;
+ barrier->barrier_class = IR3_BARRIER_SHARED_W;
+ barrier->barrier_conflict = IR3_BARRIER_SHARED_R |
+ IR3_BARRIER_SHARED_W;
+ break;
+ case nir_intrinsic_group_memory_barrier:
+ barrier = ir3_FENCE(b);
+ barrier->cat7.g = true;
+ barrier->cat7.l = true;
+ barrier->cat7.r = true;
+ barrier->cat7.w = true;
+ barrier->barrier_class = IR3_BARRIER_SHARED_W |
+ IR3_BARRIER_IMAGE_W |
+ IR3_BARRIER_BUFFER_W;
+ barrier->barrier_conflict =
+ IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W |
+ IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W |
+ IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
break;
default:
unreachable("boo");
@@ -3301,6 +3323,8 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
ir3_print(ir);
}
+ ir3_sched_add_deps(ir);
+
/* Group left/right neighbors, inserting mov's where needed to
* solve conflicts:
*/
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_depth.c b/src/gallium/drivers/freedreno/ir3/ir3_depth.c
index be39027b6a0..55ca5333b47 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_depth.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_depth.c
@@ -55,6 +55,10 @@
int ir3_delayslots(struct ir3_instruction *assigner,
struct ir3_instruction *consumer, unsigned n)
{
+ /* don't count false-dependencies: */
+ if (__is_false_dep(consumer, n))
+ return 0;
+
/* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
* alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
* handled with sync bits
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
index b56da304f92..9492e9ba650 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_sched.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
@@ -669,3 +669,94 @@ int ir3_sched(struct ir3 *ir)
return -1;
return 0;
}
+
+/* does instruction 'prior' need to be scheduled before 'instr'? */
+static bool
+depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
+{
+ /* TODO for dependencies that are related to a specific object, ie
+ * a specific SSBO/image/array, we could relax this constraint to
+ * make accesses to unrelated objects not depend on each other (at
+ * least as long as not declared coherent)
+ */
+ if ((instr->barrier_class & IR3_BARRIER_EVERYTHING) ||
+ (prior->barrier_class & IR3_BARRIER_EVERYTHING))
+ return true;
+ return !!(instr->barrier_class & prior->barrier_conflict);
+}
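
IR3_BARRIER_EVERYTHING short-circuits the mask test in either direction; illustratively, using the class assignments from ir3_compiler_nir.c above (instruction names hypothetical):

    depends_on(ldgb, barrier);  /* true: barrier() carries EVERYTHING   */
    depends_on(barrier, stgb);  /* true: EVERYTHING on instr also wins  */
    depends_on(ldgb2, ldgb);    /* false: BUFFER_R & BUFFER_W == 0      */
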
+
+static void
+add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
+{
+ struct list_head *prev = instr->node.prev;
+ struct list_head *next = instr->node.next;
+
+ /* add dependencies on previous instructions that must be scheduled
+ * prior to the current instruction
+ */
+ while (prev != &block->instr_list) {
+ struct ir3_instruction *pi =
+ LIST_ENTRY(struct ir3_instruction, prev, node);
+
+ prev = prev->prev;
+
+ if (is_meta(pi))
+ continue;
+
+ /* Once we hit a prior instruction with the exact same barrier_class,
+  * we can stop scanning: equal classes give identical conflict tests,
+  * so that instruction already depends on anything earlier we would
+  * conflict with, and a single dep on it orders us transitively:
+  */
+ if (instr->barrier_class == pi->barrier_class) {
+ ir3_instr_add_dep(instr, pi);
+ break;
+ }
+
+ if (depends_on(instr, pi))
+ ir3_instr_add_dep(instr, pi);
+ }
+
+ /* add dependencies on this instruction to following instructions
+ * that must be scheduled after the current instruction:
+ */
+ while (next != &block->instr_list) {
+ struct ir3_instruction *ni =
+ LIST_ENTRY(struct ir3_instruction, next, node);
+
+ next = next->next;
+
+ if (is_meta(ni))
+ continue;
+
+ /* mirror of the same-class early exit above, scanning forward: */
+ if (instr->barrier_class == ni->barrier_class) {
+ ir3_instr_add_dep(ni, instr);
+ break;
+ }
+
+ if (depends_on(ni, instr))
+ ir3_instr_add_dep(ni, instr);
+ }
+}
+
+/* before scheduling a block, we need to add any necessary false-dependencies
+ * to ensure that:
+ *
+ * (1) barriers are scheduled in the right order wrt instructions related
+ * to the barrier
+ *
+ * (2) reads that come before a write actually get scheduled before the
+ * write
+ */
+static void
+calculate_deps(struct ir3_block *block)
+{
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+ if (instr->barrier_class) {
+ add_barrier_deps(block, instr);
+ }
+ }
+}
+
+void
+ir3_sched_add_deps(struct ir3 *ir)
+{
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ calculate_deps(block);
+ }
+}
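
Usage matches the call added in ir3_compiler_nir.c above: the pass runs once over the whole shader after NIR translation, ahead of neighbor grouping and scheduling (surrounding passes elided in this sketch):

    ir3_sched_add_deps(ir);
    ir3_group(ir);          /* group left/right neighbors */
    /* ... */
    ir3_sched(ir);          /* scheduler now honors the false deps */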