-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3.c              |   6
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3.h              |  74
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c | 110
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3_depth.c        |   4
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3_sched.c        |  91
5 files changed, 227 insertions, 58 deletions
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c index 6db0a2a20cd..01a7bbc7dc6 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3.c +++ b/src/gallium/drivers/freedreno/ir3/ir3.c @@ -817,6 +817,12 @@ struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr) return new_instr; } +/* Add a false dependency to instruction, to ensure it is scheduled first: */ +void ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep) +{ + array_insert(instr, instr->deps, dep); +} + struct ir3_register * ir3_reg_create(struct ir3_instruction *instr, int num, int flags) { diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h index 90f8e3c44d3..6ef0683ab00 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3.h +++ b/src/gallium/drivers/freedreno/ir3/ir3.h @@ -326,6 +326,40 @@ struct ir3_instruction { */ struct ir3_instruction *address; + /* Tracking for additional dependent instructions. Used to handle + * barriers, WAR hazards for arrays/SSBOs/etc. + */ + DECLARE_ARRAY(struct ir3_instruction *, deps); + + /* + * From PoV of instruction scheduling, not execution (ie. ignores global/ + * local distinction): + * shared image atomic SSBO everything + * barrier()/ - R/W R/W R/W R/W X + * groupMemoryBarrier() + * memoryBarrier() - R/W R/W + * (but only images declared coherent?) + * memoryBarrierAtomic() - R/W + * memoryBarrierBuffer() - R/W + * memoryBarrierImage() - R/W + * memoryBarrierShared() - R/W + * + * TODO I think for SSBO/image/shared, in cases where we can determine + * which variable is accessed, we don't need to care about accesses to + * different variables (unless declared coherent??) + */ + enum { + IR3_BARRIER_EVERYTHING = 1 << 0, + IR3_BARRIER_SHARED_R = 1 << 1, + IR3_BARRIER_SHARED_W = 1 << 2, + IR3_BARRIER_IMAGE_R = 1 << 3, + IR3_BARRIER_IMAGE_W = 1 << 4, + IR3_BARRIER_BUFFER_R = 1 << 5, + IR3_BARRIER_BUFFER_W = 1 << 6, + IR3_BARRIER_ARRAY_R = 1 << 7, + IR3_BARRIER_ARRAY_W = 1 << 8, + } barrier_class, barrier_conflict; + /* Entry in ir3_block's instruction list: */ struct list_head node; @@ -417,16 +451,13 @@ struct ir3_array { nir_register *r; - /* We track the last write and last access (read or write) to - * setup dependencies on instructions that read or write the - * array. Reads can be re-ordered wrt. other reads, but should - * not be re-ordered wrt. to writes. Writes cannot be reordered - * wrt. any other access to the array. - * - * So array reads depend on last write, and array writes depend - * on the last access. + /* To avoid array write's from getting DCE'd, keep track of the + * most recent write. Any array access depends on the most + * recent write. This way, nothing depends on writes after the + * last read. 
But all the writes that happen before that have + * something depending on them */ - struct ir3_instruction *last_write, *last_access; + struct ir3_instruction *last_write; /* extra stuff used in RA pass: */ unsigned base; /* base vreg name */ @@ -493,6 +524,7 @@ struct ir3_instruction * ir3_instr_create(struct ir3_block *block, opc_t opc); struct ir3_instruction * ir3_instr_create2(struct ir3_block *block, opc_t opc, int nreg); struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr); +void ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep); const char *ir3_instr_name(struct ir3_instruction *instr); struct ir3_register * ir3_reg_create(struct ir3_instruction *instr, @@ -907,25 +939,36 @@ static inline unsigned ir3_cat3_absneg(opc_t opc) static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr) { + unsigned cnt = instr->regs_count + instr->deps_count; if (instr->address) - return instr->regs_count + 1; - return instr->regs_count; + cnt++; + return cnt; } static inline struct ir3_instruction * __ssa_src_n(struct ir3_instruction *instr, unsigned n) { - if (n == (instr->regs_count + 0)) + if (n == (instr->regs_count + instr->deps_count)) return instr->address; + if (n >= instr->regs_count) + return instr->deps[n - instr->regs_count]; return ssa(instr->regs[n]); } +static inline bool __is_false_dep(struct ir3_instruction *instr, unsigned n) +{ + if (n == (instr->regs_count + instr->deps_count)) + return false; + if (n >= instr->regs_count) + return true; + return false; +} + #define __src_cnt(__instr) ((__instr)->address ? (__instr)->regs_count : (__instr)->regs_count - 1) /* iterator for an instruction's SSA sources (instr), also returns src #: */ #define foreach_ssa_src_n(__srcinst, __n, __instr) \ - if ((__instr)->regs_count) \ - for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt; __n++) \ - if ((__srcinst = __ssa_src_n(__instr, __n))) + for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt; __n++) \ + if ((__srcinst = __ssa_src_n(__instr, __n))) /* iterator for an instruction's SSA sources (instr): */ #define foreach_ssa_src(__srcinst, __instr) \ @@ -950,6 +993,7 @@ void ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so); void ir3_group(struct ir3 *ir); /* scheduling: */ +void ir3_sched_add_deps(struct ir3 *ir); int ir3_sched(struct ir3 *ir); /* register assignment: */ diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c index bd3e0d0cd4a..3fd2e50d82f 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c @@ -74,20 +74,6 @@ struct ir3_context { /* Compute shader inputs: */ struct ir3_instruction *local_invocation_id, *work_group_id; - /* For SSBO's and atomics, we need to preserve order, such - * that reads don't overtake writes, and the order of writes - * is preserved. Atomics are considered as a write. - * - * To do this, we track last write and last access, in a - * similar way to ir3_array. But since we don't know whether - * the same SSBO is bound to multiple slots, so we simply - * track this globally rather than per-SSBO. - * - * TODO should we track this per block instead? I guess it - * shouldn't matter much? 
- */ - struct ir3_instruction *last_write, *last_access; - /* mapping from nir_register to defining instruction: */ struct hash_table *def_ht; @@ -345,6 +331,8 @@ create_array_load(struct ir3_context *ctx, struct ir3_array *arr, int n, mov = ir3_instr_create(block, OPC_MOV); mov->cat1.src_type = TYPE_U32; mov->cat1.dst_type = TYPE_U32; + mov->barrier_class = IR3_BARRIER_ARRAY_R; + mov->barrier_conflict = IR3_BARRIER_ARRAY_W; ir3_reg_create(mov, 0, 0); src = ir3_reg_create(mov, 0, IR3_REG_ARRAY | COND(address, IR3_REG_RELATIV)); @@ -356,8 +344,6 @@ create_array_load(struct ir3_context *ctx, struct ir3_array *arr, int n, if (address) ir3_instr_set_address(mov, address); - arr->last_access = mov; - return mov; } @@ -373,9 +359,11 @@ create_array_store(struct ir3_context *ctx, struct ir3_array *arr, int n, mov = ir3_instr_create(block, OPC_MOV); mov->cat1.src_type = TYPE_U32; mov->cat1.dst_type = TYPE_U32; + mov->barrier_class = IR3_BARRIER_ARRAY_W; + mov->barrier_conflict = IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W; dst = ir3_reg_create(mov, 0, IR3_REG_ARRAY | COND(address, IR3_REG_RELATIV)); - dst->instr = arr->last_access; + dst->instr = arr->last_write; dst->size = arr->length; dst->array.id = arr->id; dst->array.offset = n; @@ -384,7 +372,7 @@ create_array_store(struct ir3_context *ctx, struct ir3_array *arr, int n, if (address) ir3_instr_set_address(mov, address); - arr->last_write = arr->last_access = mov; + arr->last_write = mov; return mov; } @@ -1236,22 +1224,6 @@ emit_intrinsic_load_ubo(struct ir3_context *ctx, nir_intrinsic_instr *intr, } } -static void -mark_read(struct ir3_context *ctx, struct ir3_instruction *instr) -{ - instr->regs[0]->instr = ctx->last_write; - instr->regs[0]->flags |= IR3_REG_SSA; - ctx->last_access = instr; -} - -static void -mark_write(struct ir3_context *ctx, struct ir3_instruction *instr) -{ - instr->regs[0]->instr = ctx->last_access; - instr->regs[0]->flags |= IR3_REG_SSA; - ctx->last_write = ctx->last_access = instr; -} - /* src[] = { buffer_index, offset }. 
No const_index */ static void emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr, @@ -1280,7 +1252,8 @@ emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr, ldgb->cat6.iim_val = intr->num_components; ldgb->cat6.d = 4; ldgb->cat6.type = TYPE_U32; - mark_read(ctx, ldgb); + ldgb->barrier_class = IR3_BARRIER_BUFFER_R; + ldgb->barrier_conflict = IR3_BARRIER_BUFFER_W; split_dest(b, dst, ldgb, 0, intr->num_components); } @@ -1320,7 +1293,8 @@ emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr) stgb->cat6.iim_val = ncomp; stgb->cat6.d = 4; stgb->cat6.type = TYPE_U32; - mark_write(ctx, stgb); + stgb->barrier_class = IR3_BARRIER_BUFFER_W; + stgb->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W; array_insert(b, b->keeps, stgb); } @@ -1430,7 +1404,8 @@ emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr) atomic->cat6.iim_val = 1; atomic->cat6.d = 4; atomic->cat6.type = type; - mark_write(ctx, atomic); + atomic->barrier_class = IR3_BARRIER_BUFFER_W; + atomic->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W; /* even if nothing consume the result, we can't DCE the instruction: */ array_insert(b, b->keeps, atomic); @@ -1455,7 +1430,8 @@ emit_intrinsic_load_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr, ldl->cat6.type = TYPE_U32; ldl->regs[0]->wrmask = MASK(intr->num_components); - mark_read(ctx, ldl); + ldl->barrier_class = IR3_BARRIER_SHARED_R; + ldl->barrier_conflict = IR3_BARRIER_SHARED_W; split_dest(b, dst, ldl, 0, intr->num_components); } @@ -1491,8 +1467,9 @@ emit_intrinsic_store_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr) create_immed(b, length), 0); stl->cat6.dst_offset = first_component + base; stl->cat6.type = TYPE_U32; + stl->barrier_class = IR3_BARRIER_SHARED_W; + stl->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W; - mark_write(ctx, stl); array_insert(b, b->keeps, stl); /* Clear the bits in the writemask that we just wrote, then try @@ -1573,7 +1550,8 @@ emit_intrinsic_atomic_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr) atomic->cat6.iim_val = 1; atomic->cat6.d = 1; atomic->cat6.type = type; - mark_write(ctx, atomic); + atomic->barrier_class = IR3_BARRIER_SHARED_W; + atomic->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W; /* even if nothing consume the result, we can't DCE the instruction: */ array_insert(b, b->keeps, atomic); @@ -1702,6 +1680,9 @@ emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr, sam = ir3_SAM(b, OPC_ISAM, type, TGSI_WRITEMASK_XYZW, flags, tex_idx, tex_idx, create_collect(b, coords, ncoords), NULL); + sam->barrier_class = IR3_BARRIER_IMAGE_R; + sam->barrier_conflict = IR3_BARRIER_IMAGE_W; + split_dest(b, dst, sam, 0, 4); } @@ -1737,7 +1718,8 @@ emit_intrinsic_store_image(struct ir3_context *ctx, nir_intrinsic_instr *intr) stib->cat6.d = ncoords; stib->cat6.type = get_image_type(var); stib->cat6.typed = true; - mark_write(ctx, stib); + stib->barrier_class = IR3_BARRIER_IMAGE_W; + stib->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W; array_insert(b, b->keeps, stib); } @@ -1821,7 +1803,8 @@ emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr) atomic->cat6.d = ncoords; atomic->cat6.type = get_image_type(var); atomic->cat6.typed = true; - mark_write(ctx, atomic); + atomic->barrier_class = IR3_BARRIER_IMAGE_W; + atomic->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W; /* even if nothing consume the 
result, we can't DCE the instruction: */ array_insert(b, b->keeps, atomic); @@ -1841,23 +1824,62 @@ emit_intrinsic_barrier(struct ir3_context *ctx, nir_intrinsic_instr *intr) barrier->cat7.g = true; barrier->cat7.l = true; barrier->flags = IR3_INSTR_SS | IR3_INSTR_SY; + barrier->barrier_class = IR3_BARRIER_EVERYTHING; break; case nir_intrinsic_memory_barrier: + barrier = ir3_FENCE(b); + barrier->cat7.g = true; + barrier->cat7.r = true; + barrier->cat7.w = true; + barrier->barrier_class = IR3_BARRIER_IMAGE_W | + IR3_BARRIER_BUFFER_W; + barrier->barrier_conflict = + IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W | + IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W; + break; case nir_intrinsic_memory_barrier_atomic_counter: case nir_intrinsic_memory_barrier_buffer: barrier = ir3_FENCE(b); barrier->cat7.g = true; barrier->cat7.r = true; barrier->cat7.w = true; + barrier->barrier_class = IR3_BARRIER_BUFFER_W; + barrier->barrier_conflict = IR3_BARRIER_BUFFER_R | + IR3_BARRIER_BUFFER_W; break; - case nir_intrinsic_group_memory_barrier: case nir_intrinsic_memory_barrier_image: + // TODO double check if this should have .g set + barrier = ir3_FENCE(b); + barrier->cat7.g = true; + barrier->cat7.r = true; + barrier->cat7.w = true; + barrier->barrier_class = IR3_BARRIER_IMAGE_W; + barrier->barrier_conflict = IR3_BARRIER_IMAGE_R | + IR3_BARRIER_IMAGE_W; + break; case nir_intrinsic_memory_barrier_shared: barrier = ir3_FENCE(b); barrier->cat7.g = true; barrier->cat7.l = true; barrier->cat7.r = true; barrier->cat7.w = true; + barrier->barrier_class = IR3_BARRIER_SHARED_W; + barrier->barrier_conflict = IR3_BARRIER_SHARED_R | + IR3_BARRIER_SHARED_W; + break; + case nir_intrinsic_group_memory_barrier: + barrier = ir3_FENCE(b); + barrier->cat7.g = true; + barrier->cat7.l = true; + barrier->cat7.r = true; + barrier->cat7.w = true; + barrier->barrier_class = IR3_BARRIER_SHARED_W | + IR3_BARRIER_IMAGE_W | + IR3_BARRIER_BUFFER_W; + barrier->barrier_conflict = + IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W | + IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W | + IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W; break; default: unreachable("boo"); @@ -3301,6 +3323,8 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, ir3_print(ir); } + ir3_sched_add_deps(ir); + /* Group left/right neighbors, inserting mov's where needed to * solve conflicts: */ diff --git a/src/gallium/drivers/freedreno/ir3/ir3_depth.c b/src/gallium/drivers/freedreno/ir3/ir3_depth.c index be39027b6a0..55ca5333b47 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_depth.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_depth.c @@ -55,6 +55,10 @@ int ir3_delayslots(struct ir3_instruction *assigner, struct ir3_instruction *consumer, unsigned n) { + /* don't count false-dependencies: */ + if (__is_false_dep(consumer, n)) + return 0; + /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch * handled with sync bits diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c index b56da304f92..9492e9ba650 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_sched.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c @@ -669,3 +669,94 @@ int ir3_sched(struct ir3 *ir) return -1; return 0; } + +/* does instruction 'prior' need to be scheduled before 'instr'? 
+ */
+static bool
+depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
+{
+	/* TODO for dependencies that are related to a specific object, ie
+	 * a specific SSBO/image/array, we could relax this constraint to
+	 * make accesses to unrelated objects not depend on each other (at
+	 * least as long as not declared coherent)
+	 */
+	if ((instr->barrier_class & IR3_BARRIER_EVERYTHING) ||
+			(prior->barrier_class & IR3_BARRIER_EVERYTHING))
+		return true;
+	return !!(instr->barrier_class & prior->barrier_conflict);
+}
+
+static void
+add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
+{
+	struct list_head *prev = instr->node.prev;
+	struct list_head *next = instr->node.next;
+
+	/* add dependencies on previous instructions that must be scheduled
+	 * prior to the current instruction
+	 */
+	while (prev != &block->instr_list) {
+		struct ir3_instruction *pi =
+			LIST_ENTRY(struct ir3_instruction, prev, node);
+
+		prev = prev->prev;
+
+		if (is_meta(pi))
+			continue;
+
+		if (instr->barrier_class == pi->barrier_class) {
+			ir3_instr_add_dep(instr, pi);
+			break;
+		}
+
+		if (depends_on(instr, pi))
+			ir3_instr_add_dep(instr, pi);
+	}
+
+	/* add dependencies on this instruction to following instructions
+	 * that must be scheduled after the current instruction:
+	 */
+	while (next != &block->instr_list) {
+		struct ir3_instruction *ni =
+			LIST_ENTRY(struct ir3_instruction, next, node);
+
+		next = next->next;
+
+		if (is_meta(ni))
+			continue;
+
+		if (instr->barrier_class == ni->barrier_class) {
+			ir3_instr_add_dep(ni, instr);
+			break;
+		}
+
+		if (depends_on(ni, instr))
+			ir3_instr_add_dep(ni, instr);
+	}
+}
+
+/* before scheduling a block, we need to add any necessary false-dependencies
+ * to ensure that:
+ *
+ *  (1) barriers are scheduled in the right order wrt instructions related
+ *      to the barrier
+ *
+ *  (2) reads that come before a write actually get scheduled before the
+ *      write
+ */
+static void
+calculate_deps(struct ir3_block *block)
+{
+	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+		if (instr->barrier_class) {
+			add_barrier_deps(block, instr);
+		}
+	}
+}
+
+void
+ir3_sched_add_deps(struct ir3 *ir)
+{
+	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+		calculate_deps(block);
+	}
+}
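For illustration, the standalone C sketch below (not part of the patch; must_order() and fake_instr are hypothetical names made up for this example) mirrors the depends_on() test added in ir3_sched.c: a false dependency is required whenever an instruction's barrier_class intersects a prior instruction's barrier_conflict mask, and a class containing the EVERYTHING bit orders against everything.

/* Standalone sketch of the barrier_class/barrier_conflict ordering test.
 * Bit values follow the IR3_BARRIER_* enum added in ir3.h; only the
 * buffer (SSBO) bits are used here.
 */
#include <stdbool.h>
#include <stdio.h>

enum {
	BARRIER_EVERYTHING = 1 << 0,
	BARRIER_BUFFER_R   = 1 << 5,
	BARRIER_BUFFER_W   = 1 << 6,
};

struct fake_instr {
	const char *name;
	unsigned barrier_class;     /* what kind of access this is */
	unsigned barrier_conflict;  /* what must not be reordered across it */
};

/* same check as depends_on(): 'prior' must stay before 'instr' if either
 * is a full barrier, or if instr's class hits prior's conflict mask */
static bool
must_order(const struct fake_instr *instr, const struct fake_instr *prior)
{
	if ((instr->barrier_class & BARRIER_EVERYTHING) ||
			(prior->barrier_class & BARRIER_EVERYTHING))
		return true;
	return !!(instr->barrier_class & prior->barrier_conflict);
}

int main(void)
{
	/* masks as set by the SSBO load/store paths in ir3_compiler_nir.c */
	struct fake_instr stgb  = { "stgb",  BARRIER_BUFFER_W,
		BARRIER_BUFFER_R | BARRIER_BUFFER_W };
	struct fake_instr ldgb  = { "ldgb",  BARRIER_BUFFER_R, BARRIER_BUFFER_W };
	struct fake_instr ldgb2 = { "ldgb2", BARRIER_BUFFER_R, BARRIER_BUFFER_W };

	printf("%s after %s: %d\n", ldgb.name, stgb.name,
			must_order(&ldgb, &stgb));   /* 1: read may not pass a write */
	printf("%s after %s: %d\n", ldgb2.name, ldgb.name,
			must_order(&ldgb2, &ldgb));  /* 0: reads may reorder freely */
	printf("%s after %s: %d\n", stgb.name, ldgb.name,
			must_order(&stgb, &ldgb));   /* 1: write may not pass a read */
	return 0;
}

With those masks, add_barrier_deps() keeps an SSBO load after any earlier store and a store after any earlier load or store, while leaving independent loads free to reorder, which is exactly the ordering the removed mark_read()/mark_write() helpers used to enforce through fake register dependencies.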