From c8b0f904398cdc30ffc67c162bc3f570bf887ed9 Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Fri, 6 Mar 2020 18:06:06 +0100 Subject: ir3: Add bindless instruction encoding Part-of: --- src/freedreno/ir3/disasm-a3xx.c | 187 +++++++++++++++++++++++++++++----------- src/freedreno/ir3/instr-a3xx.h | 148 ++++++++++++++++++++++++------- src/freedreno/ir3/ir3.c | 45 +++++----- 3 files changed, 277 insertions(+), 103 deletions(-) (limited to 'src/freedreno') diff --git a/src/freedreno/ir3/disasm-a3xx.c b/src/freedreno/ir3/disasm-a3xx.c index cd45e912703..d957e77f853 100644 --- a/src/freedreno/ir3/disasm-a3xx.c +++ b/src/freedreno/ir3/disasm-a3xx.c @@ -117,7 +117,10 @@ static void print_reg(struct disasm_ctx *ctx, reg_t reg, bool full, bool r, else fprintf(ctx->out, "%s%c", full ? "" : "h", type); } else if ((reg.num == REG_A0) && !c) { - fprintf(ctx->out, "a0.%c", component[reg.comp]); + /* This matches libllvm output, the second (scalar) address register + * seems to be called a1.x instead of a0.y. + */ + fprintf(ctx->out, "a%d.x", reg.comp); } else if ((reg.num == REG_P0) && !c) { fprintf(ctx->out, "p0.%c", component[reg.comp]); } else { @@ -448,15 +451,70 @@ static void print_instr_cat5(struct disasm_ctx *ctx, instr_t *instr) [opc_op(OPC_RGETPOS)] = { true, false, false, false, }, [opc_op(OPC_RGETINFO)] = { false, false, false, false, }, }; + + static const struct { + bool indirect; + bool bindless; + bool use_a1; + bool uniform; + } desc_features[8] = { + [CAT5_NONUNIFORM] = { .indirect = true, }, + [CAT5_UNIFORM] = { .indirect = true, .uniform = true, }, + [CAT5_BINDLESS_IMM] = { .bindless = true, }, + [CAT5_BINDLESS_UNIFORM] = { + .bindless = true, + .indirect = true, + .uniform = true, + }, + [CAT5_BINDLESS_NONUNIFORM] = { + .bindless = true, + .indirect = true, + }, + [CAT5_BINDLESS_A1_IMM] = { + .bindless = true, + .use_a1 = true, + }, + [CAT5_BINDLESS_A1_UNIFORM] = { + .bindless = true, + .indirect = true, + .uniform = true, + .use_a1 = true, + }, + [CAT5_BINDLESS_A1_NONUNIFORM] = { + .bindless = true, + .indirect = true, + .use_a1 = true, + }, + }; + instr_cat5_t *cat5 = &instr->cat5; int i; + bool desc_indirect = + cat5->is_s2en_bindless && + desc_features[cat5->s2en_bindless.desc_mode].indirect; + bool bindless = + cat5->is_s2en_bindless && + desc_features[cat5->s2en_bindless.desc_mode].bindless; + bool use_a1 = + cat5->is_s2en_bindless && + desc_features[cat5->s2en_bindless.desc_mode].use_a1; + bool uniform = + cat5->is_s2en_bindless && + desc_features[cat5->s2en_bindless.desc_mode].uniform; + if (cat5->is_3d) fprintf(ctx->out, ".3d"); if (cat5->is_a) fprintf(ctx->out, ".a"); if (cat5->is_o) fprintf(ctx->out, ".o"); if (cat5->is_p) fprintf(ctx->out, ".p"); if (cat5->is_s) fprintf(ctx->out, ".s"); - if (cat5->is_s2en) fprintf(ctx->out, ".s2en"); + if (desc_indirect) fprintf(ctx->out, ".s2en"); + if (uniform) fprintf(ctx->out, ".uniform"); + + if (bindless) { + unsigned base = (cat5->s2en_bindless.base_hi << 1) | cat5->base_lo; + fprintf(ctx->out, ".base%d", base); + } fprintf(ctx->out, " "); @@ -483,34 +541,47 @@ static void print_instr_cat5(struct disasm_ctx *ctx, instr_t *instr) false, false, false); } - if (cat5->is_s2en) { - if (cat5->is_o || info[cat5->opc].src2) { - fprintf(ctx->out, ", "); - print_reg_src(ctx, (reg_t)(cat5->s2en.src2), cat5->full, - false, false, false, false, false, false); - } + if (cat5->is_o || info[cat5->opc].src2) { fprintf(ctx->out, ", "); - print_reg_src(ctx, (reg_t)(cat5->s2en.src3), false, false, false, false, - false, false, false); - } else { - if (cat5->is_o || info[cat5->opc].src2) { - fprintf(ctx->out, ", "); - print_reg_src(ctx, (reg_t)(cat5->norm.src2), cat5->full, - false, false, false, false, false, false); + print_reg_src(ctx, (reg_t)(cat5->src2), cat5->full, + false, false, false, false, false, false); + } + if (cat5->is_s2en_bindless) { + if (!desc_indirect) { + if (info[cat5->opc].samp) { + if (use_a1) + fprintf(ctx->out, ", s#%d", cat5->s2en_bindless.src3); + else + fprintf(ctx->out, ", s#%d", cat5->s2en_bindless.src3 & 0xf); + } + + if (info[cat5->opc].tex && !use_a1) { + fprintf(ctx->out, ", t#%d", cat5->s2en_bindless.src3 >> 4); + } } + } else { if (info[cat5->opc].samp) fprintf(ctx->out, ", s#%d", cat5->norm.samp); if (info[cat5->opc].tex) fprintf(ctx->out, ", t#%d", cat5->norm.tex); } + if (desc_indirect) { + fprintf(ctx->out, ", "); + print_reg_src(ctx, (reg_t)(cat5->s2en_bindless.src3), bindless, + false, false, false, false, false, false); + } + + if (use_a1) + fprintf(ctx->out, ", a1.x"); + if (debug & PRINT_VERBOSE) { - if (cat5->is_s2en) { - if ((debug & PRINT_VERBOSE) && (cat5->s2en.dummy1|cat5->s2en.dummy2|cat5->dummy2)) - fprintf(ctx->out, "\t{5: %x,%x,%x}", cat5->s2en.dummy1, cat5->s2en.dummy2, cat5->dummy2); + if (cat5->is_s2en_bindless) { + if ((debug & PRINT_VERBOSE) && cat5->s2en_bindless.dummy1) + fprintf(ctx->out, "\t{5: %x}", cat5->s2en_bindless.dummy1); } else { - if ((debug & PRINT_VERBOSE) && (cat5->norm.dummy1|cat5->dummy2)) - fprintf(ctx->out, "\t{5: %x,%x}", cat5->norm.dummy1, cat5->dummy2); + if ((debug & PRINT_VERBOSE) && cat5->norm.dummy1) + fprintf(ctx->out, "\t{5: %x}", cat5->norm.dummy1); } } } @@ -833,46 +904,66 @@ static void print_instr_cat6_a3xx(struct disasm_ctx *ctx, instr_t *instr) static void print_instr_cat6_a6xx(struct disasm_ctx *ctx, instr_t *instr) { instr_cat6_a6xx_t *cat6 = &instr->cat6_a6xx; - struct reginfo src1, src2; - bool has_dest = _OPC(6, cat6->opc) == OPC_LDIB; - char ss = 0; + struct reginfo src1, src2, ssbo; + bool uses_type = _OPC(6, cat6->opc) != OPC_LDC; - memset(&src1, 0, sizeof(src1)); - memset(&src2, 0, sizeof(src2)); + static const struct { + bool indirect; + bool bindless; + bool uniform; + } desc_features[8] = { + [CAT6_IMM] = { false }, + [CAT6_BINDLESS_IMM] = { .bindless = true, }, + [CAT6_BINDLESS_UNIFORM] = { + .bindless = true, + .indirect = true, + .uniform = true, + }, + [CAT6_BINDLESS_NONUNIFORM] = { + .bindless = true, + .indirect = true, + }, + }; - fprintf(ctx->out, ".%s", cat6->typed ? "typed" : "untyped"); - fprintf(ctx->out, ".%dd", cat6->d + 1); - fprintf(ctx->out, ".%s", type[cat6->type]); - fprintf(ctx->out, ".%u ", cat6->type_size + 1); + bool indirect_ssbo = desc_features[cat6->desc_mode].indirect; + bool bindless = desc_features[cat6->desc_mode].bindless; + bool uniform = desc_features[cat6->desc_mode].uniform; - if (has_dest) { - src2.reg = (reg_t)(cat6->src2); - src2.full = true; // XXX - print_src(ctx, &src2); - fprintf(ctx->out, ", "); + memset(&src1, 0, sizeof(src1)); + memset(&src2, 0, sizeof(src2)); + memset(&ssbo, 0, sizeof(ssbo)); + + if (uses_type) { + fprintf(ctx->out, ".%s", cat6->typed ? "typed" : "untyped"); + fprintf(ctx->out, ".%dd", cat6->d + 1); + fprintf(ctx->out, ".%s", type[cat6->type]); } + fprintf(ctx->out, ".%u", cat6->type_size + 1); - /* NOTE: blob seems to use old encoding for ldl/stl (local memory) */ - ss = 'g'; + if (bindless) + fprintf(ctx->out, ".base%d", cat6->base); + if (uniform) + fprintf(ctx->out, ".uniform"); + fprintf(ctx->out, " "); + + src2.reg = (reg_t)(cat6->src2); + src2.full = true; // XXX + print_src(ctx, &src2); + fprintf(ctx->out, ", "); - fprintf(ctx->out, "%c[%u", ss, cat6->ssbo); - fprintf(ctx->out, "] + "); src1.reg = (reg_t)(cat6->src1); src1.full = true; // XXX print_src(ctx, &src1); - - if (!has_dest) { - fprintf(ctx->out, ", "); - - src2.reg = (reg_t)(cat6->src2); - src2.full = true; // XXX - print_src(ctx, &src2); - } + fprintf(ctx->out, ", "); + ssbo.reg = (reg_t)(cat6->ssbo); + ssbo.im = !indirect_ssbo; + ssbo.full = true; + print_src(ctx, &ssbo); if (debug & PRINT_VERBOSE) { - fprintf(ctx->out, " (pad1=%x, pad2=%x, pad3=%x, pad4=%x)", cat6->pad1, - cat6->pad2, cat6->pad3, cat6->pad4); + fprintf(ctx->out, " (pad1=%x, pad2=%x, pad3=%x, pad4=%x, pad5=%x)", + cat6->pad1, cat6->pad2, cat6->pad3, cat6->pad4, cat6->pad5); } } diff --git a/src/freedreno/ir3/instr-a3xx.h b/src/freedreno/ir3/instr-a3xx.h index b3649f24bdf..f36c73b88e2 100644 --- a/src/freedreno/ir3/instr-a3xx.h +++ b/src/freedreno/ir3/instr-a3xx.h @@ -567,6 +567,57 @@ typedef struct PACKED { uint32_t opc_cat : 3; } instr_cat4_t; +/* With is_bindless_s2en = 1, this determines whether bindless is enabled and + * if so, how to get the (base, index) pair for both sampler and texture. + * There is a single base embedded in the instruction, which is always used + * for the texture. + */ +typedef enum { + /* Use traditional GL binding model, get texture and sampler index + * from src3 which is not presumed to be uniform. This is + * backwards-compatible with earlier generations, where this field was + * always 0 and nonuniform-indexed sampling always worked. + */ + CAT5_NONUNIFORM = 0, + + /* The sampler base comes from the low 3 bits of a1.x, and the sampler + * and texture index come from src3 which is presumed to be uniform. + */ + CAT5_BINDLESS_A1_UNIFORM = 1, + + /* The texture and sampler share the same base, and the sampler and + * texture index come from src3 which is *not* presumed to be uniform. + */ + CAT5_BINDLESS_NONUNIFORM = 2, + + /* The sampler base comes from the low 3 bits of a1.x, and the sampler + * and texture index come from src3 which is *not* presumed to be + * uniform. + */ + CAT5_BINDLESS_A1_NONUNIFORM = 3, + + /* Use traditional GL binding model, get texture and sampler index + * from src3 which is presumed to be uniform. + */ + CAT5_UNIFORM = 4, + + /* The texture and sampler share the same base, and the sampler and + * texture index come from src3 which is presumed to be uniform. + */ + CAT5_BINDLESS_UNIFORM = 5, + + /* The texture and sampler share the same base, get sampler index from low + * 4 bits of src3 and texture index from high 4 bits. + */ + CAT5_BINDLESS_IMM = 6, + + /* The sampler base comes from the low 3 bits of a1.x, and the texture + * index comes from the next 8 bits of a1.x. The sampler index is an + * immediate in src3. + */ + CAT5_BINDLESS_A1_IMM = 7, +} cat5_desc_mode_t; + typedef struct PACKED { /* dword0: */ union PACKED { @@ -581,39 +632,41 @@ typedef struct PACKED { } norm; /* s2en case: */ struct PACKED { - uint32_t full : 1; /* not half */ - uint32_t src1 : 8; - uint32_t src2 : 11; - uint32_t dummy1 : 1; - uint32_t src3 : 8; - uint32_t dummy2 : 3; - } s2en; + uint32_t full : 1; /* not half */ + uint32_t src1 : 8; + uint32_t src2 : 8; + uint32_t dummy1 : 2; + uint32_t base_hi : 2; + uint32_t src3 : 8; + uint32_t desc_mode : 3; + } s2en_bindless; /* same in either case: */ // XXX I think, confirm this struct PACKED { uint32_t full : 1; /* not half */ uint32_t src1 : 8; - uint32_t pad : 23; + uint32_t src2 : 8; + uint32_t pad : 15; }; }; /* dword1: */ - uint32_t dst : 8; - uint32_t wrmask : 4; /* write-mask */ - uint32_t type : 3; - uint32_t dummy2 : 1; /* seems to be ignored */ - uint32_t is_3d : 1; - - uint32_t is_a : 1; - uint32_t is_s : 1; - uint32_t is_s2en : 1; - uint32_t is_o : 1; - uint32_t is_p : 1; - - uint32_t opc : 5; - uint32_t jmp_tgt : 1; - uint32_t sync : 1; - uint32_t opc_cat : 3; + uint32_t dst : 8; + uint32_t wrmask : 4; /* write-mask */ + uint32_t type : 3; + uint32_t base_lo : 1; /* used with bindless */ + uint32_t is_3d : 1; + + uint32_t is_a : 1; + uint32_t is_s : 1; + uint32_t is_s2en_bindless : 1; + uint32_t is_o : 1; + uint32_t is_p : 1; + + uint32_t opc : 5; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; } instr_cat5_t; /* dword0 encoding for src_off: [src1 + off], src2: */ @@ -748,43 +801,72 @@ typedef union PACKED { }; } instr_cat6_t; +/* Similar to cat5_desc_mode_t, describes how the descriptor is loaded. + */ +typedef enum { + /* Use old GL binding model with an immediate index. + * TODO: find CAT6_UNIFORM and CAT6_NONUNIFORM + */ + CAT6_IMM = 0, + + /* Use the bindless model, with an immediate index. + */ + CAT6_BINDLESS_IMM = 4, + + /* Use the bindless model, with a uniform register index. + */ + CAT6_BINDLESS_UNIFORM = 5, + + /* Use the bindless model, with a register index that isn't guaranteed + * to be uniform. This presumably checks if the indices are equal and + * splits up the load/store, because it works the way you would + * expect. + */ + CAT6_BINDLESS_NONUNIFORM = 6, +} cat6_desc_mode_t; + /** * For atomic ops (which return a value): * - * pad1=1, pad2=c, pad3=0, pad4=3 + * pad1=1, pad3=c, pad5=3 * src1 - vecN offset/coords * src2.x - is actually dest register * src2.y - is 'data' except for cmpxchg where src2.y is 'compare' * and src2.z is 'data' * * For stib (which does not return a value): - * pad1=0, pad2=c, pad3=0, pad4=2 + * pad1=0, pad3=c, pad5=2 * src1 - vecN offset/coords * src2 - value to store * * For ldib: - * pad1=1, pad2=c, pad3=0, pad4=2 + * pad1=1, pad3=c, pad5=2 * src1 - vecN offset/coords * * for ldc (load from UBO using descriptor): - * pad1=0, pad2=8, pad3=0, pad4=2 + * pad1=0, pad3=8, pad5=2 + * + * pad2 and pad5 are only observed to be 0. */ typedef struct PACKED { /* dword0: */ - uint32_t pad1 : 9; + uint32_t pad1 : 1; + uint32_t base : 3; + uint32_t pad2 : 2; + uint32_t desc_mode : 3; uint32_t d : 2; uint32_t typed : 1; uint32_t type_size : 2; uint32_t opc : 5; - uint32_t pad2 : 5; + uint32_t pad3 : 5; uint32_t src1 : 8; /* coordinate/offset */ /* dword1: */ uint32_t src2 : 8; /* or the dst for load instructions */ - uint32_t pad3 : 1; //mustbe0 ?? or zero means imm vs reg for ssbo?? + uint32_t pad4 : 1; //mustbe0 ?? uint32_t ssbo : 8; /* ssbo/image binding point */ uint32_t type : 3; - uint32_t pad4 : 7; + uint32_t pad5 : 7; uint32_t jmp_tgt : 1; uint32_t sync : 1; uint32_t opc_cat : 3; @@ -869,7 +951,7 @@ static inline bool is_cat6_legacy(instr_t *instr, unsigned gpu_id) * cmdstream traces I have indicates that the pad bit is zero * in all cases. So we can use this to detect new encoding: */ - if ((cat6->pad2 & 0x8) && (cat6->pad4 & 0x2)) { + if ((cat6->pad3 & 0x8) && (cat6->pad5 & 0x2)) { assert(gpu_id >= 600); assert(instr->cat6.opc == 0); return false; diff --git a/src/freedreno/ir3/ir3.c b/src/freedreno/ir3/ir3.c index 4ac50aec0a3..7bdf8a39ba8 100644 --- a/src/freedreno/ir3/ir3.c +++ b/src/freedreno/ir3/ir3.c @@ -482,20 +482,23 @@ static int emit_cat5(struct ir3_instruction *instr, void *ptr, cat5->src1 = reg(src1, info, instr->repeat, IR3_REG_HALF); } + if (src2) { + iassert(!((src1->flags ^ src2->flags) & IR3_REG_HALF)); + cat5->src2 = reg(src2, info, instr->repeat, IR3_REG_HALF); + } + if (instr->flags & IR3_INSTR_S2EN) { struct ir3_register *samp_tex = instr->regs[1]; - if (src2) { - iassert(!((src1->flags ^ src2->flags) & IR3_REG_HALF)); - cat5->s2en.src2 = reg(src2, info, instr->repeat, IR3_REG_HALF); - } iassert(samp_tex->flags & IR3_REG_HALF); - cat5->s2en.src3 = reg(samp_tex, info, instr->repeat, IR3_REG_HALF); + cat5->s2en_bindless.src3 = reg(samp_tex, info, instr->repeat, IR3_REG_HALF); + /* TODO: This should probably be CAT5_UNIFORM, at least on a6xx, as + * this is what the blob does and it is presumably faster, but first + * we should confirm it is actually nonuniform and figure out when the + * whole descriptor mode mechanism was introduced. + */ + cat5->s2en_bindless.desc_mode = CAT5_NONUNIFORM; iassert(!(instr->cat5.samp | instr->cat5.tex)); } else { - if (src2) { - iassert(!((src1->flags ^ src2->flags) & IR3_REG_HALF)); - cat5->norm.src2 = reg(src2, info, instr->repeat, IR3_REG_HALF); - } cat5->norm.samp = instr->cat5.samp; cat5->norm.tex = instr->cat5.tex; } @@ -506,7 +509,7 @@ static int emit_cat5(struct ir3_instruction *instr, void *ptr, cat5->is_3d = !!(instr->flags & IR3_INSTR_3D); cat5->is_a = !!(instr->flags & IR3_INSTR_A); cat5->is_s = !!(instr->flags & IR3_INSTR_S); - cat5->is_s2en = !!(instr->flags & IR3_INSTR_S2EN); + cat5->is_s2en_bindless = !!(instr->flags & IR3_INSTR_S2EN); cat5->is_o = !!(instr->flags & IR3_INSTR_O); cat5->is_p = !!(instr->flags & IR3_INSTR_P); cat5->opc = instr->opc; @@ -564,31 +567,29 @@ static int emit_cat6_a6xx(struct ir3_instruction *instr, void *ptr, case OPC_ATOMIC_OR: case OPC_ATOMIC_XOR: cat6->pad1 = 0x1; - cat6->pad2 = 0xc; - cat6->pad3 = 0x0; - cat6->pad4 = 0x3; + cat6->pad3 = 0xc; + cat6->pad5 = 0x3; break; case OPC_STIB: cat6->pad1 = 0x0; - cat6->pad2 = 0xc; - cat6->pad3 = 0x0; - cat6->pad4 = 0x2; + cat6->pad3 = 0xc; + cat6->pad5 = 0x2; break; case OPC_LDIB: cat6->pad1 = 0x1; - cat6->pad2 = 0xc; - cat6->pad3 = 0x0; - cat6->pad4 = 0x2; + cat6->pad3 = 0xc; + cat6->pad5 = 0x2; break; case OPC_LDC: cat6->pad1 = 0x0; - cat6->pad2 = 0x8; - cat6->pad3 = 0x0; - cat6->pad4 = 0x2; + cat6->pad3 = 0x8; + cat6->pad5 = 0x2; break; default: iassert(0); } + cat6->pad2 = 0x0; + cat6->pad4 = 0x0; return 0; } -- cgit v1.2.3